From 9ae347614bb7beef5186a4b791baafd26231ce93 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Nov 2023 06:21:45 +0000 Subject: [PATCH 01/68] initial work for pairwise ranking (dataset part) --- include/LightGBM/config.h | 5 + include/LightGBM/dataset.h | 24 + include/LightGBM/feature_group.h | 2 +- .../LightGBM/pairwise_ranking_feature_group.h | 107 +++++ src/boosting/cuda/nccl_gbdt.cpp | 429 ++++++++++++++++++ src/io/dataset.cpp | 78 ++++ src/io/metadata.cpp | 33 ++ src/io/pairwise_lambdarank_bin.hpp | 107 +++++ src/objective/rank_objective.hpp | 20 + 9 files changed, 804 insertions(+), 1 deletion(-) create mode 100644 include/LightGBM/pairwise_ranking_feature_group.h create mode 100644 src/boosting/cuda/nccl_gbdt.cpp create mode 100644 src/io/pairwise_lambdarank_bin.hpp diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 6d61bc764924..002fbf987e2b 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -36,6 +36,11 @@ enum TaskType { }; const int kDefaultNumLeaves = 31; +/*! \brief Types of pairwise ranking mode */ +enum PairwiseRankingMode { + kNone, kFull, kRelevance, kManual +}; + struct Config { public: std::string ToString() const; diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e7baa42dc2e6..fe562309e9b0 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -199,6 +199,12 @@ class Metadata { const double* init_scores, const int32_t* queries); + /*! + * \brief Build metadata for ranking with pairwise features from metadata of an existing ranking dataset + * \param metadata Pointer to metadata of the existing ranking dataset + */ + void BuildPairwiseFeatureRanking(const Metadata& metadata); + /*! * \brief Perform any extra operations after all data has been loaded */ @@ -247,6 +253,18 @@ class Metadata { return position_ids_.size(); } + /*! + * \brief Get the pairwise item index map in ranking with pairwise features + * \return Pointer to the pairwise item index map + */ + inline const std::pair* paired_ranking_item_index_map() const { + if (!paired_ranking_item_index_map_.empty()) { + return paired_ranking_item_index_map_.data(); + } else { + return nullptr; + } + } + /*! * \brief Get data boundaries on queries, if not exists, will return nullptr * we assume data will order by query, @@ -367,6 +385,10 @@ class Metadata { std::vector init_score_; /*! \brief Queries data */ std::vector queries_; + /*! \brief Mode for pairwise ranking */ + PairwiseRankingMode pairwise_ranking_mode_; + /*! \brief Pairwise data index to original data indices for ranking with pairwise features */ + std::vector> paired_ranking_item_index_map_; /*! \brief mutex for threading safe call */ std::mutex mutex_; bool weight_load_from_file_; @@ -677,6 +699,8 @@ class Dataset { LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset); + LIGHTGBM_EXPORT void CreatePairWiseRankingData(const Dataset* dataset, std::vector> pair_index_map); + void InitTrain(const std::vector& is_feature_used, TrainingShareStates* share_state) const; diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index f13a5fff966f..41caac8b3443 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -581,7 +581,7 @@ class FeatureGroup { } } - private: + protected: void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { if (is_multi_val) { multi_bin_data_.clear(); diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h new file mode 100644 index 000000000000..bf668150e862 --- /dev/null +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -0,0 +1,107 @@ +/*! + * Copyright (c) 2023 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_PAIRWISE_FEATURE_GROUP_H_ +#define LIGHTGBM_PAIRWISE_FEATURE_GROUP_H_ + +#include "feature_group.h" + +#include +#include +#include +#include + +namespace LightGBM { + +/*! \brief Using to store data and providing some operations on one feature +group*/ +class PairwiseRankingFeatureGroup: public FeatureGroup { + public: + /*! + * \brief Constructor + * \param num_feature number of features of this group + * \param bin_mappers Bin mapper for features + * \param num_data Total number of data + * \param is_enable_sparse True if enable sparse feature + */ + + PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_data) { + num_feature_ = other.num_feature_; + is_multi_val_ = false; + is_dense_multi_val_ = false; + is_sparse_ = false; + num_total_bin_ = other.num_total_bin_; + bin_offsets_ = other.bin_offsets_; + num_data_ = num_data; + + bin_mappers_.reserve(other.bin_mappers_.size()); + for (auto& bin_mapper : other.bin_mappers_) { + bin_mappers_.emplace_back(new BinMapper(*bin_mapper)); + } + CreateBinData(num_data, is_multi_val_, !is_sparse_, is_sparse_); + } + + /*! + * \brief Constructor from memory when data is present + * \param memory Pointer of memory + * \param num_all_data Number of global data + * \param local_used_indices Local used indices, empty means using all data + * \param group_id Id of group + */ + PairwiseRankingFeatureGroup(const void* memory, + data_size_t num_all_data, + const std::vector& local_used_indices, + int group_id) { + // TODO(shiyu1994) + } + + /*! + * \brief Constructor from definition in memory (without data) + * \param memory Pointer of memory + * \param local_used_indices Local used indices, empty means using all data + */ + FeatureGroup(const void* memory, data_size_t num_data, int group_id) { + // TODO(shiyu1994) + } + + /*! \brief Destructor */ + ~PairwiseRankingFeatureGroup() {} + + /*! + * \brief Load the overall definition of the feature group from binary serialized data + * \param memory Pointer of memory + * \param group_id Id of group + */ + const char* LoadDefinitionFromMemory(const void* memory, int group_id) { + // TODO(shiyu1994) + } + + inline BinIterator* SubFeatureIterator(int sub_feature) { + // TODO(shiyu1994) + } + + inline void FinishLoad() { + // TODO(shiyu1994) + } + + inline BinIterator* FeatureGroupIterator() { + // TODO(shiyu1994) + } + + private: + void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { + // TODO(shiyu1994) + } + + + /*! \brief Pairwise data index to original data indices for ranking with pairwise features */ + const std::pair* paired_ranking_item_index_map_; + /*! \brief Number of pairwise data */ + data_size_t num_data_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_PAIRWISE_FEATURE_GROUP_H_ diff --git a/src/boosting/cuda/nccl_gbdt.cpp b/src/boosting/cuda/nccl_gbdt.cpp new file mode 100644 index 000000000000..47672a53679c --- /dev/null +++ b/src/boosting/cuda/nccl_gbdt.cpp @@ -0,0 +1,429 @@ +/*! + * Copyright (c) 2023 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "nccl_gbdt.hpp" +#include + +#ifdef USE_CUDA + +namespace LightGBM { + +template +NCCLGBDT::NCCLGBDT(): GBDT_T() {} + +template +NCCLGBDT::~NCCLGBDT() {} + +template +void NCCLGBDT::Init( + const Config* gbdt_config, const Dataset* train_data, + const ObjectiveFunction* objective_function, + const std::vector& training_metrics) { + GBDT_T::Init(gbdt_config, train_data, objective_function, training_metrics); + int max_num_gpu = 0; + CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&max_num_gpu)); + num_gpu_ = this->config_->num_gpu; + if (num_gpu_ > max_num_gpu) { + Log::Warning("Specifying %d GPUs, but only %d available.", num_gpu_, max_num_gpu); + num_gpu_ = max_num_gpu; + } + int gpu_device_id = this->config_->gpu_device_id; + if (this->config_->gpu_device_list == std::string("")) { + if (gpu_device_id < 0 || gpu_device_id >= num_gpu_) { + Log::Warning("Master GPU Device ID %d is not in the valid range [%d, %d], will use GPU 0 as master.", gpu_device_id, 0, max_num_gpu); + gpu_device_id = 0; + } + } + master_gpu_device_id_ = gpu_device_id; + master_gpu_index_ = master_gpu_device_id_; + + if (this->config_->gpu_device_list != std::string("")) { + std::vector gpu_list_str = Common::Split(this->config_->gpu_device_list.c_str(), ","); + for (const auto& gpu_str : gpu_list_str) { + int gpu_id = 0; + Common::Atoi(gpu_str.c_str(), &gpu_id); + gpu_list_.emplace_back(gpu_id); + } + bool check_master_gpu = false; + for (int i = 0; i < static_cast(gpu_list_.size()); ++i) { + const int gpu_id = gpu_list_[i]; + if (gpu_id == master_gpu_device_id_) { + master_gpu_index_ = i; + check_master_gpu = true; + } + } + if (!check_master_gpu) { + Log::Fatal("Master GPU ID %d is not in GPU ID list.", master_gpu_device_id_); + } + } + + const int num_threads = OMP_NUM_THREADS(); + if (num_gpu_ > num_threads) { + Log::Fatal("Number of GPUs %d is greather than the number of threads %d. Please use more threads.", num_gpu_, num_threads); + } + + InitNCCL(); + + // partition data across GPUs + const data_size_t num_data_per_gpu = (this->num_data_ + num_gpu_ - 1) / num_gpu_; + std::vector all_data_indices(this->num_data_, 0); + #pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < this->num_data_; ++i) { + all_data_indices[i] = i; + } + per_gpu_data_start_.resize(num_gpu_); + per_gpu_data_end_.resize(num_gpu_); + per_gpu_datasets_.resize(num_gpu_); + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + SetCUDADevice(gpu_index); + const data_size_t data_start = num_data_per_gpu * gpu_index; + const data_size_t data_end = std::min(data_start + num_data_per_gpu, this->num_data_); + const data_size_t num_data_in_gpu = data_end - data_start; + per_gpu_data_start_[gpu_index] = data_start; + per_gpu_data_end_[gpu_index] = data_end; + per_gpu_datasets_[gpu_index].reset(new Dataset(num_data_in_gpu)); + per_gpu_datasets_[gpu_index]->ReSize(num_data_in_gpu); + per_gpu_datasets_[gpu_index]->CopyFeatureMapperFrom(this->train_data_); + per_gpu_datasets_[gpu_index]->CopySubrow(this->train_data_, all_data_indices.data() + data_start, num_data_in_gpu, true, data_start, data_end, GetCUDADevice(gpu_index)); + } + + // initialize per gpu objectives, training scores and tree learners + per_gpu_objective_functions_.resize(num_gpu_); + per_gpu_train_score_updater_.resize(num_gpu_); + per_gpu_gradients_.resize(num_gpu_); + per_gpu_hessians_.resize(num_gpu_); + per_gpu_tree_learners_.resize(num_gpu_); + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + SetCUDADevice(gpu_index); + const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - per_gpu_data_start_[gpu_index]; + per_gpu_objective_functions_[gpu_index].reset(ObjectiveFunction::CreateObjectiveFunction(this->config_->objective, *(this->config_.get()))); + per_gpu_objective_functions_[gpu_index]->Init(per_gpu_datasets_[gpu_index]->metadata(), per_gpu_datasets_[gpu_index]->num_data()); + per_gpu_objective_functions_[gpu_index]->SetNCCLComm(&nccl_communicators_[gpu_index]); + per_gpu_train_score_updater_[gpu_index].reset(new CUDAScoreUpdater(per_gpu_datasets_[gpu_index].get(), this->num_tree_per_iteration_)); + per_gpu_gradients_[gpu_index].reset(new CUDAVector(num_data_in_gpu)); + per_gpu_hessians_[gpu_index].reset(new CUDAVector(num_data_in_gpu)); + per_gpu_tree_learners_[gpu_index].reset(TreeLearner::CreateTreeLearner( + this->config_->tree_learner, + this->config_->device_type, + this->config_.get())); + per_gpu_tree_learners_[gpu_index]->SetNCCL(&nccl_communicators_[gpu_index], nccl_gpu_rank_[gpu_index], GetCUDADevice(gpu_index), this->num_data_); + per_gpu_tree_learners_[gpu_index]->Init(per_gpu_datasets_[gpu_index].get(), this->is_constant_hessian_); + } + + // initialize host threads and thread data + host_threads_.resize(num_gpu_); + boosting_thread_data_.resize(num_gpu_); + train_tree_learner_thread_data_.resize(num_gpu_); + update_score_thread_data_.resize(num_gpu_); + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + boosting_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); + boosting_thread_data_[gpu_index].gpu_objective_function = per_gpu_objective_functions_[gpu_index].get(); + boosting_thread_data_[gpu_index].gradients = per_gpu_gradients_[gpu_index]->RawData(); + boosting_thread_data_[gpu_index].hessians = per_gpu_hessians_[gpu_index]->RawData(); + boosting_thread_data_[gpu_index].score = per_gpu_train_score_updater_[gpu_index]->score(); + train_tree_learner_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); + train_tree_learner_thread_data_[gpu_index].gpu_tree_learner = per_gpu_tree_learners_[gpu_index].get(); + train_tree_learner_thread_data_[gpu_index].gradients = per_gpu_gradients_[gpu_index]->RawData(); + train_tree_learner_thread_data_[gpu_index].hessians = per_gpu_hessians_[gpu_index]->RawData(); + train_tree_learner_thread_data_[gpu_index].num_data_in_gpu = per_gpu_data_end_[gpu_index] - per_gpu_data_start_[gpu_index]; + update_score_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); + update_score_thread_data_[gpu_index].gpu_score_updater = per_gpu_train_score_updater_[gpu_index].get(); + update_score_thread_data_[gpu_index].gpu_tree_learner = per_gpu_tree_learners_[gpu_index].get(); + } + + // return to master gpu device + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); +} + +template +void NCCLGBDT::InitNCCL() { + nccl_gpu_rank_.resize(num_gpu_, -1); + nccl_communicators_.resize(num_gpu_); + ncclUniqueId nccl_unique_id; + if (Network::num_machines() == 1 || Network::rank() == 0) { + NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); + } + if (Network::num_machines() > 1) { + std::vector output_buffer(Network::num_machines()); + Network::Allgather( + reinterpret_cast(&nccl_unique_id), + sizeof(ncclUniqueId) / sizeof(char), + reinterpret_cast(output_buffer.data())); + if (Network::rank() > 0) { + nccl_unique_id = output_buffer[0]; + } + } + + if (Network::num_machines() > 1) { + std::vector num_gpus_per_machine(Network::num_machines() + 1, 0); + Network::Allgather( + reinterpret_cast(&num_gpu_), + sizeof(int) / sizeof(char), + reinterpret_cast(num_gpus_per_machine.data() + 1)); + for (int rank = 1; rank < Network::num_machines() + 1; ++rank) { + num_gpus_per_machine[rank] += num_gpus_per_machine[rank - 1]; + } + CHECK_EQ(num_gpus_per_machine[Network::rank() + 1] - num_gpus_per_machine[Network::rank()], num_gpu_); + NCCLCHECK(ncclGroupStart()); + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + SetCUDADevice(gpu_index); + nccl_gpu_rank_[gpu_index] = gpu_index + num_gpus_per_machine[Network::rank()]; + NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpus_per_machine.back(), nccl_unique_id, nccl_gpu_rank_[gpu_index])); + } + NCCLCHECK(ncclGroupEnd()); + } else { + NCCLCHECK(ncclGroupStart()); + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + SetCUDADevice(gpu_index); + nccl_gpu_rank_[gpu_index] = gpu_index; + NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpu_, nccl_unique_id, gpu_index)); + } + NCCLCHECK(ncclGroupEnd()); + } + + // return to master gpu device + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); +} + +template +void* NCCLGBDT::BoostingThread(void* thread_data) { + const BoostingThreadData* boosting_thread_data = reinterpret_cast(thread_data); + const int gpu_index = boosting_thread_data->gpu_index; + const ObjectiveFunction* objective_function = boosting_thread_data->gpu_objective_function; + score_t* gradients = boosting_thread_data->gradients; + score_t* hessians = boosting_thread_data->hessians; + const double* score = boosting_thread_data->score; + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); + objective_function->GetGradients(score, gradients, hessians); + return nullptr; +} + +template +void NCCLGBDT::Boosting() { + Common::FunctionTimer fun_timer("NCCLGBDT::Boosting", global_timer); + if (this->objective_function_ == nullptr) { + Log::Fatal("No object function provided"); + } + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + if (pthread_create(&host_threads_[gpu_index], nullptr, BoostingThread, + reinterpret_cast(&boosting_thread_data_[gpu_index]))) { + Log::Fatal("Error in creating boosting threads."); + } + } + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + if (pthread_join(host_threads_[gpu_index], nullptr)) { + Log::Fatal("Error in joining boosting threads."); + } + } +} + +template +double NCCLGBDT::BoostFromAverage(int class_id, bool update_scorer) { + double init_score = GBDT_T::BoostFromAverage(class_id, update_scorer); + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + SetCUDADevice(gpu_index); + if (std::fabs(init_score) > kEpsilon && update_scorer) { + per_gpu_train_score_updater_[gpu_index]->AddScore(init_score, class_id); + } + } + + // return to master gpu device + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); +} + +template +void* NCCLGBDT::TrainTreeLearnerThread(void* thread_data) { + TrainTreeLearnerThreadData* tree_train_learner_thread_data = reinterpret_cast(thread_data); + const int gpu_index = tree_train_learner_thread_data->gpu_index; + const int class_id = tree_train_learner_thread_data->class_id; + const data_size_t num_data_in_gpu = tree_train_learner_thread_data->num_data_in_gpu; + const score_t* gradients = tree_train_learner_thread_data->gradients + class_id * num_data_in_gpu; + const score_t* hessians = tree_train_learner_thread_data->hessians + class_id * num_data_in_gpu; + const bool is_first_tree = tree_train_learner_thread_data->is_first_time; + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); + tree_train_learner_thread_data->tree.reset( + tree_train_learner_thread_data->gpu_tree_learner->Train(gradients, hessians, is_first_tree)); + return nullptr; +} + +template +bool NCCLGBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { + Common::FunctionTimer fun_timer("NCCLGBDT::TrainOneIter", global_timer); + std::vector init_scores(this->num_tree_per_iteration_, 0.0); + // boosting first + if (gradients == nullptr || hessians == nullptr) { + for (int cur_tree_id = 0; cur_tree_id < this->num_tree_per_iteration_; ++cur_tree_id) { + init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); + } + Boosting(); + } else { + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + SetCUDADevice(gpu_index); + const data_size_t gpu_data_start = per_gpu_data_start_[gpu_index]; + const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - gpu_data_start; + for (int class_id = 0; class_id < this->num_class_; ++class_id) { + CopyFromHostToCUDADevice( + per_gpu_gradients_[gpu_index]->RawData() + class_id * num_data_in_gpu, + gradients + class_id * this->num_data_ + gpu_data_start, num_data_in_gpu, __FILE__, __LINE__); + CopyFromHostToCUDADevice( + per_gpu_hessians_[gpu_index]->RawData() + class_id * num_data_in_gpu, + hessians + class_id * this->num_data_ + gpu_data_start, num_data_in_gpu, __FILE__, __LINE__); + } + } + + // return to master gpu device + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + bool should_continue = false; + for (int cur_tree_id = 0; cur_tree_id < this->num_tree_per_iteration_; ++cur_tree_id) { + std::vector> new_tree(num_gpu_); + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + new_tree[gpu_index].reset(nullptr); + } + if (this->class_need_train_[cur_tree_id] && this->train_data_->num_features() > 0) { + if (this->is_use_subset_ && this->bag_data_cnt_ < this->num_data_) { + Log::Fatal("Bagging is not supported for NCCLGBDT"); + } + bool is_first_tree = this->models_.size() < static_cast(this->num_tree_per_iteration_); + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + train_tree_learner_thread_data_[gpu_index].is_first_time = is_first_tree; + train_tree_learner_thread_data_[gpu_index].class_id = cur_tree_id; + if (pthread_create(&host_threads_[gpu_index], nullptr, TrainTreeLearnerThread, + reinterpret_cast(&train_tree_learner_thread_data_[gpu_index]))) { + Log::Fatal("Error in creating tree training threads."); + } + } + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + if (pthread_join(host_threads_[gpu_index], nullptr)) { + Log::Fatal("Error in joining tree training threads."); + } + new_tree[gpu_index].reset(train_tree_learner_thread_data_[gpu_index].tree.release()); + } + } + + if (new_tree[master_gpu_index_]->num_leaves() > 1) { + should_continue = true; + if (this->objective_function_ != nullptr && this->objective_function_->IsRenewTreeOutput()) { + Log::Fatal("Objective function with renewing is not supported for NCCLGBDT."); + } + // shrinkage by learning rate + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + SetCUDADevice(gpu_index); + new_tree[gpu_index]->Shrinkage(this->shrinkage_rate_); + } + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + // update score + UpdateScore(new_tree, cur_tree_id); + if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) { + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + SetCUDADevice(gpu_index); + new_tree[gpu_index]->AddBias(init_scores[cur_tree_id]); + } + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + } else { + // only add default score one-time + if (this->models_.size() < static_cast(this->num_tree_per_iteration_)) { + Log::Warning("Training stopped with no splits."); + } + } + + // add model + this->models_.push_back(std::move(new_tree[master_gpu_index_])); + + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + if (gpu_index != master_gpu_index_) { + SetCUDADevice(gpu_index); + new_tree[gpu_index].reset(nullptr); + } + } + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + } + + if (!should_continue) { + Log::Warning("Stopped training because there are no more leaves that meet the split requirements"); + if (this->models_.size() > static_cast(this->num_tree_per_iteration_)) { + for (int cur_tree_id = 0; cur_tree_id < this->num_tree_per_iteration_; ++cur_tree_id) { + this->models_.pop_back(); + } + } + return true; + } + + ++this->iter_; + return false; +} + +template +void* NCCLGBDT::UpdateScoreThread(void* thread_data) { + const UpdateScoreThreadData* update_score_thread_data = reinterpret_cast(thread_data); + const int gpu_index = update_score_thread_data->gpu_index; + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); + update_score_thread_data->gpu_score_updater->AddScore( + update_score_thread_data->gpu_tree_learner, + update_score_thread_data->tree, + update_score_thread_data->cur_tree_id); + return nullptr; +} + +template +void NCCLGBDT::UpdateScore(const std::vector>& tree, const int cur_tree_id) { + Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer); + // update training score + if (!this->is_use_subset_) { + if (this->num_data_ - this->bag_data_cnt_ > 0) { + Log::Fatal("bagging is not supported for NCCLGBDT."); + } + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + update_score_thread_data_[gpu_index].tree = tree[gpu_index].get(); + update_score_thread_data_[gpu_index].cur_tree_id = cur_tree_id; + if (pthread_create(&host_threads_[gpu_index], nullptr, UpdateScoreThread, + reinterpret_cast(&update_score_thread_data_[gpu_index]))) { + Log::Fatal("Error in creating update score threads."); + } + } + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + if (pthread_join(host_threads_[gpu_index], nullptr)) { + Log::Fatal("Error in joining tree training threads."); + } + } + } else { + Log::Fatal("bagging is not supported for NCCLGBDT."); + } + + // update validation score + for (auto& score_updater : this->valid_score_updater_) { + score_updater->AddScore(tree[master_gpu_index_].get(), cur_tree_id); + } +} + +template +std::vector NCCLGBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { + if (score == this->train_score_updater_->score()) { + // delegate to per gpu train score updater + std::vector tmp_score(num_data * this->num_class_, 0.0f); + for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { + const data_size_t data_start = per_gpu_data_start_[gpu_index]; + const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - data_start; + for (int class_id = 0; class_id < this->num_class_; ++class_id) { + SetCUDADevice(gpu_index); + CopyFromCUDADeviceToHost(tmp_score.data() + class_id * this->num_data_ + data_start, + per_gpu_train_score_updater_[gpu_index]->score() + class_id * num_data_in_gpu, + static_cast(num_data_in_gpu), __FILE__, __LINE__); + } + CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); + return metric->Eval(tmp_score.data(), this->objective_function_); + } + } else { + return GBDT_T::EvalOneMetric(metric, score, num_data); + } +} + +template class NCCLGBDT; + +} // LightGBM + +#endif // USE_CUDA diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 147765644887..0fe208147aa5 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -819,6 +819,84 @@ void Dataset::CreateValid(const Dataset* dataset) { gpu_device_id_ = dataset->gpu_device_id_; } +void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vector> pair_index_map) { + metadata_.BuildPairwiseFeatureRanking(dataset->metadata()); + + feature_groups_.clear(); + num_features_ = dataset->num_features_ * 2; + num_groups_ = dataset->num_groups_ * 2; + max_bin_ = dataset->max_bin_; + min_data_in_bin_ = dataset->min_data_in_bin_; + bin_construct_sample_cnt_ = dataset->bin_construct_sample_cnt_; + use_missing_ = dataset->use_missing_; + zero_as_missing_ = dataset->zero_as_missing_; + feature2group_.clear(); + feature2subfeature_.clear(); + has_raw_ = dataset->has_raw(); + numeric_feature_map_ = dataset->numeric_feature_map_; + num_numeric_features_ = dataset->num_numeric_features_; + // copy feature bin mapper data + feature_need_push_zeros_.clear(); + group_bin_boundaries_.clear(); + uint64_t num_total_bin = 0; + group_bin_boundaries_.push_back(num_total_bin); + group_feature_start_.resize(num_groups_); + group_feature_cnt_.resize(num_groups_); + + int cur_feature_index = 0; + for (int i = 0; i < num_groups_; ++i) { + int original_group_index = i % dataset->num_groups_; + int original_group_feature_start = dataset->group_feature_start_[original_group_index]; + group_feature_start_[i] = cur_feature_index; + for (int feature_index_in_group = 0; feature_index_in_group < dataset->group_feature_cnt_[original_group_index]; ++feature_index_in_group) { + const BinMapper* feature_bin_mapper = dataset->FeatureBinMapper(original_group_feature_start + feature_index_in_group); + if (feature_bin_mapper->GetDefaultBin() != feature_bin_mapper->GetMostFreqBin()) { + feature_need_push_zeros_.push_back(cur_feature_index); + } + feature2group_.push_back(i); + feature2subfeature_.push_back(dataset->feature2subfeature_[original_group_feature_start + feature_index_in_group]); + cur_feature_index += 1; + } + feature_groups_.emplace_back(new FeatureGroup(*dataset->feature_groups_[original_group_index].get(), num_data_)); + num_total_bin += dataset->FeatureGroupNumBin(original_group_index); + group_bin_boundaries_.push_back(num_total_bin); + group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index]; + } + + feature_groups_.shrink_to_fit(); + + used_feature_map_.clear(); + used_feature_map_.reserve(2 * dataset->used_feature_map_.size()); + used_feature_map_.insert(used_feature_map_.begin(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end()); + used_feature_map_.insert(used_feature_map_.begin() + dataset->used_feature_map_.size(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end()); + + feature_names_.clear(); + for (const std::string& feature_name : dataset->feature_names_) { + feature_names_.push_back(feature_name + std::string("_i")); + } + for (const std::string& feature_name : dataset->feature_names_) { + feature_names_.push_back(feature_name + std::string("_j")); + } + + real_feature_idx_.clear(); + for (const int idx : dataset->real_feature_idx_) { + real_feature_idx_.push_back(idx); + } + for (const int idx : dataset->real_feature_idx_) { + real_feature_idx_.push_back(idx + dataset->num_total_features_); + } + + forced_bin_bounds_.clear(); + forced_bin_bounds_.reserve(dataset->forced_bin_bounds_.size() * 2); + forced_bin_bounds_.insert(forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end()); + forced_bin_bounds_.insert(forced_bin_bounds_.begin() + dataset->forced_bin_bounds_.size(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end()); + + num_total_features_ = dataset->num_total_features_ * 2; + label_idx_ = dataset->label_idx_; + device_type_ = dataset->device_type_; + gpu_device_id_ = dataset->gpu_device_id_; +} + void Dataset::ReSize(data_size_t num_data) { if (num_data_ != num_data) { num_data_ = num_data; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index c9e8973addb4..71a306c1c7bf 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -805,5 +805,38 @@ size_t Metadata::SizesInByte() const { return size; } +void Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { + num_data_ = 0; + num_queries_ = metadata.num_queries(); + if (pairwise_ranking_mode_ == PairwiseRankingMode::kRelevance) { + const label_t* labels = metadata.label(); + paired_ranking_item_index_map_.clear(); + const data_size_t* query_boundaries = metadata.query_boundaries(); + data_size_t num_pairs_in_query = 0; + query_boundaries_.clear(); + query_boundaries_.push_back(0); + for (data_size_t query_index = 0; query_index < num_queries_; ++query_index) { + const data_size_t query_start = query_boundaries[query_index]; + const data_size_t query_end = query_boundaries[query_index + 1]; + for (data_size_t item_index_i = query_start; item_index_i < query_end; ++item_index_i) { + const label_t label_i = labels[item_index_i]; + for (data_size_t item_index_j = query_start; item_index_j < query_end; ++item_index_j) { + if (item_index_i == item_index_j) { + continue; + } + const label_t label_j = labels[item_index_j]; + if (label_i != label_j) { + paired_ranking_item_index_map_.push_back(std::pair{item_index_i, item_index_j}); + ++num_pairs_in_query; + ++num_data_; + } + } + } + query_boundaries_.push_back(num_pairs_in_query); + } + } else { + // TODO(shiyu1994) + } +} } // namespace LightGBM diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp new file mode 100644 index 000000000000..7407029b2bed --- /dev/null +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -0,0 +1,107 @@ +/*! + * Copyright (c) 2016 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_IO_PAIRWISE_LAMBDARANK_BIN_HPP_ +#define LIGHTGBM_IO_PAIRWISE_LAMBDARANK_BIN_HPP_ + +#include + +namespace LightGBM { + +template +class PairwiseRankingFirstIterator: public BinIterator { + public: + PairwiseRankingFirstIterator(const Bin* unpaired_bin, const std::pair* paired_ranking_item_index_map) { + unpaired_bin_ = unpaired_bin; + unpaired_bin_iterator_ = unpaired_bin_->GetIterator(); + unpaired_bin_iterator_->Reset(); + paired_ranking_item_index_map_ = paired_ranking_item_index_map; + prev_index_ = 0; + prev_val_ = 0; + } + + ~PairwiseRankingFirstIterator() {} + + uint32_t Get(data_size_t idx) { + const data_size_t data_index = paired_ranking_item_index_map[idx].first + if (data_index != prev_index_) { + CHECK_GT(data_index, prev_index_); + prev_val_ = unpaired_bin_iterator_i_->Get(data_index); + } + prev_index_ = data_index; + return prev_val_; + } + + uint32_t RawGet(data_size_t idx) { + const data_size_t data_index = paired_ranking_item_index_map[idx].first + if (data_index != prev_index_) { + CHECK_GT(data_index, prev_index_); + prev_val_ = unpaired_bin_iterator_i_->RawGet(data_index); + } + prev_index_ = data_index; + return prev_val_; + } + + void Reset(data_size_t idx) { + unpaired_bin_iterator_->Reset(idx); + prev_index_ = 0; + prev_val_ = 0; + } + + private: + const Bin* unpaired_bin_; + BinIterator* unpaired_bin_iterator_; + const std::pair* paired_ranking_item_index_map_; + const data_size_t prev_index_; + const uint32_t prev_val_; +}; + +template +class PairwiseRankingSecondIterator: public BinIterator { + public: + PairwiseRankingSecondIterator(const Bin* unpaired_bin, const std::pair* paired_ranking_item_index_map) { + unpaired_bin_ = unpaired_bin; + unpaired_bin_iterator_ = unpaired_bin_->GetIterator(); + unpaired_bin_iterator_->Reset(); + paired_ranking_item_index_map_ = paired_ranking_item_index_map; + prev_index_ = 0; + prev_val_ = 0; + } + + ~PairwiseRankingSecondIterator() {} + + uint32_t Get(data_size_t idx) { + const data_size_t data_index = paired_ranking_item_index_map[idx].second + if (data_index < prev_index_) { + unpaired_bin_iterator_i_.Reset(0); + } + prev_index_ = data_index; + return unpaired_bin_iterator_i_->Get(data_index); + } + + uint32_t RawGet(data_size_t idx) { + const data_size_t data_index = paired_ranking_item_index_map[idx].second + if (data_index < prev_index_) { + unpaired_bin_iterator_i_.Reset(0); + } + prev_index_ = data_index; + return unpaired_bin_iterator_i_->RawGet(data_index); + } + + void Reset(data_size_t idx) { + unpaired_bin_iterator_->Reset(idx); + prev_index_ = 0; + } + + private: + const Bin* unpaired_bin_; + BinIterator* unpaired_bin_iterator_; + const std::pair* paired_ranking_item_index_map_; + const data_size_t prev_index_; +}; + +} // LightGBM + +#endif // LIGHTGBM_IO_PAIRWISE_LAMBDARANK_BIN_HPP_ diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index ae3b74651759..8b63a0c30710 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -449,5 +449,25 @@ class RankXENDCG : public RankingObjective { mutable std::vector rands_; }; + +class PairwiseLambdarankNDCG: public LambdarankNDCG { + public: + explicit PairwiseLambdarankNDCG(const Config& config): LambdarankNDCG(config) {} + + explicit PairwiseLambdarankNDCG(const std::vector& strs): LambdarankNDCG(strs) {} + + ~PairwiseLambdarankNDCG() {} + + void Init(const Metadata& metadata, data_size_t num_data) override { + LambdarankNDCG::Init(metadata, num_data); + + paired_index_map_ = metadata.paired_ranking_item_index_map(); + } + + private: + const std::pair* paired_index_map_; +}; + + } // namespace LightGBM #endif // LightGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ From 23140999d28bcaac54f1c5ad68f2ee8c341c49e0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Nov 2023 06:24:53 +0000 Subject: [PATCH 02/68] remove unrelated changes --- src/boosting/cuda/nccl_gbdt.cpp | 429 -------------------------------- 1 file changed, 429 deletions(-) delete mode 100644 src/boosting/cuda/nccl_gbdt.cpp diff --git a/src/boosting/cuda/nccl_gbdt.cpp b/src/boosting/cuda/nccl_gbdt.cpp deleted file mode 100644 index 47672a53679c..000000000000 --- a/src/boosting/cuda/nccl_gbdt.cpp +++ /dev/null @@ -1,429 +0,0 @@ -/*! - * Copyright (c) 2023 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include "nccl_gbdt.hpp" -#include - -#ifdef USE_CUDA - -namespace LightGBM { - -template -NCCLGBDT::NCCLGBDT(): GBDT_T() {} - -template -NCCLGBDT::~NCCLGBDT() {} - -template -void NCCLGBDT::Init( - const Config* gbdt_config, const Dataset* train_data, - const ObjectiveFunction* objective_function, - const std::vector& training_metrics) { - GBDT_T::Init(gbdt_config, train_data, objective_function, training_metrics); - int max_num_gpu = 0; - CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&max_num_gpu)); - num_gpu_ = this->config_->num_gpu; - if (num_gpu_ > max_num_gpu) { - Log::Warning("Specifying %d GPUs, but only %d available.", num_gpu_, max_num_gpu); - num_gpu_ = max_num_gpu; - } - int gpu_device_id = this->config_->gpu_device_id; - if (this->config_->gpu_device_list == std::string("")) { - if (gpu_device_id < 0 || gpu_device_id >= num_gpu_) { - Log::Warning("Master GPU Device ID %d is not in the valid range [%d, %d], will use GPU 0 as master.", gpu_device_id, 0, max_num_gpu); - gpu_device_id = 0; - } - } - master_gpu_device_id_ = gpu_device_id; - master_gpu_index_ = master_gpu_device_id_; - - if (this->config_->gpu_device_list != std::string("")) { - std::vector gpu_list_str = Common::Split(this->config_->gpu_device_list.c_str(), ","); - for (const auto& gpu_str : gpu_list_str) { - int gpu_id = 0; - Common::Atoi(gpu_str.c_str(), &gpu_id); - gpu_list_.emplace_back(gpu_id); - } - bool check_master_gpu = false; - for (int i = 0; i < static_cast(gpu_list_.size()); ++i) { - const int gpu_id = gpu_list_[i]; - if (gpu_id == master_gpu_device_id_) { - master_gpu_index_ = i; - check_master_gpu = true; - } - } - if (!check_master_gpu) { - Log::Fatal("Master GPU ID %d is not in GPU ID list.", master_gpu_device_id_); - } - } - - const int num_threads = OMP_NUM_THREADS(); - if (num_gpu_ > num_threads) { - Log::Fatal("Number of GPUs %d is greather than the number of threads %d. Please use more threads.", num_gpu_, num_threads); - } - - InitNCCL(); - - // partition data across GPUs - const data_size_t num_data_per_gpu = (this->num_data_ + num_gpu_ - 1) / num_gpu_; - std::vector all_data_indices(this->num_data_, 0); - #pragma omp parallel for schedule(static) - for (data_size_t i = 0; i < this->num_data_; ++i) { - all_data_indices[i] = i; - } - per_gpu_data_start_.resize(num_gpu_); - per_gpu_data_end_.resize(num_gpu_); - per_gpu_datasets_.resize(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - const data_size_t data_start = num_data_per_gpu * gpu_index; - const data_size_t data_end = std::min(data_start + num_data_per_gpu, this->num_data_); - const data_size_t num_data_in_gpu = data_end - data_start; - per_gpu_data_start_[gpu_index] = data_start; - per_gpu_data_end_[gpu_index] = data_end; - per_gpu_datasets_[gpu_index].reset(new Dataset(num_data_in_gpu)); - per_gpu_datasets_[gpu_index]->ReSize(num_data_in_gpu); - per_gpu_datasets_[gpu_index]->CopyFeatureMapperFrom(this->train_data_); - per_gpu_datasets_[gpu_index]->CopySubrow(this->train_data_, all_data_indices.data() + data_start, num_data_in_gpu, true, data_start, data_end, GetCUDADevice(gpu_index)); - } - - // initialize per gpu objectives, training scores and tree learners - per_gpu_objective_functions_.resize(num_gpu_); - per_gpu_train_score_updater_.resize(num_gpu_); - per_gpu_gradients_.resize(num_gpu_); - per_gpu_hessians_.resize(num_gpu_); - per_gpu_tree_learners_.resize(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - per_gpu_data_start_[gpu_index]; - per_gpu_objective_functions_[gpu_index].reset(ObjectiveFunction::CreateObjectiveFunction(this->config_->objective, *(this->config_.get()))); - per_gpu_objective_functions_[gpu_index]->Init(per_gpu_datasets_[gpu_index]->metadata(), per_gpu_datasets_[gpu_index]->num_data()); - per_gpu_objective_functions_[gpu_index]->SetNCCLComm(&nccl_communicators_[gpu_index]); - per_gpu_train_score_updater_[gpu_index].reset(new CUDAScoreUpdater(per_gpu_datasets_[gpu_index].get(), this->num_tree_per_iteration_)); - per_gpu_gradients_[gpu_index].reset(new CUDAVector(num_data_in_gpu)); - per_gpu_hessians_[gpu_index].reset(new CUDAVector(num_data_in_gpu)); - per_gpu_tree_learners_[gpu_index].reset(TreeLearner::CreateTreeLearner( - this->config_->tree_learner, - this->config_->device_type, - this->config_.get())); - per_gpu_tree_learners_[gpu_index]->SetNCCL(&nccl_communicators_[gpu_index], nccl_gpu_rank_[gpu_index], GetCUDADevice(gpu_index), this->num_data_); - per_gpu_tree_learners_[gpu_index]->Init(per_gpu_datasets_[gpu_index].get(), this->is_constant_hessian_); - } - - // initialize host threads and thread data - host_threads_.resize(num_gpu_); - boosting_thread_data_.resize(num_gpu_); - train_tree_learner_thread_data_.resize(num_gpu_); - update_score_thread_data_.resize(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - boosting_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); - boosting_thread_data_[gpu_index].gpu_objective_function = per_gpu_objective_functions_[gpu_index].get(); - boosting_thread_data_[gpu_index].gradients = per_gpu_gradients_[gpu_index]->RawData(); - boosting_thread_data_[gpu_index].hessians = per_gpu_hessians_[gpu_index]->RawData(); - boosting_thread_data_[gpu_index].score = per_gpu_train_score_updater_[gpu_index]->score(); - train_tree_learner_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); - train_tree_learner_thread_data_[gpu_index].gpu_tree_learner = per_gpu_tree_learners_[gpu_index].get(); - train_tree_learner_thread_data_[gpu_index].gradients = per_gpu_gradients_[gpu_index]->RawData(); - train_tree_learner_thread_data_[gpu_index].hessians = per_gpu_hessians_[gpu_index]->RawData(); - train_tree_learner_thread_data_[gpu_index].num_data_in_gpu = per_gpu_data_end_[gpu_index] - per_gpu_data_start_[gpu_index]; - update_score_thread_data_[gpu_index].gpu_index = GetCUDADevice(gpu_index); - update_score_thread_data_[gpu_index].gpu_score_updater = per_gpu_train_score_updater_[gpu_index].get(); - update_score_thread_data_[gpu_index].gpu_tree_learner = per_gpu_tree_learners_[gpu_index].get(); - } - - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); -} - -template -void NCCLGBDT::InitNCCL() { - nccl_gpu_rank_.resize(num_gpu_, -1); - nccl_communicators_.resize(num_gpu_); - ncclUniqueId nccl_unique_id; - if (Network::num_machines() == 1 || Network::rank() == 0) { - NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); - } - if (Network::num_machines() > 1) { - std::vector output_buffer(Network::num_machines()); - Network::Allgather( - reinterpret_cast(&nccl_unique_id), - sizeof(ncclUniqueId) / sizeof(char), - reinterpret_cast(output_buffer.data())); - if (Network::rank() > 0) { - nccl_unique_id = output_buffer[0]; - } - } - - if (Network::num_machines() > 1) { - std::vector num_gpus_per_machine(Network::num_machines() + 1, 0); - Network::Allgather( - reinterpret_cast(&num_gpu_), - sizeof(int) / sizeof(char), - reinterpret_cast(num_gpus_per_machine.data() + 1)); - for (int rank = 1; rank < Network::num_machines() + 1; ++rank) { - num_gpus_per_machine[rank] += num_gpus_per_machine[rank - 1]; - } - CHECK_EQ(num_gpus_per_machine[Network::rank() + 1] - num_gpus_per_machine[Network::rank()], num_gpu_); - NCCLCHECK(ncclGroupStart()); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - nccl_gpu_rank_[gpu_index] = gpu_index + num_gpus_per_machine[Network::rank()]; - NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpus_per_machine.back(), nccl_unique_id, nccl_gpu_rank_[gpu_index])); - } - NCCLCHECK(ncclGroupEnd()); - } else { - NCCLCHECK(ncclGroupStart()); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - nccl_gpu_rank_[gpu_index] = gpu_index; - NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpu_, nccl_unique_id, gpu_index)); - } - NCCLCHECK(ncclGroupEnd()); - } - - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); -} - -template -void* NCCLGBDT::BoostingThread(void* thread_data) { - const BoostingThreadData* boosting_thread_data = reinterpret_cast(thread_data); - const int gpu_index = boosting_thread_data->gpu_index; - const ObjectiveFunction* objective_function = boosting_thread_data->gpu_objective_function; - score_t* gradients = boosting_thread_data->gradients; - score_t* hessians = boosting_thread_data->hessians; - const double* score = boosting_thread_data->score; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); - objective_function->GetGradients(score, gradients, hessians); - return nullptr; -} - -template -void NCCLGBDT::Boosting() { - Common::FunctionTimer fun_timer("NCCLGBDT::Boosting", global_timer); - if (this->objective_function_ == nullptr) { - Log::Fatal("No object function provided"); - } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_create(&host_threads_[gpu_index], nullptr, BoostingThread, - reinterpret_cast(&boosting_thread_data_[gpu_index]))) { - Log::Fatal("Error in creating boosting threads."); - } - } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_join(host_threads_[gpu_index], nullptr)) { - Log::Fatal("Error in joining boosting threads."); - } - } -} - -template -double NCCLGBDT::BoostFromAverage(int class_id, bool update_scorer) { - double init_score = GBDT_T::BoostFromAverage(class_id, update_scorer); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - if (std::fabs(init_score) > kEpsilon && update_scorer) { - per_gpu_train_score_updater_[gpu_index]->AddScore(init_score, class_id); - } - } - - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); -} - -template -void* NCCLGBDT::TrainTreeLearnerThread(void* thread_data) { - TrainTreeLearnerThreadData* tree_train_learner_thread_data = reinterpret_cast(thread_data); - const int gpu_index = tree_train_learner_thread_data->gpu_index; - const int class_id = tree_train_learner_thread_data->class_id; - const data_size_t num_data_in_gpu = tree_train_learner_thread_data->num_data_in_gpu; - const score_t* gradients = tree_train_learner_thread_data->gradients + class_id * num_data_in_gpu; - const score_t* hessians = tree_train_learner_thread_data->hessians + class_id * num_data_in_gpu; - const bool is_first_tree = tree_train_learner_thread_data->is_first_time; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); - tree_train_learner_thread_data->tree.reset( - tree_train_learner_thread_data->gpu_tree_learner->Train(gradients, hessians, is_first_tree)); - return nullptr; -} - -template -bool NCCLGBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { - Common::FunctionTimer fun_timer("NCCLGBDT::TrainOneIter", global_timer); - std::vector init_scores(this->num_tree_per_iteration_, 0.0); - // boosting first - if (gradients == nullptr || hessians == nullptr) { - for (int cur_tree_id = 0; cur_tree_id < this->num_tree_per_iteration_; ++cur_tree_id) { - init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); - } - Boosting(); - } else { - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - const data_size_t gpu_data_start = per_gpu_data_start_[gpu_index]; - const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - gpu_data_start; - for (int class_id = 0; class_id < this->num_class_; ++class_id) { - CopyFromHostToCUDADevice( - per_gpu_gradients_[gpu_index]->RawData() + class_id * num_data_in_gpu, - gradients + class_id * this->num_data_ + gpu_data_start, num_data_in_gpu, __FILE__, __LINE__); - CopyFromHostToCUDADevice( - per_gpu_hessians_[gpu_index]->RawData() + class_id * num_data_in_gpu, - hessians + class_id * this->num_data_ + gpu_data_start, num_data_in_gpu, __FILE__, __LINE__); - } - } - - // return to master gpu device - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); - } - - bool should_continue = false; - for (int cur_tree_id = 0; cur_tree_id < this->num_tree_per_iteration_; ++cur_tree_id) { - std::vector> new_tree(num_gpu_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - new_tree[gpu_index].reset(nullptr); - } - if (this->class_need_train_[cur_tree_id] && this->train_data_->num_features() > 0) { - if (this->is_use_subset_ && this->bag_data_cnt_ < this->num_data_) { - Log::Fatal("Bagging is not supported for NCCLGBDT"); - } - bool is_first_tree = this->models_.size() < static_cast(this->num_tree_per_iteration_); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - train_tree_learner_thread_data_[gpu_index].is_first_time = is_first_tree; - train_tree_learner_thread_data_[gpu_index].class_id = cur_tree_id; - if (pthread_create(&host_threads_[gpu_index], nullptr, TrainTreeLearnerThread, - reinterpret_cast(&train_tree_learner_thread_data_[gpu_index]))) { - Log::Fatal("Error in creating tree training threads."); - } - } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_join(host_threads_[gpu_index], nullptr)) { - Log::Fatal("Error in joining tree training threads."); - } - new_tree[gpu_index].reset(train_tree_learner_thread_data_[gpu_index].tree.release()); - } - } - - if (new_tree[master_gpu_index_]->num_leaves() > 1) { - should_continue = true; - if (this->objective_function_ != nullptr && this->objective_function_->IsRenewTreeOutput()) { - Log::Fatal("Objective function with renewing is not supported for NCCLGBDT."); - } - // shrinkage by learning rate - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - new_tree[gpu_index]->Shrinkage(this->shrinkage_rate_); - } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); - // update score - UpdateScore(new_tree, cur_tree_id); - if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) { - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - SetCUDADevice(gpu_index); - new_tree[gpu_index]->AddBias(init_scores[cur_tree_id]); - } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); - } - } else { - // only add default score one-time - if (this->models_.size() < static_cast(this->num_tree_per_iteration_)) { - Log::Warning("Training stopped with no splits."); - } - } - - // add model - this->models_.push_back(std::move(new_tree[master_gpu_index_])); - - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (gpu_index != master_gpu_index_) { - SetCUDADevice(gpu_index); - new_tree[gpu_index].reset(nullptr); - } - } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); - } - - if (!should_continue) { - Log::Warning("Stopped training because there are no more leaves that meet the split requirements"); - if (this->models_.size() > static_cast(this->num_tree_per_iteration_)) { - for (int cur_tree_id = 0; cur_tree_id < this->num_tree_per_iteration_; ++cur_tree_id) { - this->models_.pop_back(); - } - } - return true; - } - - ++this->iter_; - return false; -} - -template -void* NCCLGBDT::UpdateScoreThread(void* thread_data) { - const UpdateScoreThreadData* update_score_thread_data = reinterpret_cast(thread_data); - const int gpu_index = update_score_thread_data->gpu_index; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_index)); - update_score_thread_data->gpu_score_updater->AddScore( - update_score_thread_data->gpu_tree_learner, - update_score_thread_data->tree, - update_score_thread_data->cur_tree_id); - return nullptr; -} - -template -void NCCLGBDT::UpdateScore(const std::vector>& tree, const int cur_tree_id) { - Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer); - // update training score - if (!this->is_use_subset_) { - if (this->num_data_ - this->bag_data_cnt_ > 0) { - Log::Fatal("bagging is not supported for NCCLGBDT."); - } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - update_score_thread_data_[gpu_index].tree = tree[gpu_index].get(); - update_score_thread_data_[gpu_index].cur_tree_id = cur_tree_id; - if (pthread_create(&host_threads_[gpu_index], nullptr, UpdateScoreThread, - reinterpret_cast(&update_score_thread_data_[gpu_index]))) { - Log::Fatal("Error in creating update score threads."); - } - } - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - if (pthread_join(host_threads_[gpu_index], nullptr)) { - Log::Fatal("Error in joining tree training threads."); - } - } - } else { - Log::Fatal("bagging is not supported for NCCLGBDT."); - } - - // update validation score - for (auto& score_updater : this->valid_score_updater_) { - score_updater->AddScore(tree[master_gpu_index_].get(), cur_tree_id); - } -} - -template -std::vector NCCLGBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { - if (score == this->train_score_updater_->score()) { - // delegate to per gpu train score updater - std::vector tmp_score(num_data * this->num_class_, 0.0f); - for (int gpu_index = 0; gpu_index < num_gpu_; ++gpu_index) { - const data_size_t data_start = per_gpu_data_start_[gpu_index]; - const data_size_t num_data_in_gpu = per_gpu_data_end_[gpu_index] - data_start; - for (int class_id = 0; class_id < this->num_class_; ++class_id) { - SetCUDADevice(gpu_index); - CopyFromCUDADeviceToHost(tmp_score.data() + class_id * this->num_data_ + data_start, - per_gpu_train_score_updater_[gpu_index]->score() + class_id * num_data_in_gpu, - static_cast(num_data_in_gpu), __FILE__, __LINE__); - } - CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_)); - return metric->Eval(tmp_score.data(), this->objective_function_); - } - } else { - return GBDT_T::EvalOneMetric(metric, score, num_data); - } -} - -template class NCCLGBDT; - -} // LightGBM - -#endif // USE_CUDA From da5f02d45f2278b3417c219a76b471df25fb9c5f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 5 Dec 2023 09:16:17 +0000 Subject: [PATCH 03/68] first version of pairwie ranking bin --- include/LightGBM/bin.h | 20 ++++ include/LightGBM/dataset.h | 2 +- include/LightGBM/feature_group.h | 4 +- .../LightGBM/pairwise_ranking_feature_group.h | 62 +++++------- src/io/bin.cpp | 35 +++++++ src/io/dataset.cpp | 4 +- src/io/pairwise_lambdarank_bin.hpp | 95 ++++++++++++++----- src/io/pairwise_ranking_feature_group.cpp | 57 +++++++++++ 8 files changed, 215 insertions(+), 64 deletions(-) create mode 100644 src/io/pairwise_ranking_feature_group.cpp diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index de1bb6eb94ed..be306acf6928 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -466,6 +466,26 @@ class Bin { */ static Bin* CreateSparseBin(data_size_t num_data, int num_bin); + /*! + * \brief Create object for bin data of one feature, used for pairwise ranking, for an original dense bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \return The bin data object + */ + template typename PAIRWISE_BIN_TYPE> + static Bin* CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + + /*! + * \brief Create object for bin data of one feature, used for pairwise ranking, for an original sparse bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \return The bin data object + */ + template typename PAIRWISE_BIN_TYPE> + static Bin* CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + /*! * \brief Deep copy the bin */ diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index fe562309e9b0..3e6f7910bc4d 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -201,7 +201,7 @@ class Metadata { /*! * \brief Build metadata for ranking with pairwise features from metadata of an existing ranking dataset - * \param metadata Pointer to metadata of the existing ranking dataset + * \param metadata Reference to metadata of the existing ranking dataset */ void BuildPairwiseFeatureRanking(const Metadata& metadata); diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 41caac8b3443..9ecbadf57cc6 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -350,7 +350,7 @@ class FeatureGroup { uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1; return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin); } else { - int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1; + int addi = most_freq_bin == 0 ? 0 : 1; uint32_t min_bin = 1; uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi; return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, @@ -582,7 +582,7 @@ class FeatureGroup { } protected: - void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { + virtual void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { if (is_multi_val) { multi_bin_data_.clear(); for (int i = 0; i < num_feature_; ++i) { diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h index bf668150e862..2055855b775c 100644 --- a/include/LightGBM/pairwise_ranking_feature_group.h +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -3,8 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ -#ifndef LIGHTGBM_PAIRWISE_FEATURE_GROUP_H_ -#define LIGHTGBM_PAIRWISE_FEATURE_GROUP_H_ +#ifndef LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_ +#define LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_ #include "feature_group.h" @@ -25,23 +25,11 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { * \param bin_mappers Bin mapper for features * \param num_data Total number of data * \param is_enable_sparse True if enable sparse feature + * \param is_first_or_second_in_pairing Mark whether features in this group belong to the first or second element in the pairing */ - PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_data) { - num_feature_ = other.num_feature_; - is_multi_val_ = false; - is_dense_multi_val_ = false; - is_sparse_ = false; - num_total_bin_ = other.num_total_bin_; - bin_offsets_ = other.bin_offsets_; - num_data_ = num_data; - - bin_mappers_.reserve(other.bin_mappers_.size()); - for (auto& bin_mapper : other.bin_mappers_) { - bin_mappers_.emplace_back(new BinMapper(*bin_mapper)); - } - CreateBinData(num_data, is_multi_val_, !is_sparse_, is_sparse_); - } + PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_data, const int is_first_or_second_in_pairing): + FeatureGroup(other, num_data), is_first_or_second_in_pairing_(is_first_or_second_in_pairing) {} /*! * \brief Constructor from memory when data is present @@ -50,21 +38,21 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { * \param local_used_indices Local used indices, empty means using all data * \param group_id Id of group */ - PairwiseRankingFeatureGroup(const void* memory, - data_size_t num_all_data, - const std::vector& local_used_indices, - int group_id) { - // TODO(shiyu1994) - } - - /*! - * \brief Constructor from definition in memory (without data) - * \param memory Pointer of memory - * \param local_used_indices Local used indices, empty means using all data - */ - FeatureGroup(const void* memory, data_size_t num_data, int group_id) { - // TODO(shiyu1994) - } + // PairwiseRankingFeatureGroup(const void* memory, + // data_size_t num_all_data, + // const std::vector& local_used_indices, + // int group_id) { + // // TODO(shiyu1994) + // } + + // /*! + // * \brief Constructor from definition in memory (without data) + // * \param memory Pointer of memory + // * \param local_used_indices Local used indices, empty means using all data + // */ + // PairwiseRankingFeatureGroup(const void* memory, data_size_t num_data, int group_id): FeatureGroup(memory, num_data, group_id) { + // // TODO(shiyu1994) + // } /*! \brief Destructor */ ~PairwiseRankingFeatureGroup() {} @@ -91,17 +79,19 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { } private: - void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { - // TODO(shiyu1994) - } + template + void CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse); + void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override; /*! \brief Pairwise data index to original data indices for ranking with pairwise features */ const std::pair* paired_ranking_item_index_map_; /*! \brief Number of pairwise data */ data_size_t num_data_; + /*! \brief Mark whether features in this group belong to the first or second element in the pairing */ + const int is_first_or_second_in_pairing_; }; } // namespace LightGBM -#endif // LIGHTGBM_PAIRWISE_FEATURE_GROUP_H_ +#endif // LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_ diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 3d84599e6589..95626d150d96 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -17,6 +17,7 @@ #include "multi_val_dense_bin.hpp" #include "multi_val_sparse_bin.hpp" #include "sparse_bin.hpp" +#include "pairwise_lambdarank_bin.hpp" namespace LightGBM { @@ -632,6 +633,38 @@ namespace LightGBM { } } + template typename PAIRWISE_BIN_TYPE> + Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { + if (num_bin <= 16) { + return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + } else if (num_bin <= 256) { + return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + } else if (num_bin <= 65536) { + return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + } else { + return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + } + } + + template typename PAIRWISE_BIN_TYPE> + Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { + if (num_bin <= 256) { + return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new SparseBin(num_data), min_bin, max_bin, most_freq_bin); + } else if (num_bin <= 65536) { + return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new SparseBin(num_data), min_bin, max_bin, most_freq_bin); + } else { + return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new SparseBin(num_data), min_bin, max_bin, most_freq_bin); + } + } + + template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + + template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + + template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + + template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate, const std::vector& offsets) { if (sparse_rate >= multi_val_bin_sparse_threshold) { @@ -705,6 +738,8 @@ namespace LightGBM { } } + + template <> const void* DenseBin::GetColWiseData( uint8_t* bit_type, diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 0fe208147aa5..3ba550c50cff 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -847,6 +848,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vectornum_groups_; int original_group_feature_start = dataset->group_feature_start_[original_group_index]; + const int is_first_or_second_in_pairing = original_group_index / dataset->num_groups_; // 0 for first, 1 for second group_feature_start_[i] = cur_feature_index; for (int feature_index_in_group = 0; feature_index_in_group < dataset->group_feature_cnt_[original_group_index]; ++feature_index_in_group) { const BinMapper* feature_bin_mapper = dataset->FeatureBinMapper(original_group_feature_start + feature_index_in_group); @@ -857,7 +859,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vectorfeature2subfeature_[original_group_feature_start + feature_index_in_group]); cur_feature_index += 1; } - feature_groups_.emplace_back(new FeatureGroup(*dataset->feature_groups_[original_group_index].get(), num_data_)); + feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), num_data_, is_first_or_second_in_pairing)); num_total_bin += dataset->FeatureGroupNumBin(original_group_index); group_bin_boundaries_.push_back(num_total_bin); group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index]; diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index 7407029b2bed..1e177c694170 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -10,13 +10,20 @@ namespace LightGBM { -template +template +class PairwiseRankingFirstBin; + +template +class PairwiseRankingSecondBin; + +template class PairwiseRankingFirstIterator: public BinIterator { + friend PairwiseRankingFirstBin; public: - PairwiseRankingFirstIterator(const Bin* unpaired_bin, const std::pair* paired_ranking_item_index_map) { + PairwiseRankingFirstIterator(const BIN_TYPE* unpaired_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { unpaired_bin_ = unpaired_bin; - unpaired_bin_iterator_ = unpaired_bin_->GetIterator(); - unpaired_bin_iterator_->Reset(); + unpaired_bin_iterator_ = unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin); + unpaired_bin_iterator_->Reset(0); paired_ranking_item_index_map_ = paired_ranking_item_index_map; prev_index_ = 0; prev_val_ = 0; @@ -25,20 +32,20 @@ class PairwiseRankingFirstIterator: public BinIterator { ~PairwiseRankingFirstIterator() {} uint32_t Get(data_size_t idx) { - const data_size_t data_index = paired_ranking_item_index_map[idx].first + const data_size_t data_index = paired_ranking_item_index_map_[idx].first; if (data_index != prev_index_) { CHECK_GT(data_index, prev_index_); - prev_val_ = unpaired_bin_iterator_i_->Get(data_index); + prev_val_ = unpaired_bin_iterator_->Get(data_index); } prev_index_ = data_index; return prev_val_; } uint32_t RawGet(data_size_t idx) { - const data_size_t data_index = paired_ranking_item_index_map[idx].first + const data_size_t data_index = paired_ranking_item_index_map_[idx].first; if (data_index != prev_index_) { CHECK_GT(data_index, prev_index_); - prev_val_ = unpaired_bin_iterator_i_->RawGet(data_index); + prev_val_ = unpaired_bin_iterator_->RawGet(data_index); } prev_index_ = data_index; return prev_val_; @@ -51,43 +58,43 @@ class PairwiseRankingFirstIterator: public BinIterator { } private: - const Bin* unpaired_bin_; + const BIN_TYPE* unpaired_bin_; BinIterator* unpaired_bin_iterator_; const std::pair* paired_ranking_item_index_map_; - const data_size_t prev_index_; - const uint32_t prev_val_; + data_size_t prev_index_; + uint32_t prev_val_; }; -template +template class PairwiseRankingSecondIterator: public BinIterator { + friend PairwiseRankingSecondBin; public: - PairwiseRankingSecondIterator(const Bin* unpaired_bin, const std::pair* paired_ranking_item_index_map) { + PairwiseRankingSecondIterator(const BIN_TYPE* unpaired_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { unpaired_bin_ = unpaired_bin; - unpaired_bin_iterator_ = unpaired_bin_->GetIterator(); - unpaired_bin_iterator_->Reset(); + unpaired_bin_iterator_ = unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin); + unpaired_bin_iterator_->Reset(0); paired_ranking_item_index_map_ = paired_ranking_item_index_map; prev_index_ = 0; - prev_val_ = 0; } ~PairwiseRankingSecondIterator() {} uint32_t Get(data_size_t idx) { - const data_size_t data_index = paired_ranking_item_index_map[idx].second + const data_size_t data_index = paired_ranking_item_index_map_[idx].second; if (data_index < prev_index_) { - unpaired_bin_iterator_i_.Reset(0); + unpaired_bin_iterator_->Reset(0); } prev_index_ = data_index; - return unpaired_bin_iterator_i_->Get(data_index); + return unpaired_bin_iterator_->Get(data_index); } uint32_t RawGet(data_size_t idx) { - const data_size_t data_index = paired_ranking_item_index_map[idx].second + const data_size_t data_index = paired_ranking_item_index_map_[idx].second; if (data_index < prev_index_) { - unpaired_bin_iterator_i_.Reset(0); + unpaired_bin_iterator_->Reset(0); } prev_index_ = data_index; - return unpaired_bin_iterator_i_->RawGet(data_index); + return unpaired_bin_iterator_->RawGet(data_index); } void Reset(data_size_t idx) { @@ -96,10 +103,50 @@ class PairwiseRankingSecondIterator: public BinIterator { } private: - const Bin* unpaired_bin_; + const BIN_TYPE* unpaired_bin_; BinIterator* unpaired_bin_iterator_; const std::pair* paired_ranking_item_index_map_; - const data_size_t prev_index_; + data_size_t prev_index_; +}; + +template +class PairwiseRankingFirstBin: public BIN_TYPE { + public: + PairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin): BIN_TYPE(0), unpaired_bin_(unpaired_bin), min_bin_(min_bin), max_bin_(max_bin), most_freq_bin_(most_freq_bin) { + paired_ranking_item_index_map_ = paired_ranking_item_index_map; + num_data_ = num_data; + } + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return new PairwiseRankingFirstIterator(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin_, max_bin_, most_freq_bin_); + } + + private: + const std::pair* paired_ranking_item_index_map_; + const std::shared_ptr unpaired_bin_; + const uint32_t min_bin_; + const uint32_t max_bin_; + const uint32_t most_freq_bin_; +}; + +template +class PairwiseRankingSecondBin: public BIN_TYPE { + public: + PairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin): BIN_TYPE(0), unpaired_bin_(unpaired_bin), min_bin_(min_bin), max_bin_(max_bin), most_freq_bin_(most_freq_bin) { + paired_ranking_item_index_map_ = paired_ranking_item_index_map; + num_data_ = num_data; + } + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return new PairwiseRankingSecondIterator(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin_, max_bin_, most_freq_bin_); + } + + private: + const std::pair* paired_ranking_item_index_map_; + const std::shared_ptr unpaired_bin_; + const uint32_t min_bin_; + const uint32_t max_bin_; + const uint32_t most_freq_bin_; }; } // LightGBM diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp new file mode 100644 index 000000000000..f0b9485fb266 --- /dev/null +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -0,0 +1,57 @@ +/*! + * Copyright (c) 2016 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#include +#include "pairwise_lambdarank_bin.hpp" + +namespace LightGBM { + +template typename PAIRWISE_BIN_TYPE> +void PairwiseRankingFeatureGroup::CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { + if (is_multi_val) { + multi_bin_data_.clear(); + for (int i = 0; i < num_feature_; ++i) { + uint32_t most_freq_bin = bin_mappers_[i]->GetMostFreqBin(); + int addi = most_freq_bin == 0 ? 0 : 1; + if (!is_multi_val) { + uint32_t min_bin = bin_offsets_[i]; + uint32_t max_bin = bin_offsets_[i + 1] - 1; + } else { + uint32_t min_bin = 1; + uint32_t max_bin = bin_mappers_[i]->num_bin() - 1 + addi; + } + if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { + multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingBin( + num_data, bin_mappers_[i]->num_bin() + addi, paired_ranking_item_index_map_)); + } else { + multi_bin_data_.emplace_back( + Bin::CreateDensePairwiseRankingBin(num_data, bin_mappers_[i]->num_bin() + addi, paired_ranking_item_index_map_)); + } + } + is_multi_val_ = true; + } else { + if (force_sparse || + (!force_dense && num_feature_ == 1 && + bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { + is_sparse_ = true; + bin_data_.reset(Bin::CreateSparsePairwiseRankingBin(num_data, num_total_bin_, paired_ranking_item_index_map_)); + } else { + is_sparse_ = false; + bin_data_.reset(Bin::CreateDensePairwiseRankingBin(num_data, num_total_bin_, paired_ranking_item_index_map_)); + } + is_multi_val_ = false; + } +} + +void PairwiseRankingFeatureGroup::CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { + if (is_first_or_second_in_pairing_ == 0) { + CreateBinDataInner(num_data, is_multi_val, force_dense, force_sparse); + } else { + CreateBinDataInner(num_data, is_multi_val, force_dense, force_sparse); + } +} + +} // namespace LightGBM From 0cb436db5cd5d5791404da8fc050d36d8e93d4bc Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 5 Dec 2023 14:01:17 +0000 Subject: [PATCH 04/68] templates for bins in pairwise ranking dataset --- include/LightGBM/bin.h | 4 +- include/LightGBM/dataset.h | 4 ++ .../LightGBM/pairwise_ranking_feature_group.h | 12 +++--- src/io/bin.cpp | 26 ++++++------- src/io/dataset.cpp | 4 +- src/io/pairwise_lambdarank_bin.hpp | 37 +++++++------------ src/io/pairwise_ranking_feature_group.cpp | 19 +++------- 7 files changed, 46 insertions(+), 60 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index be306acf6928..50adb57714db 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -474,7 +474,7 @@ class Bin { * \return The bin data object */ template typename PAIRWISE_BIN_TYPE> - static Bin* CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + static Bin* CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); /*! * \brief Create object for bin data of one feature, used for pairwise ranking, for an original sparse bin @@ -484,7 +484,7 @@ class Bin { * \return The bin data object */ template typename PAIRWISE_BIN_TYPE> - static Bin* CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + static Bin* CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); /*! * \brief Deep copy the bin diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 00281367b093..79879bc4ad41 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -269,6 +269,10 @@ class Metadata { } } + inline data_size_t paired_ranking_item_index_map_size() const { + return static_cast(paired_ranking_item_index_map_.size()); + } + /*! * \brief Get data boundaries on queries, if not exists, will return nullptr * we assume data will order by query, diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h index 2055855b775c..ed0953e97d0f 100644 --- a/include/LightGBM/pairwise_ranking_feature_group.h +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -28,8 +28,8 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { * \param is_first_or_second_in_pairing Mark whether features in this group belong to the first or second element in the pairing */ - PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_data, const int is_first_or_second_in_pairing): - FeatureGroup(other, num_data), is_first_or_second_in_pairing_(is_first_or_second_in_pairing) {} + PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map): + FeatureGroup(other, num_original_data), paired_ranking_item_index_map_(paired_ranking_item_index_map), num_data_(num_pairs), is_first_or_second_in_pairing_(is_first_or_second_in_pairing) {} /*! * \brief Constructor from memory when data is present @@ -62,11 +62,11 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { * \param memory Pointer of memory * \param group_id Id of group */ - const char* LoadDefinitionFromMemory(const void* memory, int group_id) { + const char* LoadDefinitionFromMemory(const void* /*memory*/, int /*group_id*/) { // TODO(shiyu1994) } - inline BinIterator* SubFeatureIterator(int sub_feature) { + inline BinIterator* SubFeatureIterator(int /*sub_feature*/) { // TODO(shiyu1994) } @@ -79,11 +79,11 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { } private: - template + template typename PAIRWISE_BIN_TYPE> void CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse); void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override; - + /*! \brief Pairwise data index to original data indices for ranking with pairwise features */ const std::pair* paired_ranking_item_index_map_; /*! \brief Number of pairwise data */ diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 95626d150d96..6deabe562ca9 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -634,36 +634,36 @@ namespace LightGBM { } template typename PAIRWISE_BIN_TYPE> - Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { + Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { if (num_bin <= 16) { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } else if (num_bin <= 256) { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } else if (num_bin <= 65536) { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } else { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } } template typename PAIRWISE_BIN_TYPE> - Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { + Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { if (num_bin <= 256) { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new SparseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); } else if (num_bin <= 65536) { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new SparseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); } else { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new SparseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); } } - template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); - template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); - template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); - template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate, const std::vector& offsets) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 4dafa4ef5ced..0b4985c5a2ee 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -820,7 +820,7 @@ void Dataset::CreateValid(const Dataset* dataset) { gpu_device_id_ = dataset->gpu_device_id_; } -void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vector> pair_index_map) { +void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vector> /* TODO(shiyu1994) pair_index_map*/) { metadata_.BuildPairwiseFeatureRanking(dataset->metadata()); feature_groups_.clear(); @@ -859,7 +859,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vectorfeature2subfeature_[original_group_feature_start + feature_index_in_group]); cur_feature_index += 1; } - feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), num_data_, is_first_or_second_in_pairing)); + feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), num_data_, is_first_or_second_in_pairing, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_index_map())); num_total_bin += dataset->FeatureGroupNumBin(original_group_index); group_bin_boundaries_.push_back(num_total_bin); group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index]; diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index 1e177c694170..c3d1854acda3 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -109,44 +109,33 @@ class PairwiseRankingSecondIterator: public BinIterator { data_size_t prev_index_; }; -template -class PairwiseRankingFirstBin: public BIN_TYPE { +template typename ITERATOR_TYPE> +class PairwiseRankingBin: public BIN_TYPE { public: - PairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin): BIN_TYPE(0), unpaired_bin_(unpaired_bin), min_bin_(min_bin), max_bin_(max_bin), most_freq_bin_(most_freq_bin) { - paired_ranking_item_index_map_ = paired_ranking_item_index_map; + PairwiseRankingBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): BIN_TYPE(0), paired_ranking_item_index_map_(paired_ranking_item_index_map), unpaired_bin_(unpaired_bin) { num_data_ = num_data; } BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { - return new PairwiseRankingFirstIterator(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin_, max_bin_, most_freq_bin_); + return new ITERATOR_TYPE(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); } - private: + protected: const std::pair* paired_ranking_item_index_map_; const std::shared_ptr unpaired_bin_; - const uint32_t min_bin_; - const uint32_t max_bin_; - const uint32_t most_freq_bin_; + data_size_t num_data_; }; template -class PairwiseRankingSecondBin: public BIN_TYPE { +class PairwiseRankingFirstBin: public PairwiseRankingBin { public: - PairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin): BIN_TYPE(0), unpaired_bin_(unpaired_bin), min_bin_(min_bin), max_bin_(max_bin), most_freq_bin_(most_freq_bin) { - paired_ranking_item_index_map_ = paired_ranking_item_index_map; - num_data_ = num_data; - } - - BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { - return new PairwiseRankingSecondIterator(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin_, max_bin_, most_freq_bin_); - } + PairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): PairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} +}; - private: - const std::pair* paired_ranking_item_index_map_; - const std::shared_ptr unpaired_bin_; - const uint32_t min_bin_; - const uint32_t max_bin_; - const uint32_t most_freq_bin_; +template +class PairwiseRankingSecondBin: public PairwiseRankingBin { + public: + PairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): PairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} }; } // LightGBM diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index f0b9485fb266..cdde9fbaadc2 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -11,24 +11,17 @@ namespace LightGBM { template typename PAIRWISE_BIN_TYPE> void PairwiseRankingFeatureGroup::CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { + CHECK(!is_multi_val); // do not support multi-value bin for now if (is_multi_val) { multi_bin_data_.clear(); for (int i = 0; i < num_feature_; ++i) { - uint32_t most_freq_bin = bin_mappers_[i]->GetMostFreqBin(); - int addi = most_freq_bin == 0 ? 0 : 1; - if (!is_multi_val) { - uint32_t min_bin = bin_offsets_[i]; - uint32_t max_bin = bin_offsets_[i + 1] - 1; - } else { - uint32_t min_bin = 1; - uint32_t max_bin = bin_mappers_[i]->num_bin() - 1 + addi; - } + int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1; if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingBin( - num_data, bin_mappers_[i]->num_bin() + addi, paired_ranking_item_index_map_)); + num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); } else { multi_bin_data_.emplace_back( - Bin::CreateDensePairwiseRankingBin(num_data, bin_mappers_[i]->num_bin() + addi, paired_ranking_item_index_map_)); + Bin::CreateDensePairwiseRankingBin(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); } } is_multi_val_ = true; @@ -37,10 +30,10 @@ void PairwiseRankingFeatureGroup::CreateBinDataInner(int num_data, bool is_multi (!force_dense && num_feature_ == 1 && bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { is_sparse_ = true; - bin_data_.reset(Bin::CreateSparsePairwiseRankingBin(num_data, num_total_bin_, paired_ranking_item_index_map_)); + bin_data_.reset(Bin::CreateSparsePairwiseRankingBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); } else { is_sparse_ = false; - bin_data_.reset(Bin::CreateDensePairwiseRankingBin(num_data, num_total_bin_, paired_ranking_item_index_map_)); + bin_data_.reset(Bin::CreateDensePairwiseRankingBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); } is_multi_val_ = false; } From 6fbc674e2646b1904d054af71704e97cff6cfa93 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 6 Dec 2023 04:45:31 +0000 Subject: [PATCH 05/68] fix lint issues and compilation errors --- include/LightGBM/bin.h | 5 +++-- include/LightGBM/feature_group.h | 2 +- include/LightGBM/pairwise_ranking_feature_group.h | 9 ++++++--- src/io/bin.cpp | 6 ++---- src/io/pairwise_lambdarank_bin.hpp | 15 ++++++++++----- src/io/pairwise_ranking_feature_group.cpp | 6 +++--- src/objective/rank_objective.hpp | 1 + 7 files changed, 26 insertions(+), 18 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 50adb57714db..2a2a798372bb 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -14,6 +14,7 @@ #include #include #include +#include #include namespace LightGBM { @@ -473,7 +474,7 @@ class Bin { * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair * \return The bin data object */ - template typename PAIRWISE_BIN_TYPE> + template class PAIRWISE_BIN_TYPE> static Bin* CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); /*! @@ -483,7 +484,7 @@ class Bin { * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair * \return The bin data object */ - template typename PAIRWISE_BIN_TYPE> + template class PAIRWISE_BIN_TYPE> static Bin* CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); /*! diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 9ecbadf57cc6..ea01bd84c8da 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -152,7 +152,7 @@ class FeatureGroup { } /*! \brief Destructor */ - ~FeatureGroup() {} + virtual ~FeatureGroup() {} /*! * \brief Load the overall definition of the feature group from binary serialized data diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h index ed0953e97d0f..e01f5027ea32 100644 --- a/include/LightGBM/pairwise_ranking_feature_group.h +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -6,13 +6,13 @@ #ifndef LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_ #define LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_ -#include "feature_group.h" - #include #include #include #include +#include "feature_group.h" + namespace LightGBM { /*! \brief Using to store data and providing some operations on one feature @@ -64,10 +64,12 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { */ const char* LoadDefinitionFromMemory(const void* /*memory*/, int /*group_id*/) { // TODO(shiyu1994) + return nullptr; } inline BinIterator* SubFeatureIterator(int /*sub_feature*/) { // TODO(shiyu1994) + return nullptr; } inline void FinishLoad() { @@ -76,10 +78,11 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { inline BinIterator* FeatureGroupIterator() { // TODO(shiyu1994) + return nullptr; } private: - template typename PAIRWISE_BIN_TYPE> + template class PAIRWISE_BIN_TYPE> void CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse); void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override; diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 6deabe562ca9..4d21532c742f 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -633,7 +633,7 @@ namespace LightGBM { } } - template typename PAIRWISE_BIN_TYPE> + template class PAIRWISE_BIN_TYPE> Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { if (num_bin <= 16) { return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); @@ -646,7 +646,7 @@ namespace LightGBM { } } - template typename PAIRWISE_BIN_TYPE> + template class PAIRWISE_BIN_TYPE> Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { if (num_bin <= 256) { return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); @@ -738,8 +738,6 @@ namespace LightGBM { } } - - template <> const void* DenseBin::GetColWiseData( uint8_t* bit_type, diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index c3d1854acda3..9db802e14e71 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -6,6 +6,9 @@ #ifndef LIGHTGBM_IO_PAIRWISE_LAMBDARANK_BIN_HPP_ #define LIGHTGBM_IO_PAIRWISE_LAMBDARANK_BIN_HPP_ +#include +#include + #include namespace LightGBM { @@ -18,8 +21,9 @@ class PairwiseRankingSecondBin; template class PairwiseRankingFirstIterator: public BinIterator { - friend PairwiseRankingFirstBin; public: + friend PairwiseRankingFirstBin; + PairwiseRankingFirstIterator(const BIN_TYPE* unpaired_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { unpaired_bin_ = unpaired_bin; unpaired_bin_iterator_ = unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin); @@ -67,8 +71,9 @@ class PairwiseRankingFirstIterator: public BinIterator { template class PairwiseRankingSecondIterator: public BinIterator { - friend PairwiseRankingSecondBin; public: + friend PairwiseRankingSecondBin; + PairwiseRankingSecondIterator(const BIN_TYPE* unpaired_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { unpaired_bin_ = unpaired_bin; unpaired_bin_iterator_ = unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin); @@ -109,7 +114,7 @@ class PairwiseRankingSecondIterator: public BinIterator { data_size_t prev_index_; }; -template typename ITERATOR_TYPE> +template class ITERATOR_TYPE> class PairwiseRankingBin: public BIN_TYPE { public: PairwiseRankingBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): BIN_TYPE(0), paired_ranking_item_index_map_(paired_ranking_item_index_map), unpaired_bin_(unpaired_bin) { @@ -122,7 +127,7 @@ class PairwiseRankingBin: public BIN_TYPE { protected: const std::pair* paired_ranking_item_index_map_; - const std::shared_ptr unpaired_bin_; + const std::unique_ptr unpaired_bin_; data_size_t num_data_; }; @@ -138,6 +143,6 @@ class PairwiseRankingSecondBin: public PairwiseRankingBin* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): PairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} }; -} // LightGBM +} // namespace LightGBM #endif // LIGHTGBM_IO_PAIRWISE_LAMBDARANK_BIN_HPP_ diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index cdde9fbaadc2..e0634fe2a9c5 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -9,9 +9,9 @@ namespace LightGBM { -template typename PAIRWISE_BIN_TYPE> +template class PAIRWISE_BIN_TYPE> void PairwiseRankingFeatureGroup::CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { - CHECK(!is_multi_val); // do not support multi-value bin for now + CHECK(!is_multi_val); // do not support multi-value bin for now if (is_multi_val) { multi_bin_data_.clear(); for (int i = 0; i < num_feature_; ++i) { @@ -47,4 +47,4 @@ void PairwiseRankingFeatureGroup::CreateBinData(int num_data, bool is_multi_val, } } -} // namespace LightGBM +} // namespace LightGBM diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 8b63a0c30710..5c5fb3c7183b 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include namespace LightGBM { From 9e16dc33aa1713db65f4489ce9db0e34f958dfdb Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 6 Dec 2023 06:28:05 +0000 Subject: [PATCH 06/68] add methods for pairwise bin --- src/io/dense_bin.hpp | 2 +- src/io/pairwise_lambdarank_bin.cpp | 382 +++++++++++++++++++++++++++++ src/io/pairwise_lambdarank_bin.hpp | 154 +++++++++++- 3 files changed, 533 insertions(+), 5 deletions(-) create mode 100644 src/io/pairwise_lambdarank_bin.cpp diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index e612052e47d2..c84d618ab0f1 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -605,7 +605,7 @@ class DenseBin : public Bin { const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; - private: + protected: data_size_t num_data_; #ifdef USE_CUDA std::vector> data_; diff --git a/src/io/pairwise_lambdarank_bin.cpp b/src/io/pairwise_lambdarank_bin.cpp new file mode 100644 index 000000000000..92901dd17ee1 --- /dev/null +++ b/src/io/pairwise_lambdarank_bin.cpp @@ -0,0 +1,382 @@ +/*! + * Copyright (c) 2016 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#include "pairwise_lambdarank_bin.hpp" + +namespace LightGBM { + +template class ITERATOR_TYPE> +void PairwiseRankingBin::InitStreaming(uint32_t num_thread, int32_t omp_max_threads) { + unpaired_bin_->InitStreaming(num_thread, omp_max_threads); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::Push(int tid, data_size_t idx, uint32_t value) { + unpaired_bin_->Push(tid, idx, value); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) { + unpaired_bin_->CopySubrow(full_bin, used_indices, num_used_indices); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::SaveBinaryToFile(BinaryWriter* writer) const { + unpaired_bin_->SaveBinaryToFile(writer); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::LoadFromMemory(const void* memory, const std::vector& local_used_indices) { + unpaired_bin_->LoadFromMemory(memory, local_used_indices); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::SizesInByte() const { + return unpaired_bin_->SizesInByte(); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::num_data() const { + return unpaired_bin_->num_data(); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::ReSize(data_size_t num_data) { + return unpaired_bin_->ReSize(num_data); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInner(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const { + data_size_t i = start; + hist_t* grad = out; + hist_t* hess = out + 1; + hist_cnt_t* cnt = reinterpret_cast(hess); + if (USE_PREFETCH) { + const data_size_t pf_offset = 64 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + for (; i < pf_end; ++i) { + const auto paired_idx = USE_INDICES ? data_indices[i] : i; + const auto idx = get_unpaired_index(paired_idx); + const auto paired_pf_idx = + USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; + const auto pf_idx = get_unpaired_index(paired_pf_idx); + if (IS_4BIT) { + PREFETCH_T0(unpaired_bin_->data_.data() + (pf_idx >> 1)); + } else { + PREFETCH_T0(unpaired_bin_->data_.data() + pf_idx); + } + const auto ti = static_cast(unpaired_bin_->data(idx)) << 1; + if (USE_HESSIAN) { + grad[ti] += ordered_gradients[i]; + hess[ti] += ordered_hessians[i]; + } else { + grad[ti] += ordered_gradients[i]; + ++cnt[ti]; + } + } + } + for (; i < end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto ti = static_cast(unpaired_bin_->data(idx)) << 1; + if (USE_HESSIAN) { + grad[ti] += ordered_gradients[i]; + hess[ti] += ordered_hessians[i]; + } else { + grad[ti] += ordered_gradients[i]; + ++cnt[ti]; + } + } +} + +template +void ConstructHistogramIntInner(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const { + data_size_t i = start; + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + const int16_t* gradients_ptr = reinterpret_cast(ordered_gradients); + const VAL_T* data_ptr_base = unpaired_bin_->data_.data(); + if (USE_PREFETCH) { + const data_size_t pf_offset = 64 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + for (; i < pf_end; ++i) { + const auto paired_idx = USE_INDICES ? data_indices[i] : i; + const auto paired_pf_idx = + USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; + const auto idx = get_unpaired_index(paired_idx); + const auto pf_idx = get_unpaired_index(paired_pf_idx); + if (IS_4BIT) { + PREFETCH_T0(data_ptr_base + (pf_idx >> 1)); + } else { + PREFETCH_T0(data_ptr_base + pf_idx); + } + const auto ti = static_cast(unpaired_bin_->data(idx)); + const int16_t gradient_16 = gradients_ptr[i]; + if (USE_HESSIAN) { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[ti] += gradient_packed; + } else { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (1); + out_ptr[ti] += gradient_packed; + } + } + } + for (; i < end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto ti = static_cast(unpaired_bin_->data(idx)); + const int16_t gradient_16 = gradients_ptr[i]; + if (USE_HESSIAN) { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[ti] += gradient_packed; + } else { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (1); + out_ptr[ti] += gradient_packed; + } + } +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const { + ConstructHistogramInner( + data_indices, start, end, ordered_gradients, ordered_hessians, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const override { + ConstructHistogramInner( + nullptr, start, end, ordered_gradients, ordered_hessians, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, + ordered_gradients, nullptr, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramInner( + nullptr, start, end, ordered_gradients, nullptr, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE, bool MISS_IS_ZERO, bool MISS_IS_NA, bool MFB_IS_ZERO, + bool MFB_IS_NA, bool USE_MIN_BIN> +data_size_t PairwiseRankingBin, ITERATOR_TYPE>::SplitInner(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const { + auto th = static_cast(threshold + min_bin); + auto t_zero_bin = static_cast(min_bin + default_bin); + if (most_freq_bin == 0) { + --th; + --t_zero_bin; + } + const auto minb = static_cast(min_bin); + const auto maxb = static_cast(max_bin); + data_size_t lte_count = 0; + data_size_t gt_count = 0; + data_size_t* default_indices = gt_indices; + data_size_t* default_count = >_count; + data_size_t* missing_default_indices = gt_indices; + data_size_t* missing_default_count = >_count; + if (most_freq_bin <= threshold) { + default_indices = lte_indices; + default_count = <e_count; + } + if (MISS_IS_ZERO || MISS_IS_NA) { + if (default_left) { + missing_default_indices = lte_indices; + missing_default_count = <e_count; + } + } + if (min_bin < max_bin) { + for (data_size_t i = 0; i < cnt; ++i) { + const data_size_t idx = data_indices[i]; + const auto bin = data(idx); + if ((MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISS_IS_NA && !MFB_IS_NA && bin == maxb)) { + missing_default_indices[(*missing_default_count)++] = idx; + } else if ((USE_MIN_BIN && (bin < minb || bin > maxb)) || + (!USE_MIN_BIN && bin == 0)) { + if ((MISS_IS_NA && MFB_IS_NA) || (MISS_IS_ZERO && MFB_IS_ZERO)) { + missing_default_indices[(*missing_default_count)++] = idx; + } else { + default_indices[(*default_count)++] = idx; + } + } else if (bin > th) { + gt_indices[gt_count++] = idx; + } else { + lte_indices[lte_count++] = idx; + } + } + } else { + data_size_t* max_bin_indices = gt_indices; + data_size_t* max_bin_count = >_count; + if (maxb <= th) { + max_bin_indices = lte_indices; + max_bin_count = <e_count; + } + for (data_size_t i = 0; i < cnt; ++i) { + const data_size_t idx = data_indices[i]; + const auto bin = data(idx); + if (MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { + missing_default_indices[(*missing_default_count)++] = idx; + } else if (bin != maxb) { + if ((MISS_IS_NA && MFB_IS_NA) || (MISS_IS_ZERO && MFB_IS_ZERO)) { + missing_default_indices[(*missing_default_count)++] = idx; + } else { + default_indices[(*default_count)++] = idx; + } + } else { + if (MISS_IS_NA && !MFB_IS_NA) { + missing_default_indices[(*missing_default_count)++] = idx; + } else { + max_bin_indices[(*max_bin_count)++] = idx; + } + } + } + } + return lte_count; +} + +} // namespace LightGBM diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index 9db802e14e71..3fb69cb199b1 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -26,7 +26,7 @@ class PairwiseRankingFirstIterator: public BinIterator { PairwiseRankingFirstIterator(const BIN_TYPE* unpaired_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { unpaired_bin_ = unpaired_bin; - unpaired_bin_iterator_ = unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin); + unpaired_bin_iterator_.reset(unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin)); unpaired_bin_iterator_->Reset(0); paired_ranking_item_index_map_ = paired_ranking_item_index_map; prev_index_ = 0; @@ -63,7 +63,7 @@ class PairwiseRankingFirstIterator: public BinIterator { private: const BIN_TYPE* unpaired_bin_; - BinIterator* unpaired_bin_iterator_; + std::unique_ptr unpaired_bin_iterator_; const std::pair* paired_ranking_item_index_map_; data_size_t prev_index_; uint32_t prev_val_; @@ -76,7 +76,7 @@ class PairwiseRankingSecondIterator: public BinIterator { PairwiseRankingSecondIterator(const BIN_TYPE* unpaired_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { unpaired_bin_ = unpaired_bin; - unpaired_bin_iterator_ = unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin); + unpaired_bin_iterator_.reset(unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin)); unpaired_bin_iterator_->Reset(0); paired_ranking_item_index_map_ = paired_ranking_item_index_map; prev_index_ = 0; @@ -109,7 +109,7 @@ class PairwiseRankingSecondIterator: public BinIterator { private: const BIN_TYPE* unpaired_bin_; - BinIterator* unpaired_bin_iterator_; + std::unique_ptr unpaired_bin_iterator_; const std::pair* paired_ranking_item_index_map_; data_size_t prev_index_; }; @@ -121,11 +121,146 @@ class PairwiseRankingBin: public BIN_TYPE { num_data_ = num_data; } + virtual ~PairwiseRankingBin() {} + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { return new ITERATOR_TYPE(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); } + void InitStreaming(uint32_t num_thread, int32_t omp_max_threads) override; + + void Push(int tid, data_size_t idx, uint32_t value) override; + + void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override; + + void SaveBinaryToFile(BinaryWriter* writer) const override; + + void LoadFromMemory(const void* memory, + const std::vector& local_used_indices) override; + + size_t SizesInByte() const override; + + data_size_t num_data() const override; + + void* get_data() override; + + void ReSize(data_size_t num_data) override; + + void ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + data_size_t Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const override; + + data_size_t SplitCategorical( + uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin, + const uint32_t* threshold, int num_threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, data_size_t* gt_indices) const override; + + virtual data_size_t Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const = 0; + + virtual data_size_t SplitCategorical( + uint32_t max_bin, uint32_t most_freq_bin, const uint32_t* threshold, + int num_threshold, const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, data_size_t* gt_indices) const = 0; + + const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const override; + + const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; + protected: + template + void ConstructHistogramInner(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const; + + template + void ConstructHistogramIntInner(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + + template + data_size_t SplitInner(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + + virtual inline data_size_t get_unpaired_index(const data_size_t paired_index) = 0; + const std::pair* paired_ranking_item_index_map_; const std::unique_ptr unpaired_bin_; data_size_t num_data_; @@ -135,14 +270,25 @@ template class PairwiseRankingFirstBin: public PairwiseRankingBin { public: PairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): PairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + + private: + inline data_size_t get_unpaired_index(const data_size_t paired_index) { + return this->paired_ranking_item_index_map_[paired_index].first; + } }; template class PairwiseRankingSecondBin: public PairwiseRankingBin { public: PairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): PairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + + private: + inline data_size_t get_unpaired_index(const data_size_t paired_index) { + return this->paired_ranking_item_index_map_[paired_index].second; + } }; + } // namespace LightGBM #endif // LIGHTGBM_IO_PAIRWISE_LAMBDARANK_BIN_HPP_ From 6154bde56671cd74404344f901a6687cffe99a8a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 6 Dec 2023 13:07:50 +0000 Subject: [PATCH 07/68] instantiate templates --- include/LightGBM/bin.h | 24 +- .../LightGBM/pairwise_ranking_feature_group.h | 3 - src/io/bin.cpp | 48 +- src/io/pairwise_lambdarank_bin.cpp | 1138 ++++++++++++++++- src/io/pairwise_lambdarank_bin.hpp | 130 +- src/io/pairwise_ranking_feature_group.cpp | 41 +- src/io/template | Bin 0 -> 9016 bytes src/io/template.cpp | 26 + 8 files changed, 1269 insertions(+), 141 deletions(-) create mode 100755 src/io/template create mode 100644 src/io/template.cpp diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 2a2a798372bb..59ca758b5e2f 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -474,8 +474,7 @@ class Bin { * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair * \return The bin data object */ - template class PAIRWISE_BIN_TYPE> - static Bin* CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); + static Bin* CreateDensePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); /*! * \brief Create object for bin data of one feature, used for pairwise ranking, for an original sparse bin @@ -484,8 +483,25 @@ class Bin { * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair * \return The bin data object */ - template class PAIRWISE_BIN_TYPE> - static Bin* CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); + static Bin* CreateSparsePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); + + /*! + * \brief Create object for bin data of one feature, used for pairwise ranking, for an original dense bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \return The bin data object + */ + static Bin* CreateDensePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); + + /*! + * \brief Create object for bin data of one feature, used for pairwise ranking, for an original sparse bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \return The bin data object + */ + static Bin* CreateSparsePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); /*! * \brief Deep copy the bin diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h index e01f5027ea32..f61e6c5dbc45 100644 --- a/include/LightGBM/pairwise_ranking_feature_group.h +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -82,9 +82,6 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { } private: - template class PAIRWISE_BIN_TYPE> - void CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse); - void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override; /*! \brief Pairwise data index to original data indices for ranking with pairwise features */ diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 4d21532c742f..e2cfb50acfb7 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -633,37 +633,49 @@ namespace LightGBM { } } - template class PAIRWISE_BIN_TYPE> - Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { + Bin* Bin::CreateDensePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { if (num_bin <= 16) { - return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } else if (num_bin <= 256) { - return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } else if (num_bin <= 65536) { - return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } else { - return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } } - template class PAIRWISE_BIN_TYPE> - Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { - if (num_bin <= 256) { - return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + Bin* Bin::CreateDensePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { + if (num_bin <= 16) { + return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + } else if (num_bin <= 256) { + return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } else if (num_bin <= 65536) { - return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } else { - return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } } - template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); - - template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); - - template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); + Bin* Bin::CreateSparsePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { + if (num_bin <= 256) { + return new SparsePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + } else if (num_bin <= 65536) { + return new SparsePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + } else { + return new SparsePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + } + } - template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); + Bin* Bin::CreateSparsePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { + if (num_bin <= 256) { + return new SparsePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + } else if (num_bin <= 65536) { + return new SparsePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + } else { + return new SparsePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + } + } MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate, const std::vector& offsets) { diff --git a/src/io/pairwise_lambdarank_bin.cpp b/src/io/pairwise_lambdarank_bin.cpp index 92901dd17ee1..6d85179099c2 100644 --- a/src/io/pairwise_lambdarank_bin.cpp +++ b/src/io/pairwise_lambdarank_bin.cpp @@ -13,43 +13,165 @@ void PairwiseRankingBin::InitStreaming(uint32_t num_thr unpaired_bin_->InitStreaming(num_thread, omp_max_threads); } +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); + template class ITERATOR_TYPE> void PairwiseRankingBin::Push(int tid, data_size_t idx, uint32_t value) { unpaired_bin_->Push(tid, idx, value); } +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); + template class ITERATOR_TYPE> void PairwiseRankingBin::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) { unpaired_bin_->CopySubrow(full_bin, used_indices, num_used_indices); } +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); + template class ITERATOR_TYPE> void PairwiseRankingBin::SaveBinaryToFile(BinaryWriter* writer) const { unpaired_bin_->SaveBinaryToFile(writer); } +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; + template class ITERATOR_TYPE> void PairwiseRankingBin::LoadFromMemory(const void* memory, const std::vector& local_used_indices) { unpaired_bin_->LoadFromMemory(memory, local_used_indices); } +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); + template class ITERATOR_TYPE> -void PairwiseRankingBin::SizesInByte() const { +size_t PairwiseRankingBin::SizesInByte() const { return unpaired_bin_->SizesInByte(); } +template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; +template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; + template class ITERATOR_TYPE> -void PairwiseRankingBin::num_data() const { +data_size_t PairwiseRankingBin::num_data() const { return unpaired_bin_->num_data(); } +template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; +template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; + template class ITERATOR_TYPE> void PairwiseRankingBin::ReSize(data_size_t num_data) { return unpaired_bin_->ReSize(num_data); } +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); +template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInner(const data_size_t* data_indices, +template +void DensePairwiseRankingBin::ConstructHistogramInner( + const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, @@ -58,21 +180,22 @@ void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHisto hist_t* grad = out; hist_t* hess = out + 1; hist_cnt_t* cnt = reinterpret_cast(hess); + const VAL_T* base_data_ptr = reinterpret_cast(this->unpaired_bin_->get_data()); if (USE_PREFETCH) { const data_size_t pf_offset = 64 / sizeof(VAL_T); const data_size_t pf_end = end - pf_offset; for (; i < pf_end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; - const auto idx = get_unpaired_index(paired_idx); + const auto idx = this->get_unpaired_index(paired_idx); const auto paired_pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; - const auto pf_idx = get_unpaired_index(paired_pf_idx); + const auto pf_idx = this->get_unpaired_index(paired_pf_idx); if (IS_4BIT) { - PREFETCH_T0(unpaired_bin_->data_.data() + (pf_idx >> 1)); + PREFETCH_T0(base_data_ptr + (pf_idx >> 1)); } else { - PREFETCH_T0(unpaired_bin_->data_.data() + pf_idx); + PREFETCH_T0(base_data_ptr + pf_idx); } - const auto ti = static_cast(unpaired_bin_->data(idx)) << 1; + const auto ti = static_cast(this->unpaired_bin_->data(idx)) << 1; if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; @@ -83,8 +206,9 @@ void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHisto } } for (; i < end; ++i) { - const auto idx = USE_INDICES ? data_indices[i] : i; - const auto ti = static_cast(unpaired_bin_->data(idx)) << 1; + const auto paired_idx = USE_INDICES ? data_indices[i] : i; + const auto idx = this->get_unpaired_index(paired_idx); + const auto ti = static_cast(this->unpaired_bin_->data(idx)) << 1; if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; @@ -95,15 +219,17 @@ void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHisto } } +template class ITERATOR_TYPE> template -void ConstructHistogramIntInner(const data_size_t* data_indices, - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const { +void DensePairwiseRankingBin::ConstructHistogramIntInner( + const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const { data_size_t i = start; PACKED_HIST_T* out_ptr = reinterpret_cast(out); const int16_t* gradients_ptr = reinterpret_cast(ordered_gradients); - const VAL_T* data_ptr_base = unpaired_bin_->data_.data(); + const VAL_T* data_ptr_base = reinterpret_cast(this->unpaired_bin_->get_data()); if (USE_PREFETCH) { const data_size_t pf_offset = 64 / sizeof(VAL_T); const data_size_t pf_end = end - pf_offset; @@ -111,14 +237,14 @@ void ConstructHistogramIntInner(const data_size_t* data_indices, const auto paired_idx = USE_INDICES ? data_indices[i] : i; const auto paired_pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; - const auto idx = get_unpaired_index(paired_idx); - const auto pf_idx = get_unpaired_index(paired_pf_idx); + const auto idx = this->get_unpaired_index(paired_idx); + const auto pf_idx = this->get_unpaired_index(paired_pf_idx); if (IS_4BIT) { PREFETCH_T0(data_ptr_base + (pf_idx >> 1)); } else { PREFETCH_T0(data_ptr_base + pf_idx); } - const auto ti = static_cast(unpaired_bin_->data(idx)); + const auto ti = static_cast(this->unpaired_bin_->data(idx)); const int16_t gradient_16 = gradients_ptr[i]; if (USE_HESSIAN) { const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : @@ -132,8 +258,9 @@ void ConstructHistogramIntInner(const data_size_t* data_indices, } } for (; i < end; ++i) { - const auto idx = USE_INDICES ? data_indices[i] : i; - const auto ti = static_cast(unpaired_bin_->data(idx)); + const auto paired_idx = USE_INDICES ? data_indices[i] : i; + const auto idx = this->get_unpaired_index(paired_idx); + const auto ti = static_cast(this->unpaired_bin_->data(idx)); const int16_t gradient_16 = gradients_ptr[i]; if (USE_HESSIAN) { const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : @@ -148,7 +275,7 @@ void ConstructHistogramIntInner(const data_size_t* data_indices, } template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogram( +void DensePairwiseRankingBin::ConstructHistogram( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const { @@ -156,150 +283,848 @@ void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHisto data_indices, start, end, ordered_gradients, ordered_hessians, out); } +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::(data_size_t start, data_size_t end, +void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramInner( nullptr, start, end, ordered_gradients, ordered_hessians, out); } +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogram( +void DensePairwiseRankingBin::ConstructHistogram( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramInner(data_indices, start, end, ordered_gradients, nullptr, out); } +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogram( +void DensePairwiseRankingBin::ConstructHistogram( data_size_t start, data_size_t end, const score_t* ordered_gradients, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramInner( nullptr, start, end, ordered_gradients, nullptr, out); } +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt8( +void DensePairwiseRankingBin::ConstructHistogramInt8( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* /*ordered_hessians*/, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( data_indices, start, end, ordered_gradients, out); } +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt8( +void DensePairwiseRankingBin::ConstructHistogramInt8( data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* /*ordered_hessians*/, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( nullptr, start, end, ordered_gradients, out); } +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt8( +void DensePairwiseRankingBin::ConstructHistogramInt8( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( data_indices, start, end, ordered_gradients, out); } +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt8( +void DensePairwiseRankingBin::ConstructHistogramInt8( data_size_t start, data_size_t end, const score_t* ordered_gradients, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( nullptr, start, end, ordered_gradients, out); } +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt16( +void DensePairwiseRankingBin::ConstructHistogramInt16( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* /*ordered_hessians*/, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( data_indices, start, end, ordered_gradients, out); } +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt16( +void DensePairwiseRankingBin::ConstructHistogramInt16( data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* /*ordered_hessians*/, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( nullptr, start, end, ordered_gradients, out); } +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt16( +void DensePairwiseRankingBin::ConstructHistogramInt16( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( data_indices, start, end, ordered_gradients, out); } +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt16( +void DensePairwiseRankingBin::ConstructHistogramInt16( data_size_t start, data_size_t end, const score_t* ordered_gradients, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( nullptr, start, end, ordered_gradients, out); } +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt32( +void DensePairwiseRankingBin::ConstructHistogramInt32( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* /*ordered_hessians*/, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( data_indices, start, end, ordered_gradients, out); } +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt32( +void DensePairwiseRankingBin::ConstructHistogramInt32( data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* /*ordered_hessians*/, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( nullptr, start, end, ordered_gradients, out); } +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt32( +void DensePairwiseRankingBin::ConstructHistogramInt32( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( data_indices, start, end, ordered_gradients, out); } +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const; + template class ITERATOR_TYPE> -void PairwiseRankingBin, ITERATOR_TYPE>::ConstructHistogramInt32( +void DensePairwiseRankingBin::ConstructHistogramInt32( data_size_t start, data_size_t end, const score_t* ordered_gradients, - hist_t* out) const override { + hist_t* out) const { ConstructHistogramIntInner( nullptr, start, end, ordered_gradients, out); } -template class ITERATOR_TYPE, bool MISS_IS_ZERO, bool MISS_IS_NA, bool MFB_IS_ZERO, +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + +template class ITERATOR_TYPE> +template -data_size_t PairwiseRankingBin, ITERATOR_TYPE>::SplitInner(uint32_t min_bin, uint32_t max_bin, +data_size_t DensePairwiseRankingBin::SplitInner(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t most_freq_bin, bool default_left, uint32_t threshold, const data_size_t* data_indices, data_size_t cnt, @@ -331,22 +1156,23 @@ data_size_t PairwiseRankingBin, ITERATOR_TYPE>::SplitIn } if (min_bin < max_bin) { for (data_size_t i = 0; i < cnt; ++i) { - const data_size_t idx = data_indices[i]; - const auto bin = data(idx); + const data_size_t paired_idx = data_indices[i]; + const data_size_t idx = this->get_unpaired_index(paired_idx); + const auto bin = this->unpaired_bin_->data(idx); if ((MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || (MISS_IS_NA && !MFB_IS_NA && bin == maxb)) { - missing_default_indices[(*missing_default_count)++] = idx; + missing_default_indices[(*missing_default_count)++] = paired_idx; } else if ((USE_MIN_BIN && (bin < minb || bin > maxb)) || (!USE_MIN_BIN && bin == 0)) { if ((MISS_IS_NA && MFB_IS_NA) || (MISS_IS_ZERO && MFB_IS_ZERO)) { - missing_default_indices[(*missing_default_count)++] = idx; + missing_default_indices[(*missing_default_count)++] = paired_idx; } else { - default_indices[(*default_count)++] = idx; + default_indices[(*default_count)++] = paired_idx; } } else if (bin > th) { - gt_indices[gt_count++] = idx; + gt_indices[gt_count++] = paired_idx; } else { - lte_indices[lte_count++] = idx; + lte_indices[lte_count++] = paired_idx; } } } else { @@ -357,21 +1183,22 @@ data_size_t PairwiseRankingBin, ITERATOR_TYPE>::SplitIn max_bin_count = <e_count; } for (data_size_t i = 0; i < cnt; ++i) { - const data_size_t idx = data_indices[i]; - const auto bin = data(idx); + const data_size_t paired_idx = data_indices[i]; + const data_size_t idx = this->get_unpaired_index(paired_idx); + const auto bin = this->unpaired_bin_->data(idx); if (MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { - missing_default_indices[(*missing_default_count)++] = idx; + missing_default_indices[(*missing_default_count)++] = paired_idx; } else if (bin != maxb) { if ((MISS_IS_NA && MFB_IS_NA) || (MISS_IS_ZERO && MFB_IS_ZERO)) { - missing_default_indices[(*missing_default_count)++] = idx; + missing_default_indices[(*missing_default_count)++] = paired_idx; } else { - default_indices[(*default_count)++] = idx; + default_indices[(*default_count)++] = paired_idx; } } else { if (MISS_IS_NA && !MFB_IS_NA) { - missing_default_indices[(*missing_default_count)++] = idx; + missing_default_indices[(*missing_default_count)++] = paired_idx; } else { - max_bin_indices[(*max_bin_count)++] = idx; + max_bin_indices[(*max_bin_count)++] = paired_idx; } } } @@ -379,4 +1206,181 @@ data_size_t PairwiseRankingBin, ITERATOR_TYPE>::SplitIn return lte_count; } +template class ITERATOR_TYPE> +data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const { + #define ARGUMENTS \ + min_bin, max_bin, default_bin, most_freq_bin, default_left, threshold, \ + data_indices, cnt, lte_indices, gt_indices + if (missing_type == MissingType::None) { + return SplitInner(ARGUMENTS); + } else if (missing_type == MissingType::Zero) { + if (default_bin == most_freq_bin) { + return SplitInner(ARGUMENTS); + } else { + return SplitInner(ARGUMENTS); + } + } else { + if (max_bin == most_freq_bin + min_bin && most_freq_bin > 0) { + return SplitInner(ARGUMENTS); + } else { + return SplitInner(ARGUMENTS); + } + } +#undef ARGUMENTS +} + +template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template class ITERATOR_TYPE> +data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const { +#define ARGUMENTS \ + 1, max_bin, default_bin, most_freq_bin, default_left, threshold, \ + data_indices, cnt, lte_indices, gt_indices + if (missing_type == MissingType::None) { + return SplitInner(ARGUMENTS); + } else if (missing_type == MissingType::Zero) { + if (default_bin == most_freq_bin) { + return SplitInner(ARGUMENTS); + } else { + return SplitInner(ARGUMENTS); + } + } else { + if (max_bin == most_freq_bin + 1 && most_freq_bin > 0) { + return SplitInner(ARGUMENTS); + } else { + return SplitInner(ARGUMENTS); + } + } +#undef ARGUMENTS +} + +template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + +template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + } // namespace LightGBM diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index 3fb69cb199b1..646369b8373e 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -11,6 +11,9 @@ #include +#include "dense_bin.hpp" +#include "sparse_bin.hpp" + namespace LightGBM { template @@ -117,7 +120,7 @@ class PairwiseRankingSecondIterator: public BinIterator { template class ITERATOR_TYPE> class PairwiseRankingBin: public BIN_TYPE { public: - PairwiseRankingBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): BIN_TYPE(0), paired_ranking_item_index_map_(paired_ranking_item_index_map), unpaired_bin_(unpaired_bin) { + PairwiseRankingBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, BIN_TYPE* unpaired_bin): BIN_TYPE(0), paired_ranking_item_index_map_(paired_ranking_item_index_map), unpaired_bin_(unpaired_bin) { num_data_ = num_data; } @@ -142,10 +145,68 @@ class PairwiseRankingBin: public BIN_TYPE { data_size_t num_data() const override; - void* get_data() override; + void* get_data() override { + return unpaired_bin_->get_data(); + } void ReSize(data_size_t num_data) override; + data_size_t Split(uint32_t /*min_bin*/, uint32_t /*max_bin*/, + uint32_t /*default_bin*/, uint32_t /*most_freq_bin*/, + MissingType /*missing_type*/, bool /*default_left*/, + uint32_t /*threshold*/, const data_size_t* /*data_indices*/, + data_size_t /*cnt*/, + data_size_t* /*lte_indices*/, + data_size_t* /*gt_indices*/) const override { + Log::Fatal("Not implemented."); + } + + data_size_t SplitCategorical( + uint32_t /*min_bin*/, uint32_t /*max_bin*/, uint32_t /*most_freq_bin*/, + const uint32_t* /*threshold*/, int /*num_threshold*/, + const data_size_t* /*data_indices*/, data_size_t /*cnt*/, + data_size_t* /*lte_indices*/, data_size_t* /*gt_indices*/) const override { + Log::Fatal("Not implemented."); + } + + data_size_t Split(uint32_t /*max_bin*/, uint32_t /*default_bin*/, + uint32_t /*most_freq_bin*/, MissingType /*missing_type*/, + bool /*default_left*/, uint32_t /*threshold*/, + const data_size_t* /*data_indices*/, data_size_t /*cnt*/, + data_size_t* /*lte_indices*/, + data_size_t* /*gt_indices*/) const override { + Log::Fatal("Not implemented."); + } + + data_size_t SplitCategorical( + uint32_t /*max_bin*/, uint32_t /*most_freq_bin*/, const uint32_t* /*threshold*/, + int /*num_threshold*/, const data_size_t* /*data_indices*/, data_size_t /*cnt*/, + data_size_t* /*lte_indices*/, data_size_t* /*gt_indices*/) const override { + Log::Fatal("Not implemented."); + } + + const void* GetColWiseData(uint8_t* /*bit_type*/, bool* /*is_sparse*/, std::vector* /*bin_iterator*/, const int /*num_threads*/) const override { + Log::Fatal("Not implemented."); + } + + const void* GetColWiseData(uint8_t* /*bit_type*/, bool* /*is_sparse*/, BinIterator** /*bin_iterator*/) const override { + Log::Fatal("Not implemented."); + } + + protected: + + virtual data_size_t get_unpaired_index(const data_size_t paired_index) const = 0; + + const std::pair* paired_ranking_item_index_map_; + const std::unique_ptr unpaired_bin_; + data_size_t num_data_; +}; + +template class ITERATOR_TYPE> +class DensePairwiseRankingBin: public PairwiseRankingBin, ITERATOR_TYPE> { + public: + DensePairwiseRankingBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, DenseBin* unpaired_bin): PairwiseRankingBin, ITERATOR_TYPE>(num_data, paired_ranking_item_index_map, unpaired_bin) {} + void ConstructHistogram( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, @@ -214,29 +275,14 @@ class PairwiseRankingBin: public BIN_TYPE { data_size_t* lte_indices, data_size_t* gt_indices) const override; - data_size_t SplitCategorical( - uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin, - const uint32_t* threshold, int num_threshold, - const data_size_t* data_indices, data_size_t cnt, - data_size_t* lte_indices, data_size_t* gt_indices) const override; - - virtual data_size_t Split(uint32_t max_bin, uint32_t default_bin, + data_size_t Split(uint32_t max_bin, uint32_t default_bin, uint32_t most_freq_bin, MissingType missing_type, bool default_left, uint32_t threshold, const data_size_t* data_indices, data_size_t cnt, data_size_t* lte_indices, - data_size_t* gt_indices) const = 0; - - virtual data_size_t SplitCategorical( - uint32_t max_bin, uint32_t most_freq_bin, const uint32_t* threshold, - int num_threshold, const data_size_t* data_indices, data_size_t cnt, - data_size_t* lte_indices, data_size_t* gt_indices) const = 0; + data_size_t* gt_indices) const override; - const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const override; - - const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; - - protected: + private: template void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, @@ -258,32 +304,50 @@ class PairwiseRankingBin: public BIN_TYPE { const data_size_t* data_indices, data_size_t cnt, data_size_t* lte_indices, data_size_t* gt_indices) const; +}; - virtual inline data_size_t get_unpaired_index(const data_size_t paired_index) = 0; +template class ITERATOR_TYPE> +class SparsePairwiseRankingBin: public PairwiseRankingBin, ITERATOR_TYPE> { + public: + SparsePairwiseRankingBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin): PairwiseRankingBin, ITERATOR_TYPE>(num_data, paired_ranking_item_index_map, unpaired_bin) {} +}; - const std::pair* paired_ranking_item_index_map_; - const std::unique_ptr unpaired_bin_; - data_size_t num_data_; +template +class DensePairwiseRankingFirstBin: public DensePairwiseRankingBin { + public: + DensePairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, DenseBin* unpaired_bin): DensePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + private: + data_size_t get_unpaired_index(const data_size_t paired_index) const { + return this->paired_ranking_item_index_map_[paired_index].first; + } }; -template -class PairwiseRankingFirstBin: public PairwiseRankingBin { +template +class DensePairwiseRankingSecondBin: public DensePairwiseRankingBin { public: - PairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): PairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + DensePairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, DenseBin* unpaired_bin): DensePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + private: + data_size_t get_unpaired_index(const data_size_t paired_index) const { + return this->paired_ranking_item_index_map_[paired_index].second; + } +}; +template +class SparsePairwiseRankingFirstBin: public SparsePairwiseRankingBin { + public: + SparsePairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin): SparsePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} private: - inline data_size_t get_unpaired_index(const data_size_t paired_index) { + data_size_t get_unpaired_index(const data_size_t paired_index) const { return this->paired_ranking_item_index_map_[paired_index].first; } }; -template -class PairwiseRankingSecondBin: public PairwiseRankingBin { +template +class SparsePairwiseRankingSecondBin: public SparsePairwiseRankingBin { public: - PairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): PairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} - + SparsePairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin): SparsePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} private: - inline data_size_t get_unpaired_index(const data_size_t paired_index) { + data_size_t get_unpaired_index(const data_size_t paired_index) const { return this->paired_ranking_item_index_map_[paired_index].second; } }; diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index e0634fe2a9c5..074f6b8e6680 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -9,19 +9,28 @@ namespace LightGBM { -template class PAIRWISE_BIN_TYPE> -void PairwiseRankingFeatureGroup::CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { +void PairwiseRankingFeatureGroup::CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { CHECK(!is_multi_val); // do not support multi-value bin for now if (is_multi_val) { multi_bin_data_.clear(); for (int i = 0; i < num_feature_; ++i) { int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1; if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { - multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingBin( - num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + if (is_first_or_second_in_pairing_ == 0) { + multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingFirstBin( + num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + } else { + multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingSecondBin( + num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + } } else { - multi_bin_data_.emplace_back( - Bin::CreateDensePairwiseRankingBin(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + if (is_first_or_second_in_pairing_ == 0) { + multi_bin_data_.emplace_back( + Bin::CreateDensePairwiseRankingFirstBin(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + } else { + multi_bin_data_.emplace_back( + Bin::CreateDensePairwiseRankingSecondBin(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + } } } is_multi_val_ = true; @@ -30,21 +39,21 @@ void PairwiseRankingFeatureGroup::CreateBinDataInner(int num_data, bool is_multi (!force_dense && num_feature_ == 1 && bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { is_sparse_ = true; - bin_data_.reset(Bin::CreateSparsePairwiseRankingBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); + if (is_first_or_second_in_pairing_) { + bin_data_.reset(Bin::CreateSparsePairwiseRankingFirstBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); + } else { + bin_data_.reset(Bin::CreateSparsePairwiseRankingSecondBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); + } } else { is_sparse_ = false; - bin_data_.reset(Bin::CreateDensePairwiseRankingBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); + if (is_first_or_second_in_pairing_) { + bin_data_.reset(Bin::CreateDensePairwiseRankingFirstBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); + } else { + bin_data_.reset(Bin::CreateDensePairwiseRankingSecondBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); + } } is_multi_val_ = false; } } -void PairwiseRankingFeatureGroup::CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { - if (is_first_or_second_in_pairing_ == 0) { - CreateBinDataInner(num_data, is_multi_val, force_dense, force_sparse); - } else { - CreateBinDataInner(num_data, is_multi_val, force_dense, force_sparse); - } -} - } // namespace LightGBM diff --git a/src/io/template b/src/io/template new file mode 100755 index 0000000000000000000000000000000000000000..4dbad49ebf15fb21ba001d0dde76a0d16aa57c4e GIT binary patch literal 9016 zcmeHNZ){uD6~B&~CeYMzLj&R8YFIn9pt^CJw&}V;oW$vKNkd3d=%C@@*iQ3?V@LKg zBxq%wQvP@)T{NMoABu!Drgi(&ZeQp&jX+p85Yoz~t!V3{ZlfqgLknVbVJr21=e~QK z=VyBZw2!;e^SS4D&pG$pbN{?|Px7@;cUP6mCAhf79fCMtt%bCyhJ_nd1*A>1imUK@ zv$#nt1;0dMR(qR8&}x+hWtyonE8J)%z1AvA&Pi{wVDw-W_K;|`!pf1obr#ALBar~< zk*{ORB@6146{gQCKjz3WS$y)BVA6Y9={>FVm)iuA^7E4xuM+H2{Z)#~wDO-R&%=g&IKS ztl!e&&!qet<+#Z{$?ojgBYUE-a;V7Xm3RoF_Pgdzy}#A_M0($IH#auyx{yAxZ09kO zrI7Qy3E`D>H4UQiXk7=IE0*n0ytPa#dDBn^_d~uMeyW1}@e26O73{oUL4Fl*A0B7f zRlyF$)#nrjzr6xJ4ZPgAu7dp)qDi!QeULzO!OMbq&6u(1?YGlhZV|e^e>9cUGiD@h z>blUw1H1KDJRRR}WXyPaV0T9%m5dKWh7xhBt$fS<{box%8B2tt{bnE#9f_oMGaWI^ zOgI_}^#R*76v-G-J(V%j@yKX+Kp$wh0?5Pxb2|0R3hn6aHlOqXV$MS?|3oTCRf*s{{BPO=V4ih(tLUh(q*YJv#ERJ{&O; zh)#4oqDRd5xM3=0*hoea#$yPFPKqjWqY)!1c6Nu`JNE9?H~Cwvy1vm?+vwkn38=!Z zpvR3}w76%8w~7n9fToJqfw}HmIC9)#pPD}xw68&(+NrEG3~ZStVnkut8;q-{m$Z`S z^N-IhA6lu5t8S+L=CHk|hXy<7N zpGR5qYvL)?7tcukb>b<+7iT2@3h^|Qizg)iW8!HD7oV5>55W)9u^~&~y>RZ$2esUv zwaK^Vdk4Cws&~PbHnsdW=t7&?@eZW&jo*gO-|JU(O6U}rJ=#?DNl1zLXq8|P3}0m6tz9%Pn~vn$JJ1uTaF>mY4|x^9fX{V+&M^&eJ~4? z7!QdLUL1x5)i5jGn09!{nzazoa-oI5j5ZbehnD-r+ysypNmDz#XCbC-S}A4B>bY-R z$>(32W$oO7%em~rng87x_w8>eQw(XDEcWLcDFc{ny91K1f)Muv`d;DM)o{~Me zy6KF-@*2Sl1|x9Y=-%E7O8b2!x;h4Gg1!oR2s8nD1W`E$`UdDth)5&i@g>mPKrfR# zD4mUOgT4St-yKw3kM#-HxYu=KT}|z@tHw(>eQSDQ_jVFcrM$cltIXj|*r!;Fc8~X) zRqdXpBkqvLH?^eQv-X*4&C~qkQq9vkS+m>ImiDv;JY;#D}5_5U3y+$S9-Ppg6dPSJN1y`<<;4unJR)3W@%wzH$-i@vpchO$XB>)Yzz z>~CJbDJ!|a229 zr;vXsmaiAQ--~gtnC1;>YjER+$9V)BjJP$3`LgoY6wZ%g`IUmtjbgkJc@du-l6AMZ zR`5QzH@az8t9wE*zIYEU#`#oRL}|M(5%1&sr?}AhScSOtI^yY3__PDRMd9Zi_*UR9 zoFcs|e5g^~DqXT(8i!Hf)GwdwN?g$Api7j-^C<9g{(o1>d&RSAUbx>Gi8qOR9RBcmw*q&C%~Ei7(1=&4qqfR9z?dJZ1b2yZ{?S>HhUVzFdChjKs07)cWLa(zO-j zM=RimE8q>%pVIT{dypspo$>q;;brnguLGy}JNI=M@$o`_uUfYphqF??sl@(YfR~Fy zt@P6)ylP*>m7yl!J~nEjElU2Nl80GYIwfusRMlt&JLwAeGZpNJXxhw}+2LV-R4nGl zbaPaXQid#pbWkj%?@y$LA_+ZarqUTbk{uV()aY0uZpLH&ty{LWmKV~IQZn>NIvttN z<4H3;A%@eD(YPMVj*d=1#m4E7G>b&dI66Rr${!sYlR2{Pz_u|X9*sY0Wa4@>l|;rZ z8%6R8>9g)uT{lu>0&Gh_r`(l^tQ;W*CAGjI7Z^##kOH*yh;KFwq3`UzyFJ(qmF$on z)@8a&*Sq?HyF+@Yr;{>=I#RuQNK>HJ*(dbQFZTp@hda<#={Bc}|f zQ=LL+CXy zXI$*4@Mk7Q&Bzd_nYQQ%*Nh}`+hf9?Okv{$+r#V4$bQA`PiFl?Sp$FfFk-?_+(;xd zBK)z5BB~0O$yUvfQe}4)rGRpXsQSql!a!vc= zBWkOU#9)LuOV!%*mIl|+LuAy5!YA4ieoUnBURRaggYJg&-V+atgo(r zg?l8M?Lmd^ysST?DolGRI4E>|pelc6^8NH)V0e`j^!dKRl<(CLkwuwfRy>Fr-C4PR zzSl72__O~k$Mj+7(;b+3zW*@oRXvgaB*XqISUiRb#gz36_at>+@+pV8f7a*mA5!}5 z${^pTn2sx)^smL^)IS1@x=`H}?p=KU;`{+bWFcSeB?rc?&-XK?57~uL_bQ(0v*7Le ze2-(w{<0b7nLg*x_o)V^LxsRv7Pw!w_Z^2m-wTz--g?Xlb0!C|(_4yvm|0lp3gV!hPv0ro#r1ewKulIq&{~3@|D7^kH zuC)Jm(O}mve1EGBxF4rJ^Yf^XPuzc-x<7f9ViOs+l{QGb{~HLL BolXD% literal 0 HcmV?d00001 diff --git a/src/io/template.cpp b/src/io/template.cpp new file mode 100644 index 000000000000..575008a2c3e0 --- /dev/null +++ b/src/io/template.cpp @@ -0,0 +1,26 @@ +#include + +//template +class BB { + +}; + +template +class C { + public: + template + B a(); +}; + +template +template +B C::a() {} + +int main() { + C c = C(); + + c.a(); + + return 0; + +} \ No newline at end of file From 3a646eb17e6ef307be7aea35ddc9baa640426d2a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 6 Dec 2023 13:09:14 +0000 Subject: [PATCH 08/68] remove unrelated files --- src/io/template | Bin 9016 -> 0 bytes src/io/template.cpp | 26 -------------------------- 2 files changed, 26 deletions(-) delete mode 100755 src/io/template delete mode 100644 src/io/template.cpp diff --git a/src/io/template b/src/io/template deleted file mode 100755 index 4dbad49ebf15fb21ba001d0dde76a0d16aa57c4e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9016 zcmeHNZ){uD6~B&~CeYMzLj&R8YFIn9pt^CJw&}V;oW$vKNkd3d=%C@@*iQ3?V@LKg zBxq%wQvP@)T{NMoABu!Drgi(&ZeQp&jX+p85Yoz~t!V3{ZlfqgLknVbVJr21=e~QK z=VyBZw2!;e^SS4D&pG$pbN{?|Px7@;cUP6mCAhf79fCMtt%bCyhJ_nd1*A>1imUK@ zv$#nt1;0dMR(qR8&}x+hWtyonE8J)%z1AvA&Pi{wVDw-W_K;|`!pf1obr#ALBar~< zk*{ORB@6146{gQCKjz3WS$y)BVA6Y9={>FVm)iuA^7E4xuM+H2{Z)#~wDO-R&%=g&IKS ztl!e&&!qet<+#Z{$?ojgBYUE-a;V7Xm3RoF_Pgdzy}#A_M0($IH#auyx{yAxZ09kO zrI7Qy3E`D>H4UQiXk7=IE0*n0ytPa#dDBn^_d~uMeyW1}@e26O73{oUL4Fl*A0B7f zRlyF$)#nrjzr6xJ4ZPgAu7dp)qDi!QeULzO!OMbq&6u(1?YGlhZV|e^e>9cUGiD@h z>blUw1H1KDJRRR}WXyPaV0T9%m5dKWh7xhBt$fS<{box%8B2tt{bnE#9f_oMGaWI^ zOgI_}^#R*76v-G-J(V%j@yKX+Kp$wh0?5Pxb2|0R3hn6aHlOqXV$MS?|3oTCRf*s{{BPO=V4ih(tLUh(q*YJv#ERJ{&O; zh)#4oqDRd5xM3=0*hoea#$yPFPKqjWqY)!1c6Nu`JNE9?H~Cwvy1vm?+vwkn38=!Z zpvR3}w76%8w~7n9fToJqfw}HmIC9)#pPD}xw68&(+NrEG3~ZStVnkut8;q-{m$Z`S z^N-IhA6lu5t8S+L=CHk|hXy<7N zpGR5qYvL)?7tcukb>b<+7iT2@3h^|Qizg)iW8!HD7oV5>55W)9u^~&~y>RZ$2esUv zwaK^Vdk4Cws&~PbHnsdW=t7&?@eZW&jo*gO-|JU(O6U}rJ=#?DNl1zLXq8|P3}0m6tz9%Pn~vn$JJ1uTaF>mY4|x^9fX{V+&M^&eJ~4? z7!QdLUL1x5)i5jGn09!{nzazoa-oI5j5ZbehnD-r+ysypNmDz#XCbC-S}A4B>bY-R z$>(32W$oO7%em~rng87x_w8>eQw(XDEcWLcDFc{ny91K1f)Muv`d;DM)o{~Me zy6KF-@*2Sl1|x9Y=-%E7O8b2!x;h4Gg1!oR2s8nD1W`E$`UdDth)5&i@g>mPKrfR# zD4mUOgT4St-yKw3kM#-HxYu=KT}|z@tHw(>eQSDQ_jVFcrM$cltIXj|*r!;Fc8~X) zRqdXpBkqvLH?^eQv-X*4&C~qkQq9vkS+m>ImiDv;JY;#D}5_5U3y+$S9-Ppg6dPSJN1y`<<;4unJR)3W@%wzH$-i@vpchO$XB>)Yzz z>~CJbDJ!|a229 zr;vXsmaiAQ--~gtnC1;>YjER+$9V)BjJP$3`LgoY6wZ%g`IUmtjbgkJc@du-l6AMZ zR`5QzH@az8t9wE*zIYEU#`#oRL}|M(5%1&sr?}AhScSOtI^yY3__PDRMd9Zi_*UR9 zoFcs|e5g^~DqXT(8i!Hf)GwdwN?g$Api7j-^C<9g{(o1>d&RSAUbx>Gi8qOR9RBcmw*q&C%~Ei7(1=&4qqfR9z?dJZ1b2yZ{?S>HhUVzFdChjKs07)cWLa(zO-j zM=RimE8q>%pVIT{dypspo$>q;;brnguLGy}JNI=M@$o`_uUfYphqF??sl@(YfR~Fy zt@P6)ylP*>m7yl!J~nEjElU2Nl80GYIwfusRMlt&JLwAeGZpNJXxhw}+2LV-R4nGl zbaPaXQid#pbWkj%?@y$LA_+ZarqUTbk{uV()aY0uZpLH&ty{LWmKV~IQZn>NIvttN z<4H3;A%@eD(YPMVj*d=1#m4E7G>b&dI66Rr${!sYlR2{Pz_u|X9*sY0Wa4@>l|;rZ z8%6R8>9g)uT{lu>0&Gh_r`(l^tQ;W*CAGjI7Z^##kOH*yh;KFwq3`UzyFJ(qmF$on z)@8a&*Sq?HyF+@Yr;{>=I#RuQNK>HJ*(dbQFZTp@hda<#={Bc}|f zQ=LL+CXy zXI$*4@Mk7Q&Bzd_nYQQ%*Nh}`+hf9?Okv{$+r#V4$bQA`PiFl?Sp$FfFk-?_+(;xd zBK)z5BB~0O$yUvfQe}4)rGRpXsQSql!a!vc= zBWkOU#9)LuOV!%*mIl|+LuAy5!YA4ieoUnBURRaggYJg&-V+atgo(r zg?l8M?Lmd^ysST?DolGRI4E>|pelc6^8NH)V0e`j^!dKRl<(CLkwuwfRy>Fr-C4PR zzSl72__O~k$Mj+7(;b+3zW*@oRXvgaB*XqISUiRb#gz36_at>+@+pV8f7a*mA5!}5 z${^pTn2sx)^smL^)IS1@x=`H}?p=KU;`{+bWFcSeB?rc?&-XK?57~uL_bQ(0v*7Le ze2-(w{<0b7nLg*x_o)V^LxsRv7Pw!w_Z^2m-wTz--g?Xlb0!C|(_4yvm|0lp3gV!hPv0ro#r1ewKulIq&{~3@|D7^kH zuC)Jm(O}mve1EGBxF4rJ^Yf^XPuzc-x<7f9ViOs+l{QGb{~HLL BolXD% diff --git a/src/io/template.cpp b/src/io/template.cpp deleted file mode 100644 index 575008a2c3e0..000000000000 --- a/src/io/template.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include - -//template -class BB { - -}; - -template -class C { - public: - template - B a(); -}; - -template -template -B C::a() {} - -int main() { - C c = C(); - - c.a(); - - return 0; - -} \ No newline at end of file From 9e77ab9e88eed45a004afd6cd9548a0098c80390 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 7 Dec 2023 07:25:56 +0000 Subject: [PATCH 09/68] add return values for unimplemented methods --- src/io/pairwise_lambdarank_bin.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index 646369b8373e..fed360b7f579 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -159,6 +159,7 @@ class PairwiseRankingBin: public BIN_TYPE { data_size_t* /*lte_indices*/, data_size_t* /*gt_indices*/) const override { Log::Fatal("Not implemented."); + return 0; } data_size_t SplitCategorical( @@ -167,6 +168,7 @@ class PairwiseRankingBin: public BIN_TYPE { const data_size_t* /*data_indices*/, data_size_t /*cnt*/, data_size_t* /*lte_indices*/, data_size_t* /*gt_indices*/) const override { Log::Fatal("Not implemented."); + return 0; } data_size_t Split(uint32_t /*max_bin*/, uint32_t /*default_bin*/, @@ -176,6 +178,7 @@ class PairwiseRankingBin: public BIN_TYPE { data_size_t* /*lte_indices*/, data_size_t* /*gt_indices*/) const override { Log::Fatal("Not implemented."); + return 0; } data_size_t SplitCategorical( @@ -183,14 +186,17 @@ class PairwiseRankingBin: public BIN_TYPE { int /*num_threshold*/, const data_size_t* /*data_indices*/, data_size_t /*cnt*/, data_size_t* /*lte_indices*/, data_size_t* /*gt_indices*/) const override { Log::Fatal("Not implemented."); + return 0; } const void* GetColWiseData(uint8_t* /*bit_type*/, bool* /*is_sparse*/, std::vector* /*bin_iterator*/, const int /*num_threads*/) const override { Log::Fatal("Not implemented."); + return nullptr; } const void* GetColWiseData(uint8_t* /*bit_type*/, bool* /*is_sparse*/, BinIterator** /*bin_iterator*/) const override { Log::Fatal("Not implemented."); + return nullptr; } protected: From eba4560453924f82b76c82997db7af516df0d783 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 7 Dec 2023 07:43:26 +0000 Subject: [PATCH 10/68] add new files and windows/LightGBM.vcxproj and windows/LightGBM.vcxproj.filters --- windows/LightGBM.vcxproj | 4 ++++ windows/LightGBM.vcxproj.filters | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 96fe017e96b8..fe7800fc4123 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -248,6 +248,7 @@ + @@ -283,6 +284,7 @@ + @@ -328,6 +330,8 @@ + + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 27b445893c0f..378d45efdf5a 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -69,6 +69,9 @@ src\io + + src\io + src\metric @@ -141,6 +144,9 @@ include\LightGBM + + include\LightGBM + include\LightGBM\utils @@ -335,6 +341,12 @@ src\io + + src\io + + + src\io + src\io From 873d7ad0abc8bad2ccd3bea3dfd7a89fc13dad64 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 7 Dec 2023 12:06:11 +0000 Subject: [PATCH 11/68] create pairwise dataset --- include/LightGBM/config.h | 1 + include/LightGBM/dataset.h | 2 +- include/LightGBM/feature_group.h | 2 +- .../LightGBM/pairwise_ranking_feature_group.h | 3 +- src/io/dataset.cpp | 20 +++++++++++-- src/io/dataset_loader.cpp | 13 +++++++++ src/io/pairwise_ranking_feature_group.cpp | 28 +++++++++++++++++++ src/objective/objective_function.cpp | 7 +++++ 8 files changed, 69 insertions(+), 7 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 2f65a0592ebb..d9ff994027c1 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -158,6 +158,7 @@ struct Config { // descl2 = ``lambdarank``, `lambdarank `__ objective. `label_gain <#label_gain>`__ can be used to set the gain (weight) of ``int`` label and all values in ``label`` must be smaller than number of elements in ``label_gain`` // descl2 = ``rank_xendcg``, `XE_NDCG_MART `__ ranking objective function, aliases: ``xendcg``, ``xe_ndcg``, ``xe_ndcg_mart``, ``xendcg_mart`` // descl2 = ``rank_xendcg`` is faster than and achieves the similar performance as ``lambdarank`` + // descl2 = ``pairwise_lambdarank``, pairwise lambdarank algorithm // descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect) std::string objective = "regression"; diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 4b8c0f24a569..d58f55028fa9 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -727,7 +727,7 @@ class Dataset { LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset); - LIGHTGBM_EXPORT void CreatePairWiseRankingData(const Dataset* dataset, std::vector> pair_index_map); + LIGHTGBM_EXPORT void CreatePairWiseRankingData(const Dataset* dataset); void InitTrain(const std::vector& is_feature_used, TrainingShareStates* share_state) const; diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index ea01bd84c8da..38f5ab318daf 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -343,7 +343,7 @@ class FeatureGroup { num_feature_ += other->num_feature_; } - inline BinIterator* SubFeatureIterator(int sub_feature) { + inline BinIterator* SubFeatureIterator(int sub_feature) const { uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin(); if (!is_multi_val_) { uint32_t min_bin = bin_offsets_[sub_feature]; diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h index f61e6c5dbc45..4b1f16a38350 100644 --- a/include/LightGBM/pairwise_ranking_feature_group.h +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -28,8 +28,7 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { * \param is_first_or_second_in_pairing Mark whether features in this group belong to the first or second element in the pairing */ - PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map): - FeatureGroup(other, num_original_data), paired_ranking_item_index_map_(paired_ranking_item_index_map), num_data_(num_pairs), is_first_or_second_in_pairing_(is_first_or_second_in_pairing) {} + PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map); /*! * \brief Constructor from memory when data is present diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 3f052d0c8805..60c8d6a8fc56 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -820,7 +820,7 @@ void Dataset::CreateValid(const Dataset* dataset) { gpu_device_id_ = dataset->gpu_device_id_; } -void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vector> /* TODO(shiyu1994) pair_index_map*/) { +void Dataset::CreatePairWiseRankingData(const Dataset* dataset) { metadata_.BuildPairwiseFeatureRanking(dataset->metadata()); feature_groups_.clear(); @@ -835,7 +835,14 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vectorhas_raw(); numeric_feature_map_ = dataset->numeric_feature_map_; - num_numeric_features_ = dataset->num_numeric_features_; + for (const int feature_index : dataset->numeric_feature_map_) { + if (feature_index != -1) { + numeric_feature_map_.push_back(feature_index + dataset->num_features_); + } else { + numeric_feature_map_.push_back(-1); + } + } + num_numeric_features_ = dataset->num_numeric_features_ * 2; // copy feature bin mapper data feature_need_push_zeros_.clear(); group_bin_boundaries_.clear(); @@ -870,7 +877,14 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vectorused_feature_map_.size()); used_feature_map_.insert(used_feature_map_.begin(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end()); - used_feature_map_.insert(used_feature_map_.begin() + dataset->used_feature_map_.size(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end()); + + for (int i = 0; i < dataset->num_total_features_; ++i) { + if (dataset->used_feature_map_[i] != -1) { + used_feature_map_.push_back(i + dataset->num_features_); + } else { + used_feature_map_.push_back(-1); + } + } feature_names_.clear(); for (const std::string& feature_name : dataset->feature_names_) { diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 84bf3907a43c..7b184226d232 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -293,6 +293,12 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac // need to check training data CheckDataset(dataset.get(), is_load_from_binary); + if (config_.objective == std::string("pairwise_lambdarank")) { + std::unique_ptr original_dataset(dataset.release()); + dataset.reset(new Dataset()); + dataset->CreatePairWiseRankingData(original_dataset.get()); + } + return dataset.release(); } @@ -350,6 +356,13 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, // not need to check validation data // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); + + if (config_.objective == std::string("pairwise_lambdarank")) { + std::unique_ptr original_dataset(dataset.release()); + dataset.reset(new Dataset()); + dataset->CreatePairWiseRankingData(original_dataset.get()); + } + return dataset.release(); } diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index 074f6b8e6680..096a01fabd6b 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -5,10 +5,38 @@ */ #include +#include + #include "pairwise_lambdarank_bin.hpp" namespace LightGBM { +PairwiseRankingFeatureGroup::PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map): + FeatureGroup(other, num_original_data), paired_ranking_item_index_map_(paired_ranking_item_index_map), num_data_(num_pairs), is_first_or_second_in_pairing_(is_first_or_second_in_pairing) { + + // copy from original bin data + const int num_threads = OMP_NUM_THREADS(); + std::vector>> bin_iterators(num_threads); + for (int i = 0; i < num_threads; ++i) { + for (int j = 0; j < num_feature_; ++j) { + bin_iterators[i].emplace_back(other.SubFeatureIterator(j)); + bin_iterators[i].back()->Reset(0); + } + } + + Threading::For(0, num_original_data, 512, [this, &other] (int block_index, data_size_t block_start, data_size_t block_end) { + for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { + std::unique_ptr bin_iterator(other.SubFeatureIterator(feature_index)); + bin_iterator->Reset(block_start); + for (data_size_t index = block_start; index < block_end; ++index) { + PushData(block_index, feature_index, index, bin_iterator->RawGet(index)); + } + } + }); + + FinishLoad(); +} + void PairwiseRankingFeatureGroup::CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { CHECK(!is_multi_val); // do not support multi-value bin for now if (is_multi_val) { diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index a203017cf36e..508a59fe0364 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -44,6 +44,9 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new CUDAMulticlassSoftmax(config); } else if (type == std::string("multiclassova")) { return new CUDAMulticlassOVA(config); + } else if (type == std::string("pairwise_lambdarank")) { + Log::Warning("Objective pairwise_lambdarank is not implemented in cuda version. Fall back to boosting on CPU."); + return new PairwiseLambdarankNDCG(config); } else if (type == std::string("cross_entropy")) { Log::Warning("Objective cross_entropy is not implemented in cuda version. Fall back to boosting on CPU."); return new CrossEntropy(config); @@ -81,6 +84,8 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new BinaryLogloss(config); } else if (type == std::string("lambdarank")) { return new LambdarankNDCG(config); + } else if (type == std::string("pairwise_lambdarank")) { + return new PairwiseLambdarankNDCG(config); } else if (type == std::string("rank_xendcg")) { return new RankXENDCG(config); } else if (type == std::string("multiclass")) { @@ -126,6 +131,8 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new BinaryLogloss(strs); } else if (type == std::string("lambdarank")) { return new LambdarankNDCG(strs); + } else if (type == std::string("pairwise_lambdarank")) { + return new PairwiseLambdarankNDCG(strs); } else if (type == std::string("rank_xendcg")) { return new RankXENDCG(strs); } else if (type == std::string("multiclass")) { From 986a9797ed61366d8618208954398c548ec4a75e Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 7 Dec 2023 13:15:27 +0000 Subject: [PATCH 12/68] set num_data_ of pairwise dataset --- include/LightGBM/dataset.h | 7 +++++-- src/io/dataset.cpp | 9 ++++++--- src/io/dataset_loader.cpp | 7 ++----- src/io/metadata.cpp | 9 ++++++++- src/io/pairwise_ranking_feature_group.cpp | 2 ++ 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index d58f55028fa9..bfb06719bac7 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -207,8 +207,9 @@ class Metadata { /*! * \brief Build metadata for ranking with pairwise features from metadata of an existing ranking dataset * \param metadata Reference to metadata of the existing ranking dataset + * \return The number of paired data */ - void BuildPairwiseFeatureRanking(const Metadata& metadata); + data_size_t BuildPairwiseFeatureRanking(const Metadata& metadata); /*! * \brief Perform any extra operations after all data has been loaded @@ -388,6 +389,8 @@ class Metadata { data_size_t num_positions_; /*! \brief Label data */ std::vector label_; + /*! \brief Paired label data for pairwise lambdarank */ + std::vector paired_label_; /*! \brief Weights data */ std::vector weights_; /*! \brief Positions data */ @@ -407,7 +410,7 @@ class Metadata { /*! \brief Queries data */ std::vector queries_; /*! \brief Mode for pairwise ranking */ - PairwiseRankingMode pairwise_ranking_mode_; + PairwiseRankingMode pairwise_ranking_mode_ = PairwiseRankingMode::kRelevance; /*! \brief Pairwise data index to original data indices for ranking with pairwise features */ std::vector> paired_ranking_item_index_map_; /*! \brief mutex for threading safe call */ diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 60c8d6a8fc56..4c3f7d393f52 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -353,10 +353,13 @@ void Dataset::Construct(std::vector>* bin_mappers, auto is_sparse = io_config.is_enable_sparse; if (io_config.device_type == std::string("cuda")) { LGBM_config_::current_device = lgbm_device_cuda; - if ((io_config.device_type == std::string("cuda")) && is_sparse) { + if (is_sparse) { Log::Warning("Using sparse features with CUDA is currently not supported."); is_sparse = false; } + } else if ((io_config.objective == std::string("pairwise_lambdarank")) && is_sparse) { + Log::Warning("Using sparse features with pairwise_lambdarank is currently not supported."); + is_sparse = false; } std::vector group_is_multi_val(used_features.size(), 0); @@ -821,7 +824,7 @@ void Dataset::CreateValid(const Dataset* dataset) { } void Dataset::CreatePairWiseRankingData(const Dataset* dataset) { - metadata_.BuildPairwiseFeatureRanking(dataset->metadata()); + num_data_ = metadata_.BuildPairwiseFeatureRanking(dataset->metadata()); feature_groups_.clear(); num_features_ = dataset->num_features_ * 2; @@ -880,7 +883,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset) { for (int i = 0; i < dataset->num_total_features_; ++i) { if (dataset->used_feature_map_[i] != -1) { - used_feature_map_.push_back(i + dataset->num_features_); + used_feature_map_.push_back(dataset->used_feature_map_[i] + dataset->num_features_); } else { used_feature_map_.push_back(-1); } diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 7b184226d232..0057c5ff1a4a 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -357,11 +357,8 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); - if (config_.objective == std::string("pairwise_lambdarank")) { - std::unique_ptr original_dataset(dataset.release()); - dataset.reset(new Dataset()); - dataset->CreatePairWiseRankingData(original_dataset.get()); - } + // TODO(shiyu1994) + Log::Warning("Pairwise ranking with validation set is not supported yet."); return dataset.release(); } diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 034529d8f33a..1fdb7f706cea 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -853,9 +853,11 @@ size_t Metadata::SizesInByte() const { return size; } -void Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { +data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { num_data_ = 0; num_queries_ = metadata.num_queries(); + label_.clear(); + paired_label_.clear(); if (pairwise_ranking_mode_ == PairwiseRankingMode::kRelevance) { const label_t* labels = metadata.label(); paired_ranking_item_index_map_.clear(); @@ -873,6 +875,8 @@ void Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { continue; } const label_t label_j = labels[item_index_j]; + label_.push_back(label_i); + paired_label_.push_back(label_j); if (label_i != label_j) { paired_ranking_item_index_map_.push_back(std::pair{item_index_i, item_index_j}); ++num_pairs_in_query; @@ -884,7 +888,10 @@ void Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { } } else { // TODO(shiyu1994) + Log::Fatal("Not implemented."); } + + return num_data_; } } // namespace LightGBM diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index 096a01fabd6b..19fb1c98ca34 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -14,6 +14,8 @@ namespace LightGBM { PairwiseRankingFeatureGroup::PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map): FeatureGroup(other, num_original_data), paired_ranking_item_index_map_(paired_ranking_item_index_map), num_data_(num_pairs), is_first_or_second_in_pairing_(is_first_or_second_in_pairing) { + CreateBinData(num_original_data, is_multi_val_, !is_sparse_, is_sparse_); + // copy from original bin data const int num_threads = OMP_NUM_THREADS(); std::vector>> bin_iterators(num_threads); From c40965a12a842f543bc9066b854bcb165655f2f9 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 15 Dec 2023 08:24:42 +0000 Subject: [PATCH 13/68] skip query with no paired items --- src/io/metadata.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 1fdb7f706cea..173cf0ec7cae 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -865,6 +865,7 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { data_size_t num_pairs_in_query = 0; query_boundaries_.clear(); query_boundaries_.push_back(0); + num_queries_ = 0; for (data_size_t query_index = 0; query_index < num_queries_; ++query_index) { const data_size_t query_start = query_boundaries[query_index]; const data_size_t query_end = query_boundaries[query_index + 1]; @@ -884,7 +885,10 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { } } } - query_boundaries_.push_back(num_pairs_in_query); + if (num_pairs_in_query > 0) { + query_boundaries_.push_back(num_pairs_in_query); + ++num_queries_; + } } } else { // TODO(shiyu1994) From 97d34d7f0e693bd7d8ece1fc53a680f24fadb773 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 31 Jan 2024 06:15:37 +0000 Subject: [PATCH 14/68] store original query information store relative data indices for pairs store only original instead of paired labels --- include/LightGBM/dataset.h | 6 +++--- src/io/metadata.cpp | 31 +++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index bfb06719bac7..d5a7138a8302 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -387,10 +387,8 @@ class Metadata { data_size_t num_weights_; /*! \brief Number of positions, used to check correct position file */ data_size_t num_positions_; - /*! \brief Label data */ + /*! \brief Label data. In pairwise ranking, the label_ refer to the labels of the original unpaired dataset. */ std::vector label_; - /*! \brief Paired label data for pairwise lambdarank */ - std::vector paired_label_; /*! \brief Weights data */ std::vector weights_; /*! \brief Positions data */ @@ -399,6 +397,8 @@ class Metadata { std::vector position_ids_; /*! \brief Query boundaries */ std::vector query_boundaries_; + /*! \brief Original query boundaries, used in pairwise ranking */ + std::vector original_query_boundaries_; /*! \brief Query weights */ std::vector query_weights_; /*! \brief Number of querys */ diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 173cf0ec7cae..3758e1e09302 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -857,11 +857,28 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { num_data_ = 0; num_queries_ = metadata.num_queries(); label_.clear(); - paired_label_.clear(); if (pairwise_ranking_mode_ == PairwiseRankingMode::kRelevance) { - const label_t* labels = metadata.label(); + const label_t* original_label = metadata.label(); paired_ranking_item_index_map_.clear(); const data_size_t* query_boundaries = metadata.query_boundaries(); + + // backup original query boundaries + original_query_boundaries_.clear(); + original_query_boundaries_.resize(num_queries_); + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_queries_ >= 1024) + for (data_size_t i = 0; i < num_queries_; ++i) { + original_query_boundaries_[i] = query_boundaries[i]; + } + + // copy labels + const data_size_t original_num_data = query_boundaries[num_queries_]; + label_.resize(original_num_data); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (original_num_data >= 1024) + for (data_size_t i = 0; i < original_num_data; ++i) { + label_[i] = original_label[i]; + } + data_size_t num_pairs_in_query = 0; query_boundaries_.clear(); query_boundaries_.push_back(0); @@ -870,16 +887,14 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { const data_size_t query_start = query_boundaries[query_index]; const data_size_t query_end = query_boundaries[query_index + 1]; for (data_size_t item_index_i = query_start; item_index_i < query_end; ++item_index_i) { - const label_t label_i = labels[item_index_i]; + const label_t label_i = label_[item_index_i]; for (data_size_t item_index_j = query_start; item_index_j < query_end; ++item_index_j) { if (item_index_i == item_index_j) { continue; } - const label_t label_j = labels[item_index_j]; - label_.push_back(label_i); - paired_label_.push_back(label_j); + const label_t label_j = label_[item_index_j]; if (label_i != label_j) { - paired_ranking_item_index_map_.push_back(std::pair{item_index_i, item_index_j}); + paired_ranking_item_index_map_.push_back(std::pair{item_index_i - query_start, item_index_j - query_start}); ++num_pairs_in_query; ++num_data_; } @@ -894,7 +909,7 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { // TODO(shiyu1994) Log::Fatal("Not implemented."); } - + return num_data_; } From 1e57e271698cdf09c1626a2c6c991c5032b52d90 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 31 Jan 2024 13:05:25 +0000 Subject: [PATCH 15/68] copy position information for pairwise dataset --- src/io/metadata.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 3758e1e09302..85267f409bd0 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -857,6 +857,8 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { num_data_ = 0; num_queries_ = metadata.num_queries(); label_.clear(); + positions_.clear(); + position_ids_.clear(); if (pairwise_ranking_mode_ == PairwiseRankingMode::kRelevance) { const label_t* original_label = metadata.label(); paired_ranking_item_index_map_.clear(); @@ -879,6 +881,23 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { label_[i] = original_label[i]; } + if (metadata.num_position_ids() > 0) { + positions_.resize(original_num_data); + const data_size_t* original_positions = metadata.positions(); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (original_num_data >= 1024) + for (data_size_t i = 0; i < original_num_data; ++i) { + positions_[i] = original_positions[i]; + } + + const data_size_t num_position_ids = static_cast(metadata.num_position_ids()); + position_ids_.resize(num_position_ids); + const std::string* original_position_ids = metadata.position_ids(); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_position_ids >= 1024) + for (data_size_t i = 0; i < num_position_ids; ++i) { + position_ids_[i] = original_position_ids[i]; + } + } + data_size_t num_pairs_in_query = 0; query_boundaries_.clear(); query_boundaries_.push_back(0); From 1699c06ba759f4ea25e93c270679251b85c7bb8e Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 1 Feb 2024 03:11:33 +0000 Subject: [PATCH 16/68] rename to pointwise members provide interface for pointwise query boundaries copy pointwise weights --- include/LightGBM/dataset.h | 15 ++++++++++++- src/io/metadata.cpp | 45 +++++++++++++++++++++++--------------- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index d5a7138a8302..4a179319e955 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -280,6 +280,7 @@ class Metadata { * we assume data will order by query, * the interval of [query_boundaris[i], query_boundaris[i+1]) * is the data indices for query i. + * When pairwise ranking, this points to the paired query boundaries. * \return Pointer of data boundaries on queries */ inline const data_size_t* query_boundaries() const { @@ -290,6 +291,18 @@ class Metadata { } } + /*! + * \brief Used in pairwise ranking. Pointwise query boundaries. + * \return Pointer of data boundaries on queries + */ + inline const data_size_t* pointwise_query_boundaries() const { + if (!pointwise_query_boundaries_.empty()) { + return pointwise_query_boundaries_.data(); + } else { + return nullptr; + } + } + /*! * \brief Get Number of queries * \return Number of queries @@ -398,7 +411,7 @@ class Metadata { /*! \brief Query boundaries */ std::vector query_boundaries_; /*! \brief Original query boundaries, used in pairwise ranking */ - std::vector original_query_boundaries_; + std::vector pointwise_query_boundaries_; /*! \brief Query weights */ std::vector query_weights_; /*! \brief Number of querys */ diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 85267f409bd0..58b4a88b0a63 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -860,41 +860,50 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { positions_.clear(); position_ids_.clear(); if (pairwise_ranking_mode_ == PairwiseRankingMode::kRelevance) { - const label_t* original_label = metadata.label(); + const label_t* pointwise_label = metadata.label(); + const label_t* pointwise_weights = metadata.weights(); paired_ranking_item_index_map_.clear(); const data_size_t* query_boundaries = metadata.query_boundaries(); - - // backup original query boundaries - original_query_boundaries_.clear(); - original_query_boundaries_.resize(num_queries_); + + // backup pointwise query boundaries + pointwise_query_boundaries_.clear(); + pointwise_query_boundaries_.resize(num_queries_); const int num_threads = OMP_NUM_THREADS(); #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_queries_ >= 1024) for (data_size_t i = 0; i < num_queries_; ++i) { - original_query_boundaries_[i] = query_boundaries[i]; + pointwise_query_boundaries_[i] = query_boundaries[i]; } // copy labels - const data_size_t original_num_data = query_boundaries[num_queries_]; - label_.resize(original_num_data); - #pragma omp parallel for schedule(static) num_threads(num_threads) if (original_num_data >= 1024) - for (data_size_t i = 0; i < original_num_data; ++i) { - label_[i] = original_label[i]; + const data_size_t pointwise_num_data = query_boundaries[num_queries_]; + label_.resize(pointwise_num_data); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (pointwise_num_data >= 1024) + for (data_size_t i = 0; i < pointwise_num_data; ++i) { + label_[i] = pointwise_label[i]; + } + + // copy weights + weights_.resize(pointwise_num_data); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (pointwise_num_data >= 1024) + for (data_size_t i = 0; i < pointwise_num_data; ++i) { + weights_[i] = pointwise_weights[i]; } + // copy position information if (metadata.num_position_ids() > 0) { - positions_.resize(original_num_data); - const data_size_t* original_positions = metadata.positions(); - #pragma omp parallel for schedule(static) num_threads(num_threads) if (original_num_data >= 1024) - for (data_size_t i = 0; i < original_num_data; ++i) { - positions_[i] = original_positions[i]; + positions_.resize(pointwise_num_data); + const data_size_t* pointwise_positions = metadata.positions(); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (pointwise_num_data >= 1024) + for (data_size_t i = 0; i < pointwise_num_data; ++i) { + positions_[i] = pointwise_positions[i]; } const data_size_t num_position_ids = static_cast(metadata.num_position_ids()); position_ids_.resize(num_position_ids); - const std::string* original_position_ids = metadata.position_ids(); + const std::string* pointwise_position_ids = metadata.position_ids(); #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_position_ids >= 1024) for (data_size_t i = 0; i < num_position_ids; ++i) { - position_ids_[i] = original_position_ids[i]; + position_ids_[i] = pointwise_position_ids[i]; } } From d5b6f0a69531c9fa06ebc8c468208fd9c3b1893e Mon Sep 17 00:00:00 2001 From: Pavel Metrikov Date: Thu, 8 Feb 2024 22:41:37 -0800 Subject: [PATCH 17/68] adding initial support for pairwise gradients and NDCG eval with pairwise scores --- include/LightGBM/objective_function.h | 3 + src/metric/rank_metric.hpp | 47 +++- src/objective/rank_objective.hpp | 356 +++++++++++++++++++++++++- 3 files changed, 401 insertions(+), 5 deletions(-) diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index ad188dc39676..85b52c3ce35c 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -110,6 +110,9 @@ class ObjectiveFunction { #endif // USE_CUDA }; +void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointwise, const double* score, data_size_t cnt_pointwise, + data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, int truncation_level, double sigma); + } // namespace LightGBM #endif // LightGBM_OBJECTIVE_FUNCTION_H_ diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index e2adb8c082d4..97f05ccc037b 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -9,10 +9,12 @@ #include #include #include +#include #include #include #include +#include namespace LightGBM { @@ -26,6 +28,9 @@ class NDCGMetric:public Metric { DCGCalculator::DefaultLabelGain(&label_gain); // initialize DCG calculator DCGCalculator::Init(label_gain); + pairwise_scores_ = config.objective == std::string("pairwise_lambdarank"); + sigmoid_ = config.sigmoid; + truncation_level_ = config.lambdarank_truncation_level; } ~NDCGMetric() { @@ -34,14 +39,14 @@ class NDCGMetric:public Metric { for (auto k : eval_at_) { name_.emplace_back(std::string("ndcg@") + std::to_string(k)); } - num_data_ = num_data; + num_data_ = pairwise_scores_? metadata.pointwise_query_boundaries()[metadata.num_queries()] : num_data; // get label label_ = metadata.label(); num_queries_ = metadata.num_queries(); DCGCalculator::CheckMetadata(metadata, num_queries_); DCGCalculator::CheckLabel(label_, num_data_); // get query boundaries - query_boundaries_ = metadata.query_boundaries(); + query_boundaries_ = pairwise_scores_? metadata.pointwise_query_boundaries() : metadata.query_boundaries(); if (query_boundaries_ == nullptr) { Log::Fatal("The NDCG metric requires query information"); } @@ -73,6 +78,12 @@ class NDCGMetric:public Metric { } } } + if (pairwise_scores_) { + paired_index_map_ = metadata.paired_ranking_item_index_map(); + scores_pointwise_.resize(num_data_, 0.0); + num_data_pairwise_ = num_data; + query_boundaries_pairwise_ = metadata.query_boundaries(); + } } const std::vector& GetName() const override { @@ -101,9 +112,19 @@ class NDCGMetric:public Metric { result_buffer_[tid][j] += 1.0f; } } else { + if (pairwise_scores_) { + const data_size_t start_pointwise = query_boundaries_[i]; + const data_size_t cnt_pointwise = query_boundaries_[i + 1] - query_boundaries_[i]; + const data_size_t start_pairwise = query_boundaries_pairwise_[i]; + const data_size_t cnt_pairwise = query_boundaries_[i + 1] - query_boundaries_[i]; + std::vector all_pairs(cnt_pairwise); + std::iota(all_pairs.begin(), all_pairs.end(), 0); + UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), paired_index_map_ + start_pairwise, truncation_level_, sigmoid_); + } + // calculate DCG DCGCalculator::CalDCG(eval_at_, label_ + query_boundaries_[i], - score + query_boundaries_[i], + pairwise_scores_? scores_pointwise_.data(): score + query_boundaries_[i], query_boundaries_[i + 1] - query_boundaries_[i], &tmp_dcg); // calculate NDCG for (size_t j = 0; j < eval_at_.size(); ++j) { @@ -121,9 +142,18 @@ class NDCGMetric:public Metric { result_buffer_[tid][j] += 1.0f; } } else { + if (pairwise_scores_) { + const data_size_t start_pointwise = query_boundaries_[i]; + const data_size_t cnt_pointwise = query_boundaries_[i + 1] - query_boundaries_[i]; + const data_size_t start_pairwise = query_boundaries_pairwise_[i]; + const data_size_t cnt_pairwise = query_boundaries_[i + 1] - query_boundaries_[i]; + std::vector all_pairs(cnt_pairwise); + std::iota(all_pairs.begin(), all_pairs.end(), 0); + UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), paired_index_map_ + start_pairwise, truncation_level_, sigmoid_); + } // calculate DCG DCGCalculator::CalDCG(eval_at_, label_ + query_boundaries_[i], - score + query_boundaries_[i], + pairwise_scores_ ? scores_pointwise_.data() : score + query_boundaries_[i], query_boundaries_[i + 1] - query_boundaries_[i], &tmp_dcg); // calculate NDCG for (size_t j = 0; j < eval_at_.size(); ++j) { @@ -162,6 +192,15 @@ class NDCGMetric:public Metric { std::vector eval_at_; /*! \brief Cache the inverse max dcg for all queries */ std::vector> inverse_max_dcgs_; + bool pairwise_scores_; + double sigmoid_; + /*! \brief Truncation position for max DCG */ + int truncation_level_; + mutable std::vector scores_pointwise_; + const std::pair* paired_index_map_; + /*! \brief Number of data */ + data_size_t num_data_pairwise_; + const data_size_t* query_boundaries_pairwise_; }; } // namespace LightGBM diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 5c5fb3c7183b..19e089c56fa8 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -6,6 +6,11 @@ #ifndef LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ +#define model_indirect_comparisons_ true +#define model_conditional_rel_ true +#define indirect_comparisons_above_only true +#define logarithmic_discounts true + #include #include @@ -17,9 +22,104 @@ #include #include #include +#include namespace LightGBM { + void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointwise, const double* score, data_size_t cnt_pointwise, + data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, int truncation_level, double sigma) { + // get sorted indices for scores + std::vector sorted_idx(cnt_pointwise); + for (data_size_t i = 0; i < cnt_pointwise; ++i) { + sorted_idx[i] = i; + } + std::stable_sort( + sorted_idx.begin(), sorted_idx.end(), + [score_pointwise](data_size_t a, data_size_t b) { return score_pointwise[a] > score_pointwise[b]; }); + // get ranks when sorted by scores + std::vector ranks(cnt_pointwise); + for (int i = 0; i < cnt_pointwise; i++) { + ranks[sorted_idx.at(i)] = i; + } + + std::multimap mapRight2Left; + std::multimap mapLeft2Right; + std::map, data_size_t> mapLeftRight2Pair; + for (data_size_t i = 0; i < selected_pairs_cnt; ++i) { + data_size_t current_pair = selected_pairs[i]; + int indexLeft = paired_index_map[current_pair].first; + int indexRight = paired_index_map[current_pair].second; + mapRight2Left.insert(std::make_pair(indexRight, indexLeft)); + mapLeft2Right.insert(std::make_pair(indexLeft, indexRight)); + mapLeftRight2Pair.insert(std::make_pair(std::make_pair(indexLeft, indexRight), current_pair)); + } + + std::vector gradients(cnt_pointwise); + std::vector hessians(cnt_pointwise); + for (data_size_t i = 0; i < selected_pairs_cnt; i++) { + data_size_t current_pair = selected_pairs[i]; + int indexLeft = paired_index_map[current_pair].first; + int indexRight = paired_index_map[current_pair].second; + if (ranks[indexLeft] >= truncation_level && ranks[indexRight] >= truncation_level) { continue; } + + double delta_score = score[current_pair]; + int comparisons = 1; + data_size_t current_pair_inverse = -1; + if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexLeft)) > 0) { + current_pair_inverse = mapLeftRight2Pair.at(std::make_pair(indexRight, indexLeft)); + delta_score -= score[current_pair_inverse]; + comparisons++; + } + if (model_indirect_comparisons_) { + auto indexHead_range = mapRight2Left.equal_range(indexLeft); + for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { + data_size_t indexHead = indexHead_it->second; + if (mapLeftRight2Pair.count(std::make_pair(indexHead, indexRight)) > 0 && + (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { + data_size_t indexHeadLeft = mapLeftRight2Pair.at(std::make_pair(indexHead, indexLeft)); + data_size_t indexHeadRight = mapLeftRight2Pair.at(std::make_pair(indexHead, indexRight)); + delta_score += score[indexHeadRight] - score[indexHeadLeft]; + comparisons++; + } + } + auto indexTail_range = mapLeft2Right.equal_range(indexLeft); + for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { + data_size_t indexTail = indexTail_it->second; + if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexTail)) > 0 && + (!indirect_comparisons_above_only || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && + (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { + data_size_t indexLeftTail = mapLeftRight2Pair.at(std::make_pair(indexLeft, indexTail)); + data_size_t indexRightTail = mapLeftRight2Pair.at(std::make_pair(indexRight, indexTail)); + delta_score += score[indexLeftTail] - score[indexRightTail]; + comparisons++; + } + } + } + double delta_score_pointwise = score_pointwise[indexLeft] - score_pointwise[indexRight]; + if (delta_score_pointwise == kMinScore || -delta_score_pointwise == kMinScore || delta_score == kMinScore || -delta_score == kMinScore) { continue; } + delta_score /= comparisons; + // get discount of this pair + const double paired_discount = logarithmic_discounts ? fabs(DCGCalculator::GetDiscount(ranks[indexRight]) - DCGCalculator::GetDiscount(ranks[indexLeft])) : 1.0; + //double p_lr = GetSigmoid(delta_score); + double p_lr = 1.0f / (1.0f + std::exp(-delta_score * sigma)); + double p_rl = 1.0 - p_lr; + //double p_lr_pointwise = GetSigmoid(delta_score_pointwise); + double p_lr_pointwise = 1.0f / (1.0f + std::exp(-delta_score_pointwise * sigma)); + double p_rl_pointwise = 1.0 - p_lr_pointwise; + gradients[indexLeft] += paired_discount * (p_rl_pointwise - p_rl); + hessians[indexLeft] += paired_discount * p_rl_pointwise * p_lr_pointwise; + gradients[indexRight] -= paired_discount * (p_rl_pointwise - p_rl); + hessians[indexRight] += paired_discount * p_rl_pointwise * p_lr_pointwise; + } + + for (data_size_t i = 0; i < cnt_pointwise; i++) { + double delta = 0.3 * gradients[i] / (std::abs(hessians[i]) + 0.001); + delta = std::min(delta, 0.3); + delta = std::max(delta, -0.3); + score_pointwise[i] += delta; + } + } + /*! * \brief Objective function for Ranking */ @@ -461,10 +561,264 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { void Init(const Metadata& metadata, data_size_t num_data) override { LambdarankNDCG::Init(metadata, num_data); - + query_boundaries_pointwise_ = metadata.pointwise_query_boundaries(); + if (query_boundaries_pointwise_ == nullptr) { + Log::Fatal("Ranking tasks require query information"); + } + num_data_pointwise_ = query_boundaries_pointwise_[num_queries_]; paired_index_map_ = metadata.paired_ranking_item_index_map(); + scores_pointwise_.resize(num_data_pointwise_, 0.0); } + void GetGradients(const double* score, score_t* gradients, + score_t* hessians) const override { + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) + for (data_size_t i = 0; i < num_queries_; ++i) { + const data_size_t start_pointwise = query_boundaries_pointwise_[i]; + const data_size_t cnt_pointwise = query_boundaries_pointwise_[i + 1] - query_boundaries_pointwise_[i]; + const data_size_t start = query_boundaries_[i]; + const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i]; + std::vector score_adjusted; + if (num_position_ids_ > 0) { + for (data_size_t j = 0; j < cnt; ++j) { + score_adjusted.push_back(score[start + j] + pos_biases_[positions_[start_pointwise + paired_index_map_[start + j].first]] - pos_biases_[positions_[start_pointwise + paired_index_map_[start + j].second]]); + } + } + GetGradientsForOneQuery(i, cnt_pointwise, cnt, label_ + start_pointwise, scores_pointwise_.data(), num_position_ids_ > 0 ? score_adjusted.data() : score + start, + gradients + start, hessians + start); + + std::vector all_pairs(cnt); + std::iota(all_pairs.begin(), all_pairs.end(), 0); + UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start, cnt_pointwise, cnt, all_pairs.data(), paired_index_map_ + start, truncation_level_, sigmoid_); + } + if (num_position_ids_ > 0) { + std::vector gradients_pointwise(num_data_pointwise_); + std::vector hessians_pointwise(num_data_pointwise_); + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) + for (data_size_t i = 0; i < num_queries_; ++i) { + const data_size_t cnt_pointwise = query_boundaries_pointwise_[i + 1] - query_boundaries_pointwise_[i]; + const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i]; + TransformGradientsPairwiseIntoPointwiseForOneQuery(i, cnt_pointwise, cnt, gradients, hessians, gradients_pointwise.data(), hessians_pointwise.data()); + } + UpdatePositionBiasFactors(gradients_pointwise.data(), hessians_pointwise.data()); + } + } + + inline void TransformGradientsPairwiseIntoPointwiseForOneQuery(data_size_t query_id, data_size_t cnt_pointwise, data_size_t cnt, + const score_t* gradients, const score_t* hessians, score_t* gradients_pointwise, score_t* hessians_pointwise) const { + // initialize with zero + for (data_size_t i = 0; i < cnt_pointwise; ++i) { + gradients_pointwise[i] = 0.0f; + hessians_pointwise[i] = 0.0f; + } + const data_size_t start = query_boundaries_[query_id]; + for (data_size_t i = 0; i < cnt; i++) { + int indexLeft = paired_index_map_[i + start].first; + int indexRight = paired_index_map_[i + start].second; + gradients_pointwise[indexLeft] += gradients[i]; + gradients_pointwise[indexRight] -= gradients[i]; + hessians_pointwise[indexLeft] += hessians[i]; + hessians_pointwise[indexRight] += hessians[i]; + } + } + + + inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt_pointwise, data_size_t cnt, + const label_t* label, const double* score_pointwise, const double* score, + score_t* lambdas, + score_t* hessians) const { + + const data_size_t start_pointwise = query_boundaries_pointwise_[query_id]; + const data_size_t start = query_boundaries_[query_id]; + + // get max DCG on current query + const double inverse_max_dcg = inverse_max_dcgs_[query_id]; + // initialize with zero + for (data_size_t i = 0; i < cnt; ++i) { + lambdas[i] = 0.0f; + hessians[i] = 0.0f; + } + // get sorted indices for scores + std::vector sorted_idx(cnt_pointwise); + for (data_size_t i = 0; i < cnt_pointwise; ++i) { + sorted_idx[i] = i; + } + std::stable_sort( + sorted_idx.begin(), sorted_idx.end(), + [score_pointwise](data_size_t a, data_size_t b) { return score_pointwise[a] > score_pointwise[b]; }); + // get ranks when sorted by scores + std::vector ranks(cnt_pointwise); + for (int i = 0; i < cnt_pointwise; i++) { + ranks[sorted_idx.at(i)] = i; + } + // get best and worst score + const double best_score = score_pointwise[sorted_idx[0]]; + data_size_t worst_idx = cnt_pointwise - 1; + if (worst_idx > 0 && score_pointwise[sorted_idx[worst_idx]] == kMinScore) { + worst_idx -= 1; + } + const double worst_score = score_pointwise[sorted_idx[worst_idx]]; + + std::multimap mapRight2Left; + std::multimap mapLeft2Right; + std::map, data_size_t> mapLeftRight2Pair; + + for (data_size_t i = 0; i < cnt; ++i) { + int indexLeft = paired_index_map_[i + start].first; + int indexRight = paired_index_map_[i + start].second; + mapRight2Left.insert(std::make_pair(indexRight, indexLeft)); + mapLeft2Right.insert(std::make_pair(indexLeft, indexRight)); + mapLeftRight2Pair.insert(std::make_pair(std::make_pair(indexLeft, indexRight), i)); + } + + double sum_lambdas = 0.0; + // start accmulate lambdas by pairs + for (data_size_t i = 0; i < cnt; i++) { + int indexLeft = paired_index_map_[i + start].first; + int indexRight = paired_index_map_[i + start].second; + + if (label[indexLeft] <= label[indexRight] || (ranks[indexLeft] >= truncation_level_ && ranks[indexRight] >= truncation_level_)) { + continue; + } + + const data_size_t high = indexLeft; + const data_size_t low = indexRight; + const data_size_t high_rank = ranks[high]; + const data_size_t low_rank = ranks[low]; + const int high_label = static_cast(label[high]); + const double high_label_gain = label_gain_[high_label]; + const double high_discount = DCGCalculator::GetDiscount(high_rank); + const int low_label = static_cast(label[low]); + const double low_label_gain = label_gain_[low_label]; + const double low_discount = DCGCalculator::GetDiscount(low_rank); + double delta_score = score[i]; + int comparisons = 1; + + data_size_t i_inverse = -1; + if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexLeft)) > 0) { + i_inverse = mapLeftRight2Pair.at(std::make_pair(indexRight, indexLeft)); + delta_score -= score[i_inverse]; + comparisons++; + } + if (model_indirect_comparisons_) { + auto indexHead_range = mapRight2Left.equal_range(indexLeft); + for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { + data_size_t indexHead = indexHead_it->second; + if (mapLeftRight2Pair.count(std::make_pair(indexHead, indexRight)) > 0 && + (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { + data_size_t indexHeadLeft = mapLeftRight2Pair.at(std::make_pair(indexHead, indexLeft)); + data_size_t indexHeadRight = mapLeftRight2Pair.at(std::make_pair(indexHead, indexRight)); + delta_score += score[indexHeadRight] - score[indexHeadLeft]; + comparisons++; + } + } + auto indexTail_range = mapLeft2Right.equal_range(indexLeft); + for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { + data_size_t indexTail = indexTail_it->second; + if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexTail)) > 0 && + (!indirect_comparisons_above_only || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && + (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { + data_size_t indexLeftTail = mapLeftRight2Pair.at(std::make_pair(indexLeft, indexTail)); + data_size_t indexRightTail = mapLeftRight2Pair.at(std::make_pair(indexRight, indexTail)); + delta_score += score[indexLeftTail] - score[indexRightTail]; + comparisons++; + } + } + } + + if (delta_score == kMinScore || -delta_score == kMinScore) { continue; } + delta_score /= comparisons; + + // get dcg gap + const double dcg_gap = high_label_gain - low_label_gain; + // get discount of this pair + const double paired_discount = fabs(high_discount - low_discount); + // get delta NDCG + double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; + // regularize the delta_pair_NDCG by score distance + if (norm_ && best_score != worst_score) { + delta_pair_NDCG /= (0.01f + fabs(delta_score)); + } + // calculate lambda for this pair + double p_lambda = GetSigmoid(delta_score); + double p_hessian = p_lambda * (1.0f - p_lambda); + // update + p_lambda *= -sigmoid_ * delta_pair_NDCG; + p_hessian *= sigmoid_ * sigmoid_ * delta_pair_NDCG; + if (weights_ != nullptr) { + p_lambda *= weights_[start_pointwise + high] * weights_[start_pointwise + low]; + p_hessian *= weights_[start_pointwise + high] * weights_[start_pointwise + low]; + } + lambdas[i] += static_cast(p_lambda / comparisons); + hessians[i] += static_cast(p_hessian / comparisons); + if (i_inverse >= 0) { + lambdas[i_inverse] -= static_cast(p_lambda / comparisons); + hessians[i_inverse] += static_cast(p_hessian / comparisons); + } + if (model_indirect_comparisons_) { + auto indexHead_range = mapRight2Left.equal_range(indexLeft); + for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { + data_size_t indexHead = indexHead_it->second; + if (mapLeftRight2Pair.count(std::make_pair(indexHead, indexRight)) > 0 && + (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { + data_size_t indexHeadLeft = mapLeftRight2Pair.at(std::make_pair(indexHead, indexLeft)); + data_size_t indexHeadRight = mapLeftRight2Pair.at(std::make_pair(indexHead, indexRight)); + lambdas[indexHeadRight] += static_cast(p_lambda / comparisons); + hessians[indexHeadRight] += static_cast(p_hessian / comparisons); + lambdas[indexHeadLeft] -= static_cast(p_lambda / comparisons); + hessians[indexHeadLeft] += static_cast(p_hessian / comparisons); + } + } + auto indexTail_range = mapLeft2Right.equal_range(indexLeft); + for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { + data_size_t indexTail = indexTail_it->second; + if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexTail)) > 0 && + (!indirect_comparisons_above_only || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && + (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { + data_size_t indexLeftTail = mapLeftRight2Pair.at(std::make_pair(indexLeft, indexTail)); + data_size_t indexRightTail = mapLeftRight2Pair.at(std::make_pair(indexRight, indexTail)); + lambdas[indexLeftTail] += static_cast(p_lambda / comparisons); + hessians[indexLeftTail] += static_cast(p_hessian / comparisons); + lambdas[indexRightTail] -= static_cast(p_lambda / comparisons); + hessians[indexRightTail] += static_cast(p_hessian / comparisons); + } + } + } + // lambda is negative, so use minus to accumulate + sum_lambdas -= 2 * p_lambda; + } + + if (norm_ && sum_lambdas > 0) { + double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; + for (data_size_t i = 0; i < cnt; ++i) { + lambdas[i] = static_cast(lambdas[i] * norm_factor); + hessians[i] = static_cast(hessians[i] * norm_factor); + } + } + } + + inline double GetSigmoid(double score) const { + if (score <= min_sigmoid_input_) { + // too small, use lower bound + return sigmoid_table_[0]; + } + else if (score >= max_sigmoid_input_) { + // too large, use upper bound + return sigmoid_table_[_sigmoid_bins - 1]; + } + else { + return sigmoid_table_[static_cast((score - min_sigmoid_input_) * + sigmoid_table_idx_factor_)]; + } + } + + protected: + /*! \brief Query boundaries for pointwise data instances */ + const data_size_t* query_boundaries_pointwise_; + /*! \brief Number of pointwise data */ + data_size_t num_data_pointwise_; + mutable std::vector scores_pointwise_; + private: const std::pair* paired_index_map_; }; From 2ee1199947e762af2777eb62ee8bb55502ecb719 Mon Sep 17 00:00:00 2001 From: Pavel Metrikov Date: Fri, 9 Feb 2024 02:47:18 -0800 Subject: [PATCH 18/68] fix score offsets --- src/metric/rank_metric.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index 97f05ccc037b..1156f03f3a3e 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -124,7 +124,7 @@ class NDCGMetric:public Metric { // calculate DCG DCGCalculator::CalDCG(eval_at_, label_ + query_boundaries_[i], - pairwise_scores_? scores_pointwise_.data(): score + query_boundaries_[i], + (pairwise_scores_? scores_pointwise_.data(): score) + query_boundaries_[i], query_boundaries_[i + 1] - query_boundaries_[i], &tmp_dcg); // calculate NDCG for (size_t j = 0; j < eval_at_.size(); ++j) { @@ -153,7 +153,7 @@ class NDCGMetric:public Metric { } // calculate DCG DCGCalculator::CalDCG(eval_at_, label_ + query_boundaries_[i], - pairwise_scores_ ? scores_pointwise_.data() : score + query_boundaries_[i], + (pairwise_scores_ ? scores_pointwise_.data() : score) + query_boundaries_[i], query_boundaries_[i + 1] - query_boundaries_[i], &tmp_dcg); // calculate NDCG for (size_t j = 0; j < eval_at_.size(); ++j) { From 0aaf090df7da6a009ff28ed2af1690dece2fc18b Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 19 Feb 2024 06:46:01 +0000 Subject: [PATCH 19/68] skip copy for weights and label if none --- src/io/metadata.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 58b4a88b0a63..089741a3668e 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -876,17 +876,21 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { // copy labels const data_size_t pointwise_num_data = query_boundaries[num_queries_]; - label_.resize(pointwise_num_data); - #pragma omp parallel for schedule(static) num_threads(num_threads) if (pointwise_num_data >= 1024) - for (data_size_t i = 0; i < pointwise_num_data; ++i) { - label_[i] = pointwise_label[i]; + if (pointwise_label != nullptr) { + label_.resize(pointwise_num_data); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (pointwise_num_data >= 1024) + for (data_size_t i = 0; i < pointwise_num_data; ++i) { + label_[i] = pointwise_label[i]; + } } // copy weights - weights_.resize(pointwise_num_data); - #pragma omp parallel for schedule(static) num_threads(num_threads) if (pointwise_num_data >= 1024) - for (data_size_t i = 0; i < pointwise_num_data; ++i) { - weights_[i] = pointwise_weights[i]; + if (pointwise_weights != nullptr) { + weights_.resize(pointwise_num_data); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (pointwise_num_data >= 1024) + for (data_size_t i = 0; i < pointwise_num_data; ++i) { + weights_[i] = pointwise_weights[i]; + } } // copy position information From 8714bfb87e1c9f387630c2a6196cc213346be99f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 29 Feb 2024 01:10:29 +0000 Subject: [PATCH 20/68] fix pairwise dataset bugs --- .../LightGBM/pairwise_ranking_feature_group.h | 25 ++++++++++++++++++- src/io/dataset.cpp | 4 +-- src/io/dataset_loader.cpp | 9 ------- src/io/metadata.cpp | 6 ++--- src/io/pairwise_lambdarank_bin.hpp | 12 ++++++--- src/io/pairwise_ranking_feature_group.cpp | 18 +++---------- src/io/sparse_bin.hpp | 3 +++ src/metric/dcg_calculator.cpp | 2 +- 8 files changed, 45 insertions(+), 34 deletions(-) diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h index 4b1f16a38350..de9675e71ad0 100644 --- a/include/LightGBM/pairwise_ranking_feature_group.h +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -72,7 +72,8 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { } inline void FinishLoad() { - // TODO(shiyu1994) + CHECK(!is_multi_val_); + bin_data_->FinishLoad(); } inline BinIterator* FeatureGroupIterator() { @@ -80,6 +81,28 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { return nullptr; } + /*! + * \brief Push one record, will auto convert to bin and push to bin data + * \param tid Thread id + * \param sub_feature_idx Index of the subfeature + * \param line_idx Index of record + * \param bin feature bin value of record + */ + inline void PushBinData(int tid, int sub_feature_idx, data_size_t line_idx, uint32_t bin) { + if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { + return; + } + if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) { + bin -= 1; + } + if (is_multi_val_) { + multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1); + } else { + bin += bin_offsets_[sub_feature_idx]; + bin_data_->Push(tid, line_idx, bin); + } + } + private: void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 4c3f7d393f52..32a8d5acb478 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -858,7 +858,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset) { for (int i = 0; i < num_groups_; ++i) { int original_group_index = i % dataset->num_groups_; int original_group_feature_start = dataset->group_feature_start_[original_group_index]; - const int is_first_or_second_in_pairing = original_group_index / dataset->num_groups_; // 0 for first, 1 for second + const int is_first_or_second_in_pairing = i / dataset->num_groups_; // 0 for first, 1 for second group_feature_start_[i] = cur_feature_index; for (int feature_index_in_group = 0; feature_index_in_group < dataset->group_feature_cnt_[original_group_index]; ++feature_index_in_group) { const BinMapper* feature_bin_mapper = dataset->FeatureBinMapper(original_group_feature_start + feature_index_in_group); @@ -869,7 +869,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset) { feature2subfeature_.push_back(dataset->feature2subfeature_[original_group_feature_start + feature_index_in_group]); cur_feature_index += 1; } - feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), num_data_, is_first_or_second_in_pairing, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_index_map())); + feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), dataset->num_data(), is_first_or_second_in_pairing, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_index_map())); num_total_bin += dataset->FeatureGroupNumBin(original_group_index); group_bin_boundaries_.push_back(num_total_bin); group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index]; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 0057c5ff1a4a..9ca807a4a269 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -293,12 +293,6 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac // need to check training data CheckDataset(dataset.get(), is_load_from_binary); - if (config_.objective == std::string("pairwise_lambdarank")) { - std::unique_ptr original_dataset(dataset.release()); - dataset.reset(new Dataset()); - dataset->CreatePairWiseRankingData(original_dataset.get()); - } - return dataset.release(); } @@ -357,9 +351,6 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); - // TODO(shiyu1994) - Log::Warning("Pairwise ranking with validation set is not supported yet."); - return dataset.release(); } diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 089741a3668e..02dc3f10c1f7 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -911,11 +911,11 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { } } - data_size_t num_pairs_in_query = 0; query_boundaries_.clear(); query_boundaries_.push_back(0); num_queries_ = 0; - for (data_size_t query_index = 0; query_index < num_queries_; ++query_index) { + for (data_size_t query_index = 0; query_index < metadata.num_queries(); ++query_index) { + data_size_t num_pairs_in_query = 0; const data_size_t query_start = query_boundaries[query_index]; const data_size_t query_end = query_boundaries[query_index + 1]; for (data_size_t item_index_i = query_start; item_index_i < query_end; ++item_index_i) { @@ -926,7 +926,7 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { } const label_t label_j = label_[item_index_j]; if (label_i != label_j) { - paired_ranking_item_index_map_.push_back(std::pair{item_index_i - query_start, item_index_j - query_start}); + paired_ranking_item_index_map_.push_back(std::pair{item_index_i, item_index_j}); ++num_pairs_in_query; ++num_data_; } diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index fed360b7f579..6fd839ed2d78 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -32,7 +32,7 @@ class PairwiseRankingFirstIterator: public BinIterator { unpaired_bin_iterator_.reset(unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin)); unpaired_bin_iterator_->Reset(0); paired_ranking_item_index_map_ = paired_ranking_item_index_map; - prev_index_ = 0; + prev_index_ = -1; prev_val_ = 0; } @@ -41,7 +41,7 @@ class PairwiseRankingFirstIterator: public BinIterator { uint32_t Get(data_size_t idx) { const data_size_t data_index = paired_ranking_item_index_map_[idx].first; if (data_index != prev_index_) { - CHECK_GT(data_index, prev_index_); + CHECK_GE(data_index, prev_index_); prev_val_ = unpaired_bin_iterator_->Get(data_index); } prev_index_ = data_index; @@ -51,7 +51,7 @@ class PairwiseRankingFirstIterator: public BinIterator { uint32_t RawGet(data_size_t idx) { const data_size_t data_index = paired_ranking_item_index_map_[idx].first; if (data_index != prev_index_) { - CHECK_GT(data_index, prev_index_); + CHECK_GE(data_index, prev_index_); prev_val_ = unpaired_bin_iterator_->RawGet(data_index); } prev_index_ = data_index; @@ -60,7 +60,7 @@ class PairwiseRankingFirstIterator: public BinIterator { void Reset(data_size_t idx) { unpaired_bin_iterator_->Reset(idx); - prev_index_ = 0; + prev_index_ = -1; prev_val_ = 0; } @@ -134,6 +134,10 @@ class PairwiseRankingBin: public BIN_TYPE { void Push(int tid, data_size_t idx, uint32_t value) override; + void FinishLoad() override { + unpaired_bin_->FinishLoad(); + } + void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override; void SaveBinaryToFile(BinaryWriter* writer) const override; diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index 19fb1c98ca34..3f7404a7f78f 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -16,22 +16,12 @@ PairwiseRankingFeatureGroup::PairwiseRankingFeatureGroup(const FeatureGroup& oth CreateBinData(num_original_data, is_multi_val_, !is_sparse_, is_sparse_); - // copy from original bin data - const int num_threads = OMP_NUM_THREADS(); - std::vector>> bin_iterators(num_threads); - for (int i = 0; i < num_threads; ++i) { - for (int j = 0; j < num_feature_; ++j) { - bin_iterators[i].emplace_back(other.SubFeatureIterator(j)); - bin_iterators[i].back()->Reset(0); - } - } - Threading::For(0, num_original_data, 512, [this, &other] (int block_index, data_size_t block_start, data_size_t block_end) { for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { std::unique_ptr bin_iterator(other.SubFeatureIterator(feature_index)); bin_iterator->Reset(block_start); for (data_size_t index = block_start; index < block_end; ++index) { - PushData(block_index, feature_index, index, bin_iterator->RawGet(index)); + PushBinData(block_index, feature_index, index, bin_iterator->Get(index)); } } }); @@ -50,7 +40,7 @@ void PairwiseRankingFeatureGroup::CreateBinData(int num_data, bool is_multi_val, multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingFirstBin( num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); } else { - multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingSecondBin( + multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingSecondBin( num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); } } else { @@ -69,14 +59,14 @@ void PairwiseRankingFeatureGroup::CreateBinData(int num_data, bool is_multi_val, (!force_dense && num_feature_ == 1 && bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { is_sparse_ = true; - if (is_first_or_second_in_pairing_) { + if (is_first_or_second_in_pairing_ == 0) { bin_data_.reset(Bin::CreateSparsePairwiseRankingFirstBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); } else { bin_data_.reset(Bin::CreateSparsePairwiseRankingSecondBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); } } else { is_sparse_ = false; - if (is_first_or_second_in_pairing_) { + if (is_first_or_second_in_pairing_ == 0) { bin_data_.reset(Bin::CreateDensePairwiseRankingFirstBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); } else { bin_data_.reset(Bin::CreateDensePairwiseRankingSecondBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index f7137d29ffd9..6c7b7c2f27b6 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -844,6 +844,9 @@ inline VAL_T SparseBinIterator::InnerRawGet(data_size_t idx) { template inline void SparseBinIterator::Reset(data_size_t start_idx) { bin_data_->InitIndex(start_idx, &i_delta_, &cur_pos_); + if (start_idx == 1920 || start_idx == 4320) { + Log::Warning("i_delta_ = %d, cur_pos_ = %d, start_idx = %d", i_delta_, cur_pos_, start_idx); + } } template diff --git a/src/metric/dcg_calculator.cpp b/src/metric/dcg_calculator.cpp index 316fdf0a6ddf..68a725f36db1 100644 --- a/src/metric/dcg_calculator.cpp +++ b/src/metric/dcg_calculator.cpp @@ -149,7 +149,7 @@ void DCGCalculator::CheckLabel(const label_t* label, data_size_t num_data) { label_t delta = std::fabs(label[i] - static_cast(label[i])); if (delta > kEpsilon) { Log::Fatal("label should be int type (met %f) for ranking task,\n" - "for the gain of label, please set the label_gain parameter", label[i]); + "for the gain of label, please set the label_gain parameter, i = %d", label[i], i); } if (label[i] < 0) { From 38b2f3e26e0cceebe8a57be9b55436c289566fa0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 29 Feb 2024 01:17:17 +0000 Subject: [PATCH 21/68] fix validation set with pairwise lambda rank --- src/application/application.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/application/application.cpp b/src/application/application.cpp index 0bb9eca13bf2..b845049f4cc0 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -118,6 +118,14 @@ void Application::LoadData() { train_data_->SaveBinaryFile(nullptr); } // create training metric + const Dataset* ref_train_data = nullptr; + if (config_.objective == std::string("pairwise_lambdarank")) { + ref_train_data = train_data_.release(); + train_data_.reset(new Dataset()); + train_data_->CreatePairWiseRankingData(ref_train_data); + } else { + ref_train_data = train_data_.get(); + } if (config_.is_provide_training_metric) { for (auto metric_type : config_.metric) { auto metric = std::unique_ptr(Metric::CreateMetric(metric_type, config_)); @@ -138,7 +146,12 @@ void Application::LoadData() { auto new_dataset = std::unique_ptr( dataset_loader.LoadFromFileAlignWithOtherDataset( config_.valid[i].c_str(), - train_data_.get())); + ref_train_data)); + if (config_.objective == std::string("pairwise_lambdarank")) { + const Dataset* original_dataset = new_dataset.release(); + new_dataset.reset(new Dataset()); + new_dataset->CreatePairWiseRankingData(original_dataset); + } valid_datas_.push_back(std::move(new_dataset)); // need save binary file if (config_.save_binary) { From ba3c8151020a83d75781ad04cbf5b213e1e5bc9b Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 29 Feb 2024 02:06:46 +0000 Subject: [PATCH 22/68] fix pairwise ranking objective initialization --- include/LightGBM/dataset.h | 20 +++++++++++++++++--- src/io/dataset.cpp | 2 +- src/io/metadata.cpp | 4 +++- src/metric/dcg_calculator.cpp | 2 +- src/objective/rank_objective.hpp | 5 +++-- 5 files changed, 25 insertions(+), 8 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 4a179319e955..aab11c683dd9 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -260,8 +260,8 @@ class Metadata { } /*! - * \brief Get the pairwise item index map in ranking with pairwise features - * \return Pointer to the pairwise item index map + * \brief Get the pairwise item index map within query in ranking with pairwise features + * \return Pointer to the pairwise item index map within query */ inline const std::pair* paired_ranking_item_index_map() const { if (!paired_ranking_item_index_map_.empty()) { @@ -271,6 +271,18 @@ class Metadata { } } + /*! + * \brief Get the pairwise item global index map in ranking with pairwise features + * \return Pointer to the pairwise item global index map + */ + inline const std::pair* paired_ranking_item_global_index_map() const { + if (!paired_ranking_item_global_index_map_.empty()) { + return paired_ranking_item_global_index_map_.data(); + } else { + return nullptr; + } + } + inline data_size_t paired_ranking_item_index_map_size() const { return static_cast(paired_ranking_item_index_map_.size()); } @@ -424,8 +436,10 @@ class Metadata { std::vector queries_; /*! \brief Mode for pairwise ranking */ PairwiseRankingMode pairwise_ranking_mode_ = PairwiseRankingMode::kRelevance; - /*! \brief Pairwise data index to original data indices for ranking with pairwise features */ + /*! \brief Pairwise data index within query to original data indices for ranking with pairwise features */ std::vector> paired_ranking_item_index_map_; + /*! \brief Pairwise global data index to original data indices for ranking with pairwise features */ + std::vector> paired_ranking_item_global_index_map_; /*! \brief mutex for threading safe call */ std::mutex mutex_; bool weight_load_from_file_; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 1e363c9ba320..f93fe851f9a8 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -869,7 +869,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset) { feature2subfeature_.push_back(dataset->feature2subfeature_[original_group_feature_start + feature_index_in_group]); cur_feature_index += 1; } - feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), dataset->num_data(), is_first_or_second_in_pairing, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_index_map())); + feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), dataset->num_data(), is_first_or_second_in_pairing, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_global_index_map())); num_total_bin += dataset->FeatureGroupNumBin(original_group_index); group_bin_boundaries_.push_back(num_total_bin); group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index]; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 02dc3f10c1f7..95a80d988539 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -863,6 +863,7 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { const label_t* pointwise_label = metadata.label(); const label_t* pointwise_weights = metadata.weights(); paired_ranking_item_index_map_.clear(); + paired_ranking_item_global_index_map_.clear(); const data_size_t* query_boundaries = metadata.query_boundaries(); // backup pointwise query boundaries @@ -926,7 +927,8 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { } const label_t label_j = label_[item_index_j]; if (label_i != label_j) { - paired_ranking_item_index_map_.push_back(std::pair{item_index_i, item_index_j}); + paired_ranking_item_index_map_.push_back(std::pair{item_index_i - query_start, item_index_j - query_start}); + paired_ranking_item_global_index_map_.push_back(std::pair{item_index_i, item_index_j}); ++num_pairs_in_query; ++num_data_; } diff --git a/src/metric/dcg_calculator.cpp b/src/metric/dcg_calculator.cpp index 68a725f36db1..316fdf0a6ddf 100644 --- a/src/metric/dcg_calculator.cpp +++ b/src/metric/dcg_calculator.cpp @@ -149,7 +149,7 @@ void DCGCalculator::CheckLabel(const label_t* label, data_size_t num_data) { label_t delta = std::fabs(label[i] - static_cast(label[i])); if (delta > kEpsilon) { Log::Fatal("label should be int type (met %f) for ranking task,\n" - "for the gain of label, please set the label_gain parameter, i = %d", label[i], i); + "for the gain of label, please set the label_gain parameter", label[i]); } if (label[i] < 0) { diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 19e089c56fa8..bcaa0d98f73b 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -560,12 +560,13 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { ~PairwiseLambdarankNDCG() {} void Init(const Metadata& metadata, data_size_t num_data) override { - LambdarankNDCG::Init(metadata, num_data); query_boundaries_pointwise_ = metadata.pointwise_query_boundaries(); if (query_boundaries_pointwise_ == nullptr) { Log::Fatal("Ranking tasks require query information"); } - num_data_pointwise_ = query_boundaries_pointwise_[num_queries_]; + num_data_pointwise_ = query_boundaries_pointwise_[metadata.num_queries()]; + LambdarankNDCG::Init(metadata, num_data_pointwise_); + num_data_ = num_data; paired_index_map_ = metadata.paired_ranking_item_index_map(); scores_pointwise_.resize(num_data_pointwise_, 0.0); } From d9b537d81b04c7f2b20c7fcd3ff414a2cb5d2c68 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 29 Feb 2024 09:02:44 +0000 Subject: [PATCH 23/68] keep the original query boundaries and add pairwise query boundaries --- include/LightGBM/dataset.h | 8 ++++---- src/io/metadata.cpp | 12 ++++++------ src/metric/rank_metric.hpp | 6 +++--- src/objective/rank_objective.hpp | 3 ++- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index aab11c683dd9..3462e75d8bab 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -307,9 +307,9 @@ class Metadata { * \brief Used in pairwise ranking. Pointwise query boundaries. * \return Pointer of data boundaries on queries */ - inline const data_size_t* pointwise_query_boundaries() const { - if (!pointwise_query_boundaries_.empty()) { - return pointwise_query_boundaries_.data(); + inline const data_size_t* pairwise_query_boundaries() const { + if (!pairwise_query_boundaries_.empty()) { + return pairwise_query_boundaries_.data(); } else { return nullptr; } @@ -423,7 +423,7 @@ class Metadata { /*! \brief Query boundaries */ std::vector query_boundaries_; /*! \brief Original query boundaries, used in pairwise ranking */ - std::vector pointwise_query_boundaries_; + std::vector pairwise_query_boundaries_; /*! \brief Query weights */ std::vector query_weights_; /*! \brief Number of querys */ diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 95a80d988539..ae01b92fc47c 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -867,12 +867,12 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { const data_size_t* query_boundaries = metadata.query_boundaries(); // backup pointwise query boundaries - pointwise_query_boundaries_.clear(); - pointwise_query_boundaries_.resize(num_queries_); + query_boundaries_.clear(); + query_boundaries_.resize(num_queries_); const int num_threads = OMP_NUM_THREADS(); #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_queries_ >= 1024) for (data_size_t i = 0; i < num_queries_; ++i) { - pointwise_query_boundaries_[i] = query_boundaries[i]; + query_boundaries_[i] = query_boundaries[i]; } // copy labels @@ -912,8 +912,8 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { } } - query_boundaries_.clear(); - query_boundaries_.push_back(0); + pairwise_query_boundaries_.clear(); + pairwise_query_boundaries_.push_back(0); num_queries_ = 0; for (data_size_t query_index = 0; query_index < metadata.num_queries(); ++query_index) { data_size_t num_pairs_in_query = 0; @@ -935,7 +935,7 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { } } if (num_pairs_in_query > 0) { - query_boundaries_.push_back(num_pairs_in_query); + pairwise_query_boundaries_.push_back(num_pairs_in_query); ++num_queries_; } } diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index 1156f03f3a3e..8a1d05d4cf22 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -39,14 +39,14 @@ class NDCGMetric:public Metric { for (auto k : eval_at_) { name_.emplace_back(std::string("ndcg@") + std::to_string(k)); } - num_data_ = pairwise_scores_? metadata.pointwise_query_boundaries()[metadata.num_queries()] : num_data; + num_data_ = pairwise_scores_? metadata.query_boundaries()[metadata.num_queries()] : num_data; // get label label_ = metadata.label(); num_queries_ = metadata.num_queries(); DCGCalculator::CheckMetadata(metadata, num_queries_); DCGCalculator::CheckLabel(label_, num_data_); // get query boundaries - query_boundaries_ = pairwise_scores_? metadata.pointwise_query_boundaries() : metadata.query_boundaries(); + query_boundaries_ = metadata.query_boundaries(); if (query_boundaries_ == nullptr) { Log::Fatal("The NDCG metric requires query information"); } @@ -82,7 +82,7 @@ class NDCGMetric:public Metric { paired_index_map_ = metadata.paired_ranking_item_index_map(); scores_pointwise_.resize(num_data_, 0.0); num_data_pairwise_ = num_data; - query_boundaries_pairwise_ = metadata.query_boundaries(); + query_boundaries_pairwise_ = metadata.pairwise_query_boundaries(); } } diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index bcaa0d98f73b..5b0c92fb8f49 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -560,12 +560,13 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { ~PairwiseLambdarankNDCG() {} void Init(const Metadata& metadata, data_size_t num_data) override { - query_boundaries_pointwise_ = metadata.pointwise_query_boundaries(); + query_boundaries_pointwise_ = metadata.query_boundaries(); if (query_boundaries_pointwise_ == nullptr) { Log::Fatal("Ranking tasks require query information"); } num_data_pointwise_ = query_boundaries_pointwise_[metadata.num_queries()]; LambdarankNDCG::Init(metadata, num_data_pointwise_); + query_boundaries_ = metadata.pairwise_query_boundaries(); num_data_ = num_data; paired_index_map_ = metadata.paired_ranking_item_index_map(); scores_pointwise_.resize(num_data_pointwise_, 0.0); From 362baf89682e6410aa607cd40b94ae49499b7ff0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 1 Mar 2024 07:42:39 +0000 Subject: [PATCH 24/68] allow empty queries in pairwise query boundaries --- src/io/metadata.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index ae01b92fc47c..ea795e892e1c 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -914,7 +914,7 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { pairwise_query_boundaries_.clear(); pairwise_query_boundaries_.push_back(0); - num_queries_ = 0; + num_queries_ = metadata.num_queries(); for (data_size_t query_index = 0; query_index < metadata.num_queries(); ++query_index) { data_size_t num_pairs_in_query = 0; const data_size_t query_start = query_boundaries[query_index]; @@ -934,10 +934,7 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { } } } - if (num_pairs_in_query > 0) { - pairwise_query_boundaries_.push_back(num_pairs_in_query); - ++num_queries_; - } + pairwise_query_boundaries_.push_back(num_pairs_in_query); } } else { // TODO(shiyu1994) From 06597ac02d6f0f2be8726799a549b942d157abf4 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 1 Mar 2024 08:48:31 +0000 Subject: [PATCH 25/68] fix query boundaries --- src/io/metadata.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index ea795e892e1c..8dd0932d2d2c 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -854,7 +854,6 @@ size_t Metadata::SizesInByte() const { } data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { - num_data_ = 0; num_queries_ = metadata.num_queries(); label_.clear(); positions_.clear(); @@ -868,10 +867,10 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { // backup pointwise query boundaries query_boundaries_.clear(); - query_boundaries_.resize(num_queries_); + query_boundaries_.resize(num_queries_ + 1); const int num_threads = OMP_NUM_THREADS(); #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_queries_ >= 1024) - for (data_size_t i = 0; i < num_queries_; ++i) { + for (data_size_t i = 0; i < num_queries_ + 1; ++i) { query_boundaries_[i] = query_boundaries[i]; } @@ -914,9 +913,8 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { pairwise_query_boundaries_.clear(); pairwise_query_boundaries_.push_back(0); - num_queries_ = metadata.num_queries(); + num_data_ = 0; for (data_size_t query_index = 0; query_index < metadata.num_queries(); ++query_index) { - data_size_t num_pairs_in_query = 0; const data_size_t query_start = query_boundaries[query_index]; const data_size_t query_end = query_boundaries[query_index + 1]; for (data_size_t item_index_i = query_start; item_index_i < query_end; ++item_index_i) { @@ -929,12 +927,11 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { if (label_i != label_j) { paired_ranking_item_index_map_.push_back(std::pair{item_index_i - query_start, item_index_j - query_start}); paired_ranking_item_global_index_map_.push_back(std::pair{item_index_i, item_index_j}); - ++num_pairs_in_query; ++num_data_; } } } - pairwise_query_boundaries_.push_back(num_pairs_in_query); + pairwise_query_boundaries_.push_back(num_data_); } } else { // TODO(shiyu1994) From 18e3a1b77a791647036a8522493f93b5f42f6283 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 1 Mar 2024 09:09:53 +0000 Subject: [PATCH 26/68] clean up --- src/io/sparse_bin.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index 6c7b7c2f27b6..f7137d29ffd9 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -844,9 +844,6 @@ inline VAL_T SparseBinIterator::InnerRawGet(data_size_t idx) { template inline void SparseBinIterator::Reset(data_size_t start_idx) { bin_data_->InitIndex(start_idx, &i_delta_, &cur_pos_); - if (start_idx == 1920 || start_idx == 4320) { - Log::Warning("i_delta_ = %d, cur_pos_ = %d, start_idx = %d", i_delta_, cur_pos_, start_idx); - } } template From 43b85823dbaab2bd50c9b136118d3e619500f0b6 Mon Sep 17 00:00:00 2001 From: Pavel Metrikov Date: Fri, 1 Mar 2024 01:18:50 -0800 Subject: [PATCH 27/68] various fixes --- src/metric/rank_metric.hpp | 8 +- src/objective/rank_objective.hpp | 151 ++++++++++++++++--------------- 2 files changed, 81 insertions(+), 78 deletions(-) diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index 8a1d05d4cf22..818e45e46c4b 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -39,7 +39,7 @@ class NDCGMetric:public Metric { for (auto k : eval_at_) { name_.emplace_back(std::string("ndcg@") + std::to_string(k)); } - num_data_ = pairwise_scores_? metadata.query_boundaries()[metadata.num_queries()] : num_data; + num_data_ = metadata.query_boundaries()[metadata.num_queries()]; // get label label_ = metadata.label(); num_queries_ = metadata.num_queries(); @@ -81,7 +81,7 @@ class NDCGMetric:public Metric { if (pairwise_scores_) { paired_index_map_ = metadata.paired_ranking_item_index_map(); scores_pointwise_.resize(num_data_, 0.0); - num_data_pairwise_ = num_data; + num_data_pairwise_ = metadata.pairwise_query_boundaries()[metadata.num_queries()]; query_boundaries_pairwise_ = metadata.pairwise_query_boundaries(); } } @@ -116,7 +116,7 @@ class NDCGMetric:public Metric { const data_size_t start_pointwise = query_boundaries_[i]; const data_size_t cnt_pointwise = query_boundaries_[i + 1] - query_boundaries_[i]; const data_size_t start_pairwise = query_boundaries_pairwise_[i]; - const data_size_t cnt_pairwise = query_boundaries_[i + 1] - query_boundaries_[i]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[i + 1] - query_boundaries_pairwise_[i]; std::vector all_pairs(cnt_pairwise); std::iota(all_pairs.begin(), all_pairs.end(), 0); UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), paired_index_map_ + start_pairwise, truncation_level_, sigmoid_); @@ -146,7 +146,7 @@ class NDCGMetric:public Metric { const data_size_t start_pointwise = query_boundaries_[i]; const data_size_t cnt_pointwise = query_boundaries_[i + 1] - query_boundaries_[i]; const data_size_t start_pairwise = query_boundaries_pairwise_[i]; - const data_size_t cnt_pairwise = query_boundaries_[i + 1] - query_boundaries_[i]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[i + 1] - query_boundaries_pairwise_[i]; std::vector all_pairs(cnt_pairwise); std::iota(all_pairs.begin(), all_pairs.end(), 0); UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), paired_index_map_ + start_pairwise, truncation_level_, sigmoid_); diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 5b0c92fb8f49..280ba9fee59a 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -26,7 +26,7 @@ namespace LightGBM { - void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointwise, const double* score, data_size_t cnt_pointwise, + void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointwise, const double* score_pairwise, data_size_t cnt_pointwise, data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, int truncation_level, double sigma) { // get sorted indices for scores std::vector sorted_idx(cnt_pointwise); @@ -62,12 +62,12 @@ namespace LightGBM { int indexRight = paired_index_map[current_pair].second; if (ranks[indexLeft] >= truncation_level && ranks[indexRight] >= truncation_level) { continue; } - double delta_score = score[current_pair]; + double delta_score = score_pairwise[current_pair]; int comparisons = 1; data_size_t current_pair_inverse = -1; if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexLeft)) > 0) { current_pair_inverse = mapLeftRight2Pair.at(std::make_pair(indexRight, indexLeft)); - delta_score -= score[current_pair_inverse]; + delta_score -= score_pairwise[current_pair_inverse]; comparisons++; } if (model_indirect_comparisons_) { @@ -78,7 +78,7 @@ namespace LightGBM { (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { data_size_t indexHeadLeft = mapLeftRight2Pair.at(std::make_pair(indexHead, indexLeft)); data_size_t indexHeadRight = mapLeftRight2Pair.at(std::make_pair(indexHead, indexRight)); - delta_score += score[indexHeadRight] - score[indexHeadLeft]; + delta_score += score_pairwise[indexHeadRight] - score_pairwise[indexHeadLeft]; comparisons++; } } @@ -90,7 +90,7 @@ namespace LightGBM { (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { data_size_t indexLeftTail = mapLeftRight2Pair.at(std::make_pair(indexLeft, indexTail)); data_size_t indexRightTail = mapLeftRight2Pair.at(std::make_pair(indexRight, indexTail)); - delta_score += score[indexLeftTail] - score[indexRightTail]; + delta_score += score_pairwise[indexLeftTail] - score_pairwise[indexRightTail]; comparisons++; } } @@ -106,10 +106,10 @@ namespace LightGBM { //double p_lr_pointwise = GetSigmoid(delta_score_pointwise); double p_lr_pointwise = 1.0f / (1.0f + std::exp(-delta_score_pointwise * sigma)); double p_rl_pointwise = 1.0 - p_lr_pointwise; - gradients[indexLeft] += paired_discount * (p_rl_pointwise - p_rl); - hessians[indexLeft] += paired_discount * p_rl_pointwise * p_lr_pointwise; - gradients[indexRight] -= paired_discount * (p_rl_pointwise - p_rl); - hessians[indexRight] += paired_discount * p_rl_pointwise * p_lr_pointwise; + gradients[indexLeft] += sigma * paired_discount * (p_rl_pointwise - p_rl); + hessians[indexLeft] += sigma * sigma * paired_discount * p_rl_pointwise * p_lr_pointwise; + gradients[indexRight] -= sigma * paired_discount * (p_rl_pointwise - p_rl); + hessians[indexRight] += sigma * sigma * paired_discount * p_rl_pointwise * p_lr_pointwise; } for (data_size_t i = 0; i < cnt_pointwise; i++) { @@ -559,48 +559,49 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { ~PairwiseLambdarankNDCG() {} - void Init(const Metadata& metadata, data_size_t num_data) override { - query_boundaries_pointwise_ = metadata.query_boundaries(); - if (query_boundaries_pointwise_ == nullptr) { + void Init(const Metadata& metadata, data_size_t num_data_pairwise) override { + data_size_t num_data_pointwise = metadata.query_boundaries()[metadata.num_queries()]; + Log::Info("!!! num_data_pointwise %d", num_data_pointwise); + LambdarankNDCG::Init(metadata, num_data_pointwise); + num_data_pairwise_ = num_data_pairwise; + query_boundaries_pairwise_ = metadata.pairwise_query_boundaries(); + if (query_boundaries_pairwise_ == nullptr) { Log::Fatal("Ranking tasks require query information"); } - num_data_pointwise_ = query_boundaries_pointwise_[metadata.num_queries()]; - LambdarankNDCG::Init(metadata, num_data_pointwise_); - query_boundaries_ = metadata.pairwise_query_boundaries(); - num_data_ = num_data; paired_index_map_ = metadata.paired_ranking_item_index_map(); - scores_pointwise_.resize(num_data_pointwise_, 0.0); + scores_pointwise_.resize(num_data_pointwise, 0.0); } - void GetGradients(const double* score, score_t* gradients, - score_t* hessians) const override { + void GetGradients(const double* score_pairwise, score_t* gradients_pairwise, + score_t* hessians_pairwise) const override { #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) for (data_size_t i = 0; i < num_queries_; ++i) { - const data_size_t start_pointwise = query_boundaries_pointwise_[i]; - const data_size_t cnt_pointwise = query_boundaries_pointwise_[i + 1] - query_boundaries_pointwise_[i]; - const data_size_t start = query_boundaries_[i]; - const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i]; - std::vector score_adjusted; + const data_size_t start_pointwise = query_boundaries_[i]; + const data_size_t cnt_pointwise = query_boundaries_[i + 1] - query_boundaries_[i]; + const data_size_t start_pairwise = query_boundaries_pairwise_[i]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[i + 1] - query_boundaries_pairwise_[i]; + std::vector score_adjusted_pairwise; if (num_position_ids_ > 0) { - for (data_size_t j = 0; j < cnt; ++j) { - score_adjusted.push_back(score[start + j] + pos_biases_[positions_[start_pointwise + paired_index_map_[start + j].first]] - pos_biases_[positions_[start_pointwise + paired_index_map_[start + j].second]]); + for (data_size_t j = 0; j < cnt_pairwise; ++j) { + score_adjusted_pairwise.push_back(score_pairwise[start_pairwise + j] + pos_biases_[positions_[start_pointwise + paired_index_map_[start_pairwise + j].first]] - + pos_biases_[positions_[start_pointwise + paired_index_map_[start_pairwise + j].second]]); } } - GetGradientsForOneQuery(i, cnt_pointwise, cnt, label_ + start_pointwise, scores_pointwise_.data(), num_position_ids_ > 0 ? score_adjusted.data() : score + start, - gradients + start, hessians + start); - - std::vector all_pairs(cnt); + GetGradientsForOneQuery(i, cnt_pointwise, cnt_pairwise, label_ + start_pointwise, scores_pointwise_.data(), num_position_ids_ > 0 ? score_adjusted_pairwise.data() : score_pairwise + start_pairwise, + gradients_pairwise + start_pairwise, hessians_pairwise + start_pairwise); + std::vector all_pairs(cnt_pairwise); std::iota(all_pairs.begin(), all_pairs.end(), 0); - UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start, cnt_pointwise, cnt, all_pairs.data(), paired_index_map_ + start, truncation_level_, sigmoid_); + UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score_pairwise + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), + paired_index_map_ + start_pairwise, truncation_level_, sigmoid_); } if (num_position_ids_ > 0) { - std::vector gradients_pointwise(num_data_pointwise_); - std::vector hessians_pointwise(num_data_pointwise_); + std::vector gradients_pointwise(num_data_); + std::vector hessians_pointwise(num_data_); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) for (data_size_t i = 0; i < num_queries_; ++i) { - const data_size_t cnt_pointwise = query_boundaries_pointwise_[i + 1] - query_boundaries_pointwise_[i]; - const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i]; - TransformGradientsPairwiseIntoPointwiseForOneQuery(i, cnt_pointwise, cnt, gradients, hessians, gradients_pointwise.data(), hessians_pointwise.data()); + const data_size_t cnt_pointwise = query_boundaries_[i + 1] - query_boundaries_[i]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[i + 1] - query_boundaries_pairwise_[i]; + TransformGradientsPairwiseIntoPointwiseForOneQuery(i, cnt_pointwise, cnt_pairwise, gradients_pairwise, hessians_pairwise, gradients_pointwise.data(), hessians_pointwise.data()); } UpdatePositionBiasFactors(gradients_pointwise.data(), hessians_pointwise.data()); } @@ -625,20 +626,20 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { } - inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt_pointwise, data_size_t cnt, - const label_t* label, const double* score_pointwise, const double* score, - score_t* lambdas, - score_t* hessians) const { + inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt_pointwise, data_size_t cnt_pairwise, + const label_t* label, const double* score_pointwise, const double* score_pairwise, + score_t* lambdas_pairwise, + score_t* hessians_pairwise) const { - const data_size_t start_pointwise = query_boundaries_pointwise_[query_id]; - const data_size_t start = query_boundaries_[query_id]; + const data_size_t start_pointwise = query_boundaries_[query_id]; + const data_size_t start_pairwise = query_boundaries_pairwise_[query_id]; // get max DCG on current query const double inverse_max_dcg = inverse_max_dcgs_[query_id]; // initialize with zero - for (data_size_t i = 0; i < cnt; ++i) { - lambdas[i] = 0.0f; - hessians[i] = 0.0f; + for (data_size_t i = 0; i < cnt_pairwise; ++i) { + lambdas_pairwise[i] = 0.0f; + hessians_pairwise[i] = 0.0f; } // get sorted indices for scores std::vector sorted_idx(cnt_pointwise); @@ -665,9 +666,9 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { std::multimap mapLeft2Right; std::map, data_size_t> mapLeftRight2Pair; - for (data_size_t i = 0; i < cnt; ++i) { - int indexLeft = paired_index_map_[i + start].first; - int indexRight = paired_index_map_[i + start].second; + for (data_size_t i = 0; i < cnt_pairwise; ++i) { + int indexLeft = paired_index_map_[i + start_pairwise].first; + int indexRight = paired_index_map_[i + start_pairwise].second; mapRight2Left.insert(std::make_pair(indexRight, indexLeft)); mapLeft2Right.insert(std::make_pair(indexLeft, indexRight)); mapLeftRight2Pair.insert(std::make_pair(std::make_pair(indexLeft, indexRight), i)); @@ -675,9 +676,9 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { double sum_lambdas = 0.0; // start accmulate lambdas by pairs - for (data_size_t i = 0; i < cnt; i++) { - int indexLeft = paired_index_map_[i + start].first; - int indexRight = paired_index_map_[i + start].second; + for (data_size_t i = 0; i < cnt_pairwise; i++) { + int indexLeft = paired_index_map_[i + start_pairwise].first; + int indexRight = paired_index_map_[i + start_pairwise].second; if (label[indexLeft] <= label[indexRight] || (ranks[indexLeft] >= truncation_level_ && ranks[indexRight] >= truncation_level_)) { continue; @@ -693,13 +694,13 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { const int low_label = static_cast(label[low]); const double low_label_gain = label_gain_[low_label]; const double low_discount = DCGCalculator::GetDiscount(low_rank); - double delta_score = score[i]; + double delta_score = score_pairwise[i]; int comparisons = 1; data_size_t i_inverse = -1; if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexLeft)) > 0) { i_inverse = mapLeftRight2Pair.at(std::make_pair(indexRight, indexLeft)); - delta_score -= score[i_inverse]; + delta_score -= score_pairwise[i_inverse]; comparisons++; } if (model_indirect_comparisons_) { @@ -710,7 +711,7 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { data_size_t indexHeadLeft = mapLeftRight2Pair.at(std::make_pair(indexHead, indexLeft)); data_size_t indexHeadRight = mapLeftRight2Pair.at(std::make_pair(indexHead, indexRight)); - delta_score += score[indexHeadRight] - score[indexHeadLeft]; + delta_score += score_pairwise[indexHeadRight] - score_pairwise[indexHeadLeft]; comparisons++; } } @@ -722,7 +723,7 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { data_size_t indexLeftTail = mapLeftRight2Pair.at(std::make_pair(indexLeft, indexTail)); data_size_t indexRightTail = mapLeftRight2Pair.at(std::make_pair(indexRight, indexTail)); - delta_score += score[indexLeftTail] - score[indexRightTail]; + delta_score += score_pairwise[indexLeftTail] - score_pairwise[indexRightTail]; comparisons++; } } @@ -751,11 +752,11 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { p_lambda *= weights_[start_pointwise + high] * weights_[start_pointwise + low]; p_hessian *= weights_[start_pointwise + high] * weights_[start_pointwise + low]; } - lambdas[i] += static_cast(p_lambda / comparisons); - hessians[i] += static_cast(p_hessian / comparisons); + lambdas_pairwise[i] += static_cast(p_lambda / comparisons); + hessians_pairwise[i] += static_cast(p_hessian / comparisons); if (i_inverse >= 0) { - lambdas[i_inverse] -= static_cast(p_lambda / comparisons); - hessians[i_inverse] += static_cast(p_hessian / comparisons); + lambdas_pairwise[i_inverse] -= static_cast(p_lambda / comparisons); + hessians_pairwise[i_inverse] += static_cast(p_hessian / comparisons); } if (model_indirect_comparisons_) { auto indexHead_range = mapRight2Left.equal_range(indexLeft); @@ -765,10 +766,10 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { data_size_t indexHeadLeft = mapLeftRight2Pair.at(std::make_pair(indexHead, indexLeft)); data_size_t indexHeadRight = mapLeftRight2Pair.at(std::make_pair(indexHead, indexRight)); - lambdas[indexHeadRight] += static_cast(p_lambda / comparisons); - hessians[indexHeadRight] += static_cast(p_hessian / comparisons); - lambdas[indexHeadLeft] -= static_cast(p_lambda / comparisons); - hessians[indexHeadLeft] += static_cast(p_hessian / comparisons); + lambdas_pairwise[indexHeadRight] += static_cast(p_lambda / comparisons); + hessians_pairwise[indexHeadRight] += static_cast(p_hessian / comparisons); + lambdas_pairwise[indexHeadLeft] -= static_cast(p_lambda / comparisons); + hessians_pairwise[indexHeadLeft] += static_cast(p_hessian / comparisons); } } auto indexTail_range = mapLeft2Right.equal_range(indexLeft); @@ -779,10 +780,10 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { data_size_t indexLeftTail = mapLeftRight2Pair.at(std::make_pair(indexLeft, indexTail)); data_size_t indexRightTail = mapLeftRight2Pair.at(std::make_pair(indexRight, indexTail)); - lambdas[indexLeftTail] += static_cast(p_lambda / comparisons); - hessians[indexLeftTail] += static_cast(p_hessian / comparisons); - lambdas[indexRightTail] -= static_cast(p_lambda / comparisons); - hessians[indexRightTail] += static_cast(p_hessian / comparisons); + lambdas_pairwise[indexLeftTail] += static_cast(p_lambda / comparisons); + hessians_pairwise[indexLeftTail] += static_cast(p_hessian / comparisons); + lambdas_pairwise[indexRightTail] -= static_cast(p_lambda / comparisons); + hessians_pairwise[indexRightTail] += static_cast(p_hessian / comparisons); } } } @@ -792,9 +793,9 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { if (norm_ && sum_lambdas > 0) { double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; - for (data_size_t i = 0; i < cnt; ++i) { - lambdas[i] = static_cast(lambdas[i] * norm_factor); - hessians[i] = static_cast(hessians[i] * norm_factor); + for (data_size_t i = 0; i < cnt_pairwise; ++i) { + lambdas_pairwise[i] = static_cast(lambdas_pairwise[i] * norm_factor); + hessians_pairwise[i] = static_cast(hessians_pairwise[i] * norm_factor); } } } @@ -814,11 +815,13 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { } } + const char* GetName() const override { return "pairwise_lambdarank"; } + protected: - /*! \brief Query boundaries for pointwise data instances */ - const data_size_t* query_boundaries_pointwise_; - /*! \brief Number of pointwise data */ - data_size_t num_data_pointwise_; + /*! \brief Query boundaries for pairwise data instances */ + const data_size_t* query_boundaries_pairwise_; + /*! \brief Number of pairwise data */ + data_size_t num_data_pairwise_; mutable std::vector scores_pointwise_; private: From ad4e89f6bb684837d4d0859156ed8dacd5459480 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 1 Mar 2024 09:19:25 +0000 Subject: [PATCH 28/68] construct all pairs for validation set --- include/LightGBM/dataset.h | 5 +++-- src/application/application.cpp | 4 ++-- src/io/dataset.cpp | 4 ++-- src/io/metadata.cpp | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 3462e75d8bab..f37c765815c9 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -207,9 +207,10 @@ class Metadata { /*! * \brief Build metadata for ranking with pairwise features from metadata of an existing ranking dataset * \param metadata Reference to metadata of the existing ranking dataset + * \param is_validation Whether the dataset is a validation set * \return The number of paired data */ - data_size_t BuildPairwiseFeatureRanking(const Metadata& metadata); + data_size_t BuildPairwiseFeatureRanking(const Metadata& metadata, const bool is_validation); /*! * \brief Perform any extra operations after all data has been loaded @@ -757,7 +758,7 @@ class Dataset { LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset); - LIGHTGBM_EXPORT void CreatePairWiseRankingData(const Dataset* dataset); + LIGHTGBM_EXPORT void CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation); void InitTrain(const std::vector& is_feature_used, TrainingShareStates* share_state) const; diff --git a/src/application/application.cpp b/src/application/application.cpp index b845049f4cc0..a1c83df55177 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -122,7 +122,7 @@ void Application::LoadData() { if (config_.objective == std::string("pairwise_lambdarank")) { ref_train_data = train_data_.release(); train_data_.reset(new Dataset()); - train_data_->CreatePairWiseRankingData(ref_train_data); + train_data_->CreatePairWiseRankingData(ref_train_data, false); } else { ref_train_data = train_data_.get(); } @@ -150,7 +150,7 @@ void Application::LoadData() { if (config_.objective == std::string("pairwise_lambdarank")) { const Dataset* original_dataset = new_dataset.release(); new_dataset.reset(new Dataset()); - new_dataset->CreatePairWiseRankingData(original_dataset); + new_dataset->CreatePairWiseRankingData(original_dataset, true); } valid_datas_.push_back(std::move(new_dataset)); // need save binary file diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index f93fe851f9a8..03b482195b19 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -823,8 +823,8 @@ void Dataset::CreateValid(const Dataset* dataset) { gpu_device_id_ = dataset->gpu_device_id_; } -void Dataset::CreatePairWiseRankingData(const Dataset* dataset) { - num_data_ = metadata_.BuildPairwiseFeatureRanking(dataset->metadata()); +void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation) { + num_data_ = metadata_.BuildPairwiseFeatureRanking(dataset->metadata(), is_validation); feature_groups_.clear(); num_features_ = dataset->num_features_ * 2; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 8dd0932d2d2c..95a9e6aeda8c 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -853,7 +853,7 @@ size_t Metadata::SizesInByte() const { return size; } -data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { +data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata, const bool is_validation) { num_queries_ = metadata.num_queries(); label_.clear(); positions_.clear(); @@ -920,7 +920,7 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata) { for (data_size_t item_index_i = query_start; item_index_i < query_end; ++item_index_i) { const label_t label_i = label_[item_index_i]; for (data_size_t item_index_j = query_start; item_index_j < query_end; ++item_index_j) { - if (item_index_i == item_index_j) { + if (item_index_i == item_index_j && !is_validation) { continue; } const label_t label_j = label_[item_index_j]; From 1ad78b2080f26f464c827c3502696550dce6fe65 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 1 Mar 2024 09:35:27 +0000 Subject: [PATCH 29/68] fix for validation set --- src/io/metadata.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 95a9e6aeda8c..3661e236cbd9 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -920,11 +920,11 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata, cons for (data_size_t item_index_i = query_start; item_index_i < query_end; ++item_index_i) { const label_t label_i = label_[item_index_i]; for (data_size_t item_index_j = query_start; item_index_j < query_end; ++item_index_j) { - if (item_index_i == item_index_j && !is_validation) { + if (item_index_i == item_index_j) { continue; } const label_t label_j = label_[item_index_j]; - if (label_i != label_j) { + if (label_i != label_j && !is_validation) { paired_ranking_item_index_map_.push_back(std::pair{item_index_i - query_start, item_index_j - query_start}); paired_ranking_item_global_index_map_.push_back(std::pair{item_index_i, item_index_j}); ++num_data_; From 9cd3b93a25da14bd43604d9b27ac578e31ab16e4 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 1 Mar 2024 10:43:29 +0000 Subject: [PATCH 30/68] fix validation pairs --- src/io/metadata.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 3661e236cbd9..0d546cc37106 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -924,7 +924,7 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata, cons continue; } const label_t label_j = label_[item_index_j]; - if (label_i != label_j && !is_validation) { + if (label_i != label_j || is_validation) { paired_ranking_item_index_map_.push_back(std::pair{item_index_i - query_start, item_index_j - query_start}); paired_ranking_item_global_index_map_.push_back(std::pair{item_index_i, item_index_j}); ++num_data_; From f9d9c075ab5ff82a4f6ceed409d5941d71afe2f4 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 1 Mar 2024 11:04:20 +0000 Subject: [PATCH 31/68] fatal error when no query boundary is provided --- src/io/metadata.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 0d546cc37106..0c0494af0ca9 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -865,6 +865,10 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata, cons paired_ranking_item_global_index_map_.clear(); const data_size_t* query_boundaries = metadata.query_boundaries(); + if (query_boundaries == nullptr) { + Log::Fatal("Query boundaries must be provided for ranking."); + } + // backup pointwise query boundaries query_boundaries_.clear(); query_boundaries_.resize(num_queries_ + 1); From 746bc827174f3c38a4f65b1cc2704578c63101a7 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 8 Mar 2024 08:09:44 +0000 Subject: [PATCH 32/68] add differential features --- include/LightGBM/dataset_loader.h | 16 ++++++++ src/io/dataset_loader.cpp | 66 +++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 73b8e7bfd071..68fba1307bf1 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -82,6 +82,22 @@ class DatasetLoader { */ void CheckCategoricalFeatureNumBin(const std::vector>& bin_mappers, const int max_bin, const std::vector& max_bin_by_feature) const; + /*! \brief Create differential features for pairwise lambdarank + * \param sample_values sampled values from the file + * \param sample_indices sampled data indices from the file + * \param bin_mappers bin mappers of the original features + * \param filter_cnt filter count for bin finding + * \param num_total_sample_data number of all sampled data + * \param differential_feature_bin_mappers output differential feature bin mapppers + */ + void CreatePairwiseRankingDifferentialFeatures( + const std::vector>& sample_values, + const std::vector>& sample_indices, + const std::vector>& bin_mappers, + const data_size_t filter_cnt, + const data_size_t num_total_sample_data, + std::vector>* differential_feature_bin_mappers) const; + const Config& config_; /*! \brief Random generator*/ Random random_; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 9ca807a4a269..ade885433c47 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -1163,6 +1163,9 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, OMP_LOOP_EX_END(); } OMP_THROW_EX(); + + if () {} + } else { // start and len will store the process feature indices for different machines // machine i will find bins for features in [ start[i], start[i] + len[i] ) @@ -1567,4 +1570,67 @@ void DatasetLoader::CheckCategoricalFeatureNumBin( } } +void DatasetLoader::CreatePairwiseRankingDifferentialFeatures( + const std::vector>& sample_values, + const std::vector>& sample_indices, + const std::vector>& bin_mappers, + const data_size_t filter_cnt, + const data_size_t num_total_sample_data, + std::vector>* differential_feature_bin_mappers) const { + const int num_original_features = static_cast(sample_values.size()); + std::vector numerical_feature_indices; + for (int i = 0; i < num_original_features; ++i) { + if (bin_mappers[i] != nullptr && bin_mappers[i]->bin_type() == BinType::NumericalBin) { + numerical_feature_indices.push_back(i); + } + } + const int num_numerical_features = static_cast(numerical_feature_indices.size()); + std::vector> sampled_differential_values(num_original_features); + differential_feature_bin_mappers->resize(num_numerical_features, nullptr); + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int i = 0; i < num_numerical_features; ++i) { + const int feature_index = numerical_feature_indices[i]; + const data_size_t num_samples_for_feature = static_cast(sample_values[feature_index].size()); + if (config_.zero_as_missing) { + for (int j = 0; j < num_samples_for_feature; ++j) { + const double value = sample_values[feature_index][j]; + for (int k = j + 1; k < num_samples_for_feature; ++k) { + const double diff_value = value - sample_values[feature_index][k]; + sampled_differential_values[i].push_back(diff_value); + } + } + } else { + CHECK_GT(sample_indices[feature_index].size(), 0); + int cur_pos_j = 0; + for (int j = 0; j < sample_indices[feature_index].back() + 1; ++j) { + double value_j = 0.0; + if (j == sample_indices[feature_index][cur_pos_j]) { + value_j = sample_values[feature_index][cur_pos_j]; + ++cur_pos_j; + } + int cur_pos_k = 0; + for (int k = 0; k < sample_indices[feature_index].back() + 1; ++k) { + double value_k = 0.0; + if (k == sample_indices[feature_index][cur_pos_k]) { + value_k = sample_values[feature_index][cur_pos_k]; + ++cur_pos_k; + } + const double diff_value = value_j - value_k; + sampled_differential_values.push_back(diff_value); + } + } + } + differential_feature_bin_mappers->operator[](i).reset(new BinMapper()); + std::vector forced_upper_bounds; + differential_feature_bin_mappers->operator[](i)->FindBin( + sampled_differential_values[i].data(), + static_cast(sampled_differential_values[i].size()), + static_cast(num_total_sample_data * (num_total_sample_data) / 2), + config_.max_bin, config_.min_data_in_bin, filter_cnt, config_.feature_pre_filter, + BinType::NumericalBin, config_.use_missing, config_.zero_as_missing, forced_upper_bounds + ); + } +} + } // namespace LightGBM From f9ab075d92c325c84ac50540fae38692b197b486 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 20 Mar 2024 05:06:17 +0000 Subject: [PATCH 33/68] add differential features --- include/LightGBM/dataset.h | 17 +++++- src/application/application.cpp | 4 +- src/io/dataset.cpp | 104 +++++++++++++++++++++++++++++++- src/io/dataset_loader.cpp | 14 ++--- 4 files changed, 128 insertions(+), 11 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index f37c765815c9..ff59d76c3254 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -758,7 +758,7 @@ class Dataset { LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset); - LIGHTGBM_EXPORT void CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation); + LIGHTGBM_EXPORT void CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation, const Config& config); void InitTrain(const std::vector& is_feature_used, TrainingShareStates* share_state) const; @@ -1062,6 +1062,14 @@ class Dataset { void CreateCUDAColumnData(); + void CreatePairwiseRankingDifferentialFeatures( + const std::vector>& sample_values, + const std::vector>& sample_indices, + const std::vector& bin_mappers, + const data_size_t num_total_sample_data, + std::vector>* differential_feature_bin_mappers, + const Config& config) const; + std::string data_filename_; /*! \brief Store used features */ std::vector> feature_groups_; @@ -1117,6 +1125,13 @@ class Dataset { #endif // USE_CUDA std::string parser_config_str_; + + /*! \brief stored sampled features, for creating differential features in pairwise lambdarank */ + std::vector> sampled_values_; + /*! \brief stored sampled data indices, for creating differential features in pairwise lambdarank */ + std::vector> sampled_indices_; + /*! \brief stored number of totally sampled data, for creating differential features in pairwise lambdarank */ + data_size_t num_total_sampled_data_; }; } // namespace LightGBM diff --git a/src/application/application.cpp b/src/application/application.cpp index a1c83df55177..52b731cee13b 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -122,7 +122,7 @@ void Application::LoadData() { if (config_.objective == std::string("pairwise_lambdarank")) { ref_train_data = train_data_.release(); train_data_.reset(new Dataset()); - train_data_->CreatePairWiseRankingData(ref_train_data, false); + train_data_->CreatePairWiseRankingData(ref_train_data, false, config_); } else { ref_train_data = train_data_.get(); } @@ -150,7 +150,7 @@ void Application::LoadData() { if (config_.objective == std::string("pairwise_lambdarank")) { const Dataset* original_dataset = new_dataset.release(); new_dataset.reset(new Dataset()); - new_dataset->CreatePairWiseRankingData(original_dataset, true); + new_dataset->CreatePairWiseRankingData(original_dataset, true, config_); } valid_datas_.push_back(std::move(new_dataset)); // need save binary file diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 03b482195b19..2d47622c5155 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -442,6 +442,24 @@ void Dataset::Construct(std::vector>* bin_mappers, } device_type_ = io_config.device_type; gpu_device_id_ = io_config.gpu_device_id; + + if (io_config.objective == std::string("pairwise_lambdarank")) { + // store sampled values for constructing differential features + const int num_threads = OMP_NUM_THREADS(); + sampled_values_.resize(static_cast(num_sample_col)); + sampled_indices_.resize(static_cast(num_sample_col)); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int col_idx = 0; col_idx < num_sample_col; ++col_idx) { + const int num_samples_in_col = num_per_col[col_idx]; + sampled_values_[col_idx].reserve(static_cast(num_samples_in_col)); + sampled_indices_[col_idx].reserve(static_cast(num_samples_in_col)); + for (int i = 0; i < num_samples_in_col; ++i) { + sampled_values_[col_idx].push_back(sample_values[col_idx][i]); + sampled_indices_[col_idx].push_back(sample_non_zero_indices[col_idx][i]); + } + } + num_total_sampled_data_ = static_cast(total_sample_cnt); + } } void Dataset::FinishLoad() { @@ -823,7 +841,7 @@ void Dataset::CreateValid(const Dataset* dataset) { gpu_device_id_ = dataset->gpu_device_id_; } -void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation) { +void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation, const Config& config) { num_data_ = metadata_.BuildPairwiseFeatureRanking(dataset->metadata(), is_validation); feature_groups_.clear(); @@ -854,6 +872,23 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va group_feature_start_.resize(num_groups_); group_feature_cnt_.resize(num_groups_); + std::vector> diff_feature_bin_mappers; + if (config.objective == std::string("pairwise_lambdarank")) { + std::vector original_bin_mappers; + for (int i = 0; i < dataset->num_total_features_; ++i) { + const int inner_feature_index = dataset->InnerFeatureIndex(i); + if (inner_feature_index >= 0) { + original_bin_mappers.push_back(dataset->FeatureBinMapper(inner_feature_index)); + } else { + original_bin_mappers.push_back(nullptr); + } + } + + //CreatePairwiseRankingDifferentialFeatures(sampled_values_, sampled_indices_, original_bin_mappers, num_total_sampled_data_, &diff_feature_bin_mappers, config); + + num_features_ += dataset->num_features_; + } + int cur_feature_index = 0; for (int i = 0; i < num_groups_; ++i) { int original_group_index = i % dataset->num_groups_; @@ -1860,6 +1895,73 @@ const void* Dataset::GetColWiseData( return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator); } +void Dataset::CreatePairwiseRankingDifferentialFeatures( + const std::vector>& sample_values, + const std::vector>& sample_indices, + const std::vector& bin_mappers, + const data_size_t num_total_sample_data, + std::vector>* differential_feature_bin_mappers, + const Config& config) const { + const int num_original_features = static_cast(sample_values.size()); + const data_size_t filter_cnt = static_cast( + static_cast(config.min_data_in_leaf * num_total_sample_data) / num_data_); + std::vector numerical_feature_indices; + for (int i = 0; i < num_original_features; ++i) { + if (bin_mappers[i] != nullptr && bin_mappers[i]->bin_type() == BinType::NumericalBin) { + numerical_feature_indices.push_back(i); + } + } + const int num_numerical_features = static_cast(numerical_feature_indices.size()); + std::vector> sampled_differential_values(num_numerical_features); + for (int i = 0; i < num_numerical_features; ++i) { + differential_feature_bin_mappers->push_back(nullptr); + } + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int i = 0; i < num_numerical_features; ++i) { + const int feature_index = numerical_feature_indices[i]; + const data_size_t num_samples_for_feature = static_cast(sample_values[feature_index].size()); + if (config.zero_as_missing) { + for (int j = 0; j < num_samples_for_feature; ++j) { + const double value = sample_values[feature_index][j]; + for (int k = j + 1; k < num_samples_for_feature; ++k) { + const double diff_value = value - sample_values[feature_index][k]; + sampled_differential_values[i].push_back(diff_value); + } + } + } else { + CHECK_GT(sample_indices[feature_index].size(), 0); + int cur_pos_j = 0; + for (int j = 0; j < sample_indices[feature_index].back() + 1; ++j) { + double value_j = 0.0; + if (j == sample_indices[feature_index][cur_pos_j]) { + value_j = sample_values[feature_index][cur_pos_j]; + ++cur_pos_j; + } + int cur_pos_k = 0; + for (int k = 0; k < sample_indices[feature_index].back() + 1; ++k) { + double value_k = 0.0; + if (k == sample_indices[feature_index][cur_pos_k]) { + value_k = sample_values[feature_index][cur_pos_k]; + ++cur_pos_k; + } + const double diff_value = value_j - value_k; + sampled_differential_values[i].push_back(diff_value); + } + } + } + differential_feature_bin_mappers->operator[](i).reset(new BinMapper()); + std::vector forced_upper_bounds; + differential_feature_bin_mappers->operator[](i)->FindBin( + sampled_differential_values[i].data(), + static_cast(sampled_differential_values[i].size()), + static_cast(num_total_sample_data * (num_total_sample_data) / 2), + config.max_bin, config.min_data_in_bin, filter_cnt, config.feature_pre_filter, + BinType::NumericalBin, config.use_missing, config.zero_as_missing, forced_upper_bounds + ); + } +} + #ifdef USE_CUDA void Dataset::CreateCUDAColumnData() { cuda_column_data_.reset(new CUDAColumnData(num_data_, gpu_device_id_)); diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index ade885433c47..f7a9311dd476 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -1163,9 +1163,6 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, OMP_LOOP_EX_END(); } OMP_THROW_EX(); - - if () {} - } else { // start and len will store the process feature indices for different machines // machine i will find bins for features in [ start[i], start[i] + len[i] ) @@ -1253,7 +1250,8 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature); dataset->Construct(&bin_mappers, dataset->num_total_features_, forced_bin_bounds, Common::Vector2Ptr(&sample_indices).data(), Common::Vector2Ptr(&sample_values).data(), - Common::VectorSize(sample_indices).data(), static_cast(sample_indices.size()), sample_data.size(), config_); + Common::VectorSize(sample_indices).data(), static_cast(sample_indices.size()), + sample_data.size(), config_); if (dataset->has_raw()) { dataset->ResizeRaw(static_cast(sample_data.size())); } @@ -1585,8 +1583,10 @@ void DatasetLoader::CreatePairwiseRankingDifferentialFeatures( } } const int num_numerical_features = static_cast(numerical_feature_indices.size()); - std::vector> sampled_differential_values(num_original_features); - differential_feature_bin_mappers->resize(num_numerical_features, nullptr); + std::vector> sampled_differential_values(num_numerical_features); + for (int i = 0; i < num_numerical_features; ++i) { + differential_feature_bin_mappers->push_back(nullptr); + } const int num_threads = OMP_NUM_THREADS(); #pragma omp parallel for schedule(static) num_threads(num_threads) for (int i = 0; i < num_numerical_features; ++i) { @@ -1617,7 +1617,7 @@ void DatasetLoader::CreatePairwiseRankingDifferentialFeatures( ++cur_pos_k; } const double diff_value = value_j - value_k; - sampled_differential_values.push_back(diff_value); + sampled_differential_values[i].push_back(diff_value); } } } From 7aa170b2fe5a2fcd6c9478f06ca35f727b064482 Mon Sep 17 00:00:00 2001 From: Pavel Metrikov Date: Mon, 25 Mar 2024 16:19:05 -0700 Subject: [PATCH 34/68] bug fixing and efficiency improvement --- include/LightGBM/objective_function.h | 7 +- include/LightGBM/utils/common.h | 48 +++++++++ src/metric/rank_metric.hpp | 38 ++++++- src/objective/rank_objective.hpp | 147 ++++++++++++++------------ 4 files changed, 170 insertions(+), 70 deletions(-) diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 85b52c3ce35c..203b3e158a36 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -110,8 +110,11 @@ class ObjectiveFunction { #endif // USE_CUDA }; -void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointwise, const double* score, data_size_t cnt_pointwise, - data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, int truncation_level, double sigma); +void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointwise, const double* score_pairwise, data_size_t cnt_pointwise, + data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, + const std::multimap& right2left_map, const std::multimap < data_size_t, data_size_t>& left2right_map, + const std::map, data_size_t>& left_right2pair_map, + int truncation_level, double sigma, CommonC::SigmoidCache sigmoid_cache); } // namespace LightGBM diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index f1b5a10b5a69..eb2f41b9ef4c 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -1255,6 +1255,54 @@ inline static std::string ArrayToString(const std::vector& arr, size_t n) { return str_buf.str(); } +class SigmoidCache { +public: + SigmoidCache(){} + + void Init(double sigmoid) { + sigmoid_ = sigmoid; + // get boundary + min_sigmoid_input_ = min_sigmoid_input_ / sigmoid_ / 2; + max_sigmoid_input_ = -min_sigmoid_input_; + sigmoid_table_.resize(_sigmoid_bins); + // get score to bin factor + sigmoid_table_idx_factor_ = + _sigmoid_bins / (max_sigmoid_input_ - min_sigmoid_input_); + // cache + for (size_t i = 0; i < _sigmoid_bins; ++i) { + const double score = i / sigmoid_table_idx_factor_ + min_sigmoid_input_; + sigmoid_table_[i] = 1.0f / (1.0f + std::exp(score * sigmoid_)); + } + } + + double compute(double score) { + if (score <= min_sigmoid_input_) { + // too small, use lower bound + return sigmoid_table_[0]; + } + else if (score >= max_sigmoid_input_) { + // too large, use upper bound + return sigmoid_table_[_sigmoid_bins - 1]; + } + else { + return sigmoid_table_[static_cast((score - min_sigmoid_input_) * + sigmoid_table_idx_factor_)]; + } + } +private: + /*! \brief Sigmoid param */ + double sigmoid_; + /*! \brief Cache result for sigmoid transform to speed up */ + std::vector sigmoid_table_; + /*! \brief Number of bins in simoid table */ + size_t _sigmoid_bins = 1024 * 1024; + /*! \brief Minimal input of sigmoid table */ + double min_sigmoid_input_ = -50; + /*! \brief Maximal input of Sigmoid table */ + double max_sigmoid_input_ = 50; + /*! \brief Factor that covert score to bin in Sigmoid table */ + double sigmoid_table_idx_factor_; +}; } // namespace CommonC diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index 818e45e46c4b..ca2c75058278 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -30,7 +30,7 @@ class NDCGMetric:public Metric { DCGCalculator::Init(label_gain); pairwise_scores_ = config.objective == std::string("pairwise_lambdarank"); sigmoid_ = config.sigmoid; - truncation_level_ = config.lambdarank_truncation_level; + truncation_level_ = config.lambdarank_truncation_level; } ~NDCGMetric() { @@ -83,7 +83,31 @@ class NDCGMetric:public Metric { scores_pointwise_.resize(num_data_, 0.0); num_data_pairwise_ = metadata.pairwise_query_boundaries()[metadata.num_queries()]; query_boundaries_pairwise_ = metadata.pairwise_query_boundaries(); + + right2left_map_byquery_.resize(num_queries_); + left2right_map_byquery_.resize(num_queries_); + left_right2pair_map_byquery_.resize(num_queries_); + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) + for (data_size_t q = 0; q < num_queries_; ++q) { + const data_size_t start_pairwise = query_boundaries_pairwise_[q]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[q + 1] - query_boundaries_pairwise_[q]; + std::multimap right2left_map_; + std::multimap < data_size_t, data_size_t> left2right_map_; + std::map, data_size_t> left_right2pair_map_; + for (data_size_t i = 0; i < cnt_pairwise; ++i) { + //data_size_t current_pair = selected_pairs[i]; + int index_left = paired_index_map_[i + start_pairwise].first; + int index_right = paired_index_map_[i + start_pairwise].second; + right2left_map_.insert(std::make_pair(index_right, index_left)); + left2right_map_.insert(std::make_pair(index_left, index_right)); + left_right2pair_map_.insert(std::make_pair(std::make_pair(index_left, index_right), i)); + } + right2left_map_byquery_[q] = right2left_map_; + left2right_map_byquery_[q] = left2right_map_; + left_right2pair_map_byquery_[q] = left_right2pair_map_; + } } + sigmoid_cache_.Init(sigmoid_); } const std::vector& GetName() const override { @@ -119,7 +143,9 @@ class NDCGMetric:public Metric { const data_size_t cnt_pairwise = query_boundaries_pairwise_[i + 1] - query_boundaries_pairwise_[i]; std::vector all_pairs(cnt_pairwise); std::iota(all_pairs.begin(), all_pairs.end(), 0); - UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), paired_index_map_ + start_pairwise, truncation_level_, sigmoid_); + UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), + paired_index_map_ + start_pairwise, right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], truncation_level_, + sigmoid_, sigmoid_cache_); } // calculate DCG @@ -149,7 +175,9 @@ class NDCGMetric:public Metric { const data_size_t cnt_pairwise = query_boundaries_pairwise_[i + 1] - query_boundaries_pairwise_[i]; std::vector all_pairs(cnt_pairwise); std::iota(all_pairs.begin(), all_pairs.end(), 0); - UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), paired_index_map_ + start_pairwise, truncation_level_, sigmoid_); + UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), + paired_index_map_ + start_pairwise, right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], truncation_level_, + sigmoid_, sigmoid_cache_); } // calculate DCG DCGCalculator::CalDCG(eval_at_, label_ + query_boundaries_[i], @@ -194,10 +222,14 @@ class NDCGMetric:public Metric { std::vector> inverse_max_dcgs_; bool pairwise_scores_; double sigmoid_; + CommonC::SigmoidCache sigmoid_cache_; /*! \brief Truncation position for max DCG */ int truncation_level_; mutable std::vector scores_pointwise_; const std::pair* paired_index_map_; + std::vector> right2left_map_byquery_; + std::vector> left2right_map_byquery_; + std::vector, data_size_t>> left_right2pair_map_byquery_; /*! \brief Number of data */ data_size_t num_data_pairwise_; const data_size_t* query_boundaries_pairwise_; diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 280ba9fee59a..51dcf36e1ac6 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -6,10 +6,11 @@ #ifndef LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ -#define model_indirect_comparisons_ true +#define model_indirect_comparisons_ false #define model_conditional_rel_ true #define indirect_comparisons_above_only true #define logarithmic_discounts true +#define hard_pairwise_preference false #include #include @@ -27,7 +28,10 @@ namespace LightGBM { void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointwise, const double* score_pairwise, data_size_t cnt_pointwise, - data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, int truncation_level, double sigma) { + data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, + const std::multimap& right2left_map, const std::multimap < data_size_t, data_size_t>& left2right_map, + const std::map, data_size_t>& left_right2pair_map, + int truncation_level, double sigma, CommonC::SigmoidCache sigmoid_cache) { // get sorted indices for scores std::vector sorted_idx(cnt_pointwise); for (data_size_t i = 0; i < cnt_pointwise; ++i) { @@ -42,18 +46,6 @@ namespace LightGBM { ranks[sorted_idx.at(i)] = i; } - std::multimap mapRight2Left; - std::multimap mapLeft2Right; - std::map, data_size_t> mapLeftRight2Pair; - for (data_size_t i = 0; i < selected_pairs_cnt; ++i) { - data_size_t current_pair = selected_pairs[i]; - int indexLeft = paired_index_map[current_pair].first; - int indexRight = paired_index_map[current_pair].second; - mapRight2Left.insert(std::make_pair(indexRight, indexLeft)); - mapLeft2Right.insert(std::make_pair(indexLeft, indexRight)); - mapLeftRight2Pair.insert(std::make_pair(std::make_pair(indexLeft, indexRight), current_pair)); - } - std::vector gradients(cnt_pointwise); std::vector hessians(cnt_pointwise); for (data_size_t i = 0; i < selected_pairs_cnt; i++) { @@ -65,31 +57,31 @@ namespace LightGBM { double delta_score = score_pairwise[current_pair]; int comparisons = 1; data_size_t current_pair_inverse = -1; - if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexLeft)) > 0) { - current_pair_inverse = mapLeftRight2Pair.at(std::make_pair(indexRight, indexLeft)); + if (left_right2pair_map.count(std::make_pair(indexRight, indexLeft)) > 0) { + current_pair_inverse = left_right2pair_map.at(std::make_pair(indexRight, indexLeft)); delta_score -= score_pairwise[current_pair_inverse]; comparisons++; } if (model_indirect_comparisons_) { - auto indexHead_range = mapRight2Left.equal_range(indexLeft); + auto indexHead_range = right2left_map.equal_range(indexLeft); for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { data_size_t indexHead = indexHead_it->second; - if (mapLeftRight2Pair.count(std::make_pair(indexHead, indexRight)) > 0 && + if (left_right2pair_map.count(std::make_pair(indexHead, indexRight)) > 0 && (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { - data_size_t indexHeadLeft = mapLeftRight2Pair.at(std::make_pair(indexHead, indexLeft)); - data_size_t indexHeadRight = mapLeftRight2Pair.at(std::make_pair(indexHead, indexRight)); + data_size_t indexHeadLeft = left_right2pair_map.at(std::make_pair(indexHead, indexLeft)); + data_size_t indexHeadRight = left_right2pair_map.at(std::make_pair(indexHead, indexRight)); delta_score += score_pairwise[indexHeadRight] - score_pairwise[indexHeadLeft]; comparisons++; } } - auto indexTail_range = mapLeft2Right.equal_range(indexLeft); + auto indexTail_range = left2right_map.equal_range(indexLeft); for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { data_size_t indexTail = indexTail_it->second; - if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexTail)) > 0 && + if (left_right2pair_map.count(std::make_pair(indexRight, indexTail)) > 0 && (!indirect_comparisons_above_only || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { - data_size_t indexLeftTail = mapLeftRight2Pair.at(std::make_pair(indexLeft, indexTail)); - data_size_t indexRightTail = mapLeftRight2Pair.at(std::make_pair(indexRight, indexTail)); + data_size_t indexLeftTail = left_right2pair_map.at(std::make_pair(indexLeft, indexTail)); + data_size_t indexRightTail = left_right2pair_map.at(std::make_pair(indexRight, indexTail)); delta_score += score_pairwise[indexLeftTail] - score_pairwise[indexRightTail]; comparisons++; } @@ -99,16 +91,23 @@ namespace LightGBM { if (delta_score_pointwise == kMinScore || -delta_score_pointwise == kMinScore || delta_score == kMinScore || -delta_score == kMinScore) { continue; } delta_score /= comparisons; // get discount of this pair - const double paired_discount = logarithmic_discounts ? fabs(DCGCalculator::GetDiscount(ranks[indexRight]) - DCGCalculator::GetDiscount(ranks[indexLeft])) : 1.0; - //double p_lr = GetSigmoid(delta_score); - double p_lr = 1.0f / (1.0f + std::exp(-delta_score * sigma)); - double p_rl = 1.0 - p_lr; - //double p_lr_pointwise = GetSigmoid(delta_score_pointwise); - double p_lr_pointwise = 1.0f / (1.0f + std::exp(-delta_score_pointwise * sigma)); + double paired_discount = logarithmic_discounts ? fabs(DCGCalculator::GetDiscount(ranks[indexRight]) - DCGCalculator::GetDiscount(ranks[indexLeft])) : 1.0; + //double p_lr_pairwise = 1.0f / (1.0f + std::exp(-delta_score * sigma)); + double p_lr_pairwise = sigmoid_cache.compute(-delta_score); + double p_rl_pairwise = 1.0 - p_lr_pairwise; + //double p_lr_pointwise = 1.0f / (1.0f + std::exp(-delta_score_pointwise * sigma)); + double p_lr_pointwise = sigmoid_cache.compute(-delta_score_pointwise); double p_rl_pointwise = 1.0 - p_lr_pointwise; - gradients[indexLeft] += sigma * paired_discount * (p_rl_pointwise - p_rl); + + if (hard_pairwise_preference) { + paired_discount *= std::abs(0.5 - p_lr_pairwise); + p_lr_pairwise = p_lr_pairwise >= 0.5 ? 1.0 : 0.0; + p_rl_pairwise = 1.0 - p_lr_pairwise; + } + + gradients[indexLeft] += sigma * paired_discount * (p_rl_pointwise - p_rl_pairwise); hessians[indexLeft] += sigma * sigma * paired_discount * p_rl_pointwise * p_lr_pointwise; - gradients[indexRight] -= sigma * paired_discount * (p_rl_pointwise - p_rl); + gradients[indexRight] -= sigma * paired_discount * (p_rl_pointwise - p_rl_pairwise); hessians[indexRight] += sigma * sigma * paired_discount * p_rl_pointwise * p_lr_pointwise; } @@ -270,6 +269,7 @@ class LambdarankNDCG : public RankingObjective { } // construct Sigmoid table to speed up Sigmoid transform ConstructSigmoidTable(); + sigmoid_cache_.Init(sigmoid_); } inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt, @@ -464,6 +464,7 @@ class LambdarankNDCG : public RankingObjective { double max_sigmoid_input_ = 50; /*! \brief Factor that covert score to bin in Sigmoid table */ double sigmoid_table_idx_factor_; + CommonC::SigmoidCache sigmoid_cache_; }; /*! @@ -570,6 +571,29 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { } paired_index_map_ = metadata.paired_ranking_item_index_map(); scores_pointwise_.resize(num_data_pointwise, 0.0); + + right2left_map_byquery_.resize(num_queries_); + left2right_map_byquery_.resize(num_queries_); + left_right2pair_map_byquery_.resize(num_queries_); + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) + for (data_size_t q = 0; q < num_queries_; ++q) { + const data_size_t start_pairwise = query_boundaries_pairwise_[q]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[q + 1] - query_boundaries_pairwise_[q]; + std::multimap right2left_map_; + std::multimap < data_size_t, data_size_t> left2right_map_; + std::map, data_size_t> left_right2pair_map_; + for (data_size_t i = 0; i < cnt_pairwise; ++i) { + //data_size_t current_pair = selected_pairs[i]; + int index_left = paired_index_map_[i + start_pairwise].first; + int index_right = paired_index_map_[i + start_pairwise].second; + right2left_map_.insert(std::make_pair(index_right, index_left)); + left2right_map_.insert(std::make_pair(index_left, index_right)); + left_right2pair_map_.insert(std::make_pair(std::make_pair(index_left, index_right), i)); + } + right2left_map_byquery_[q] = right2left_map_; + left2right_map_byquery_[q] = left2right_map_; + left_right2pair_map_byquery_[q] = left_right2pair_map_; + } } void GetGradients(const double* score_pairwise, score_t* gradients_pairwise, @@ -587,12 +611,13 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { pos_biases_[positions_[start_pointwise + paired_index_map_[start_pairwise + j].second]]); } } - GetGradientsForOneQuery(i, cnt_pointwise, cnt_pairwise, label_ + start_pointwise, scores_pointwise_.data(), num_position_ids_ > 0 ? score_adjusted_pairwise.data() : score_pairwise + start_pairwise, + GetGradientsForOneQuery(i, cnt_pointwise, cnt_pairwise, label_ + start_pointwise, scores_pointwise_.data() + start_pointwise, num_position_ids_ > 0 ? score_adjusted_pairwise.data() : score_pairwise + start_pairwise, + right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], gradients_pairwise + start_pairwise, hessians_pairwise + start_pairwise); std::vector all_pairs(cnt_pairwise); std::iota(all_pairs.begin(), all_pairs.end(), 0); UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score_pairwise + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), - paired_index_map_ + start_pairwise, truncation_level_, sigmoid_); + paired_index_map_ + start_pairwise, right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], truncation_level_, sigmoid_, sigmoid_cache_); } if (num_position_ids_ > 0) { std::vector gradients_pointwise(num_data_); @@ -628,6 +653,8 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt_pointwise, data_size_t cnt_pairwise, const label_t* label, const double* score_pointwise, const double* score_pairwise, + const std::multimap& right2left_map, const std::multimap < data_size_t, data_size_t>& left2right_map, + const std::map, data_size_t>& left_right2pair_map, score_t* lambdas_pairwise, score_t* hessians_pairwise) const { @@ -661,19 +688,6 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { worst_idx -= 1; } const double worst_score = score_pointwise[sorted_idx[worst_idx]]; - - std::multimap mapRight2Left; - std::multimap mapLeft2Right; - std::map, data_size_t> mapLeftRight2Pair; - - for (data_size_t i = 0; i < cnt_pairwise; ++i) { - int indexLeft = paired_index_map_[i + start_pairwise].first; - int indexRight = paired_index_map_[i + start_pairwise].second; - mapRight2Left.insert(std::make_pair(indexRight, indexLeft)); - mapLeft2Right.insert(std::make_pair(indexLeft, indexRight)); - mapLeftRight2Pair.insert(std::make_pair(std::make_pair(indexLeft, indexRight), i)); - } - double sum_lambdas = 0.0; // start accmulate lambdas by pairs for (data_size_t i = 0; i < cnt_pairwise; i++) { @@ -698,31 +712,31 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { int comparisons = 1; data_size_t i_inverse = -1; - if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexLeft)) > 0) { - i_inverse = mapLeftRight2Pair.at(std::make_pair(indexRight, indexLeft)); + if (left_right2pair_map.count(std::make_pair(indexRight, indexLeft)) > 0) { + i_inverse = left_right2pair_map.at(std::make_pair(indexRight, indexLeft)); delta_score -= score_pairwise[i_inverse]; comparisons++; } if (model_indirect_comparisons_) { - auto indexHead_range = mapRight2Left.equal_range(indexLeft); + auto indexHead_range = right2left_map.equal_range(indexLeft); for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { data_size_t indexHead = indexHead_it->second; - if (mapLeftRight2Pair.count(std::make_pair(indexHead, indexRight)) > 0 && + if (left_right2pair_map.count(std::make_pair(indexHead, indexRight)) > 0 && (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { - data_size_t indexHeadLeft = mapLeftRight2Pair.at(std::make_pair(indexHead, indexLeft)); - data_size_t indexHeadRight = mapLeftRight2Pair.at(std::make_pair(indexHead, indexRight)); + data_size_t indexHeadLeft = left_right2pair_map.at(std::make_pair(indexHead, indexLeft)); + data_size_t indexHeadRight = left_right2pair_map.at(std::make_pair(indexHead, indexRight)); delta_score += score_pairwise[indexHeadRight] - score_pairwise[indexHeadLeft]; comparisons++; } } - auto indexTail_range = mapLeft2Right.equal_range(indexLeft); + auto indexTail_range = left2right_map.equal_range(indexLeft); for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { data_size_t indexTail = indexTail_it->second; - if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexTail)) > 0 && + if (left_right2pair_map.count(std::make_pair(indexRight, indexTail)) > 0 && (!indirect_comparisons_above_only || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { - data_size_t indexLeftTail = mapLeftRight2Pair.at(std::make_pair(indexLeft, indexTail)); - data_size_t indexRightTail = mapLeftRight2Pair.at(std::make_pair(indexRight, indexTail)); + data_size_t indexLeftTail = left_right2pair_map.at(std::make_pair(indexLeft, indexTail)); + data_size_t indexRightTail = left_right2pair_map.at(std::make_pair(indexRight, indexTail)); delta_score += score_pairwise[indexLeftTail] - score_pairwise[indexRightTail]; comparisons++; } @@ -759,27 +773,27 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { hessians_pairwise[i_inverse] += static_cast(p_hessian / comparisons); } if (model_indirect_comparisons_) { - auto indexHead_range = mapRight2Left.equal_range(indexLeft); + auto indexHead_range = right2left_map.equal_range(indexLeft); for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { data_size_t indexHead = indexHead_it->second; - if (mapLeftRight2Pair.count(std::make_pair(indexHead, indexRight)) > 0 && + if (left_right2pair_map.count(std::make_pair(indexHead, indexRight)) > 0 && (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { - data_size_t indexHeadLeft = mapLeftRight2Pair.at(std::make_pair(indexHead, indexLeft)); - data_size_t indexHeadRight = mapLeftRight2Pair.at(std::make_pair(indexHead, indexRight)); + data_size_t indexHeadLeft = left_right2pair_map.at(std::make_pair(indexHead, indexLeft)); + data_size_t indexHeadRight = left_right2pair_map.at(std::make_pair(indexHead, indexRight)); lambdas_pairwise[indexHeadRight] += static_cast(p_lambda / comparisons); hessians_pairwise[indexHeadRight] += static_cast(p_hessian / comparisons); lambdas_pairwise[indexHeadLeft] -= static_cast(p_lambda / comparisons); hessians_pairwise[indexHeadLeft] += static_cast(p_hessian / comparisons); } } - auto indexTail_range = mapLeft2Right.equal_range(indexLeft); + auto indexTail_range = left2right_map.equal_range(indexLeft); for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { data_size_t indexTail = indexTail_it->second; - if (mapLeftRight2Pair.count(std::make_pair(indexRight, indexTail)) > 0 && + if (left_right2pair_map.count(std::make_pair(indexRight, indexTail)) > 0 && (!indirect_comparisons_above_only || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { - data_size_t indexLeftTail = mapLeftRight2Pair.at(std::make_pair(indexLeft, indexTail)); - data_size_t indexRightTail = mapLeftRight2Pair.at(std::make_pair(indexRight, indexTail)); + data_size_t indexLeftTail = left_right2pair_map.at(std::make_pair(indexLeft, indexTail)); + data_size_t indexRightTail = left_right2pair_map.at(std::make_pair(indexRight, indexTail)); lambdas_pairwise[indexLeftTail] += static_cast(p_lambda / comparisons); hessians_pairwise[indexLeftTail] += static_cast(p_hessian / comparisons); lambdas_pairwise[indexRightTail] -= static_cast(p_lambda / comparisons); @@ -826,6 +840,9 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { private: const std::pair* paired_index_map_; + std::vector> right2left_map_byquery_; + std::vector> left2right_map_byquery_; + std::vector, data_size_t>> left_right2pair_map_byquery_; }; From abdb716c757227d15ce12568b059e95e5ff8b1b7 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 27 Mar 2024 05:26:51 +0000 Subject: [PATCH 35/68] add feature group for differential features --- include/LightGBM/bin.h | 28 +++++++-- .../LightGBM/pairwise_ranking_feature_group.h | 30 ++++++++- src/io/bin.cpp | 10 +++ src/io/dataset.cpp | 61 +++++++++++++++---- src/io/pairwise_ranking_feature_group.cpp | 20 ++++++ 5 files changed, 131 insertions(+), 18 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 59ca758b5e2f..bec1f5f6bd32 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -468,7 +468,7 @@ class Bin { static Bin* CreateSparseBin(data_size_t num_data, int num_bin); /*! - * \brief Create object for bin data of one feature, used for pairwise ranking, for an original dense bin + * \brief Create object for bin data of the first feature in pair, used for pairwise ranking, for an original dense bin * \param num_data Size of the pairwise dataset * \param num_bin Number of bin * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair @@ -477,7 +477,7 @@ class Bin { static Bin* CreateDensePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); /*! - * \brief Create object for bin data of one feature, used for pairwise ranking, for an original sparse bin + * \brief Create object for bin data of the first feature in pair, used for pairwise ranking, for an original sparse bin * \param num_data Size of the pairwise dataset * \param num_bin Number of bin * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair @@ -486,7 +486,7 @@ class Bin { static Bin* CreateSparsePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); /*! - * \brief Create object for bin data of one feature, used for pairwise ranking, for an original dense bin + * \brief Create object for bin data of the second feature in pair, used for pairwise ranking, for an original dense bin * \param num_data Size of the pairwise dataset * \param num_bin Number of bin * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair @@ -495,7 +495,7 @@ class Bin { static Bin* CreateDensePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); /*! - * \brief Create object for bin data of one feature, used for pairwise ranking, for an original sparse bin + * \brief Create object for bin data of the second feature in pair, used for pairwise ranking, for an original sparse bin * \param num_data Size of the pairwise dataset * \param num_bin Number of bin * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair @@ -503,6 +503,26 @@ class Bin { */ static Bin* CreateSparsePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); + /*! + * \brief Create object for bin data of the differential feature in pair, used for pairwise ranking, for an original dense bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \param diff_bin_mappers Bin mappers for differential features in this group + * \return The bin data object + */ + static Bin* CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers); + + /*! + * \brief Create object for bin data of the differential feature in pair, used for pairwise ranking, for an original sparse bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \param diff_bin_mappers Bin mappers for differential features in this group + * \return The bin data object + */ + static Bin* CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers); + /*! * \brief Deep copy the bin */ diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h index de9675e71ad0..d6ba3f123c90 100644 --- a/include/LightGBM/pairwise_ranking_feature_group.h +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -15,8 +15,7 @@ namespace LightGBM { -/*! \brief Using to store data and providing some operations on one feature -group*/ +/*! \brief Using to store data and providing some operations on one pairwise feature group for pairwise ranking */ class PairwiseRankingFeatureGroup: public FeatureGroup { public: /*! @@ -103,7 +102,7 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { } } - private: + protected: void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override; /*! \brief Pairwise data index to original data indices for ranking with pairwise features */ @@ -114,6 +113,31 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { const int is_first_or_second_in_pairing_; }; + +/*! \brief One differential feature group in pairwise ranking */ +class PairwiseRankingDifferentialFeatureGroup: public PairwiseRankingFeatureGroup { + public: + /*! + * \brief Constructor + * \param num_feature number of features of this group + * \param bin_mappers Bin mapper for features + * \param num_data Total number of data + * \param is_enable_sparse True if enable sparse feature + * \param is_first_or_second_in_pairing Mark whether features in this group belong to the first or second element in the pairing + */ + + PairwiseRankingDifferentialFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map, std::vector>& diff_feature_bin_mappers); + + /*! \brief Destructor */ + ~PairwiseRankingDifferentialFeatureGroup() {} + + private: + void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override; + + std::vector> diff_feature_bin_mappers_; +}; + + } // namespace LightGBM #endif // LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_ diff --git a/src/io/bin.cpp b/src/io/bin.cpp index e2cfb50acfb7..75142fab03bd 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -677,6 +677,16 @@ namespace LightGBM { } } + Bin* Bin::CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers) { + // TODO(shiyu1994) + return nullptr; + } + + Bin* Bin::CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers) { + // TODO(shiyu1994) + return nullptr; + } + MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate, const std::vector& offsets) { if (sparse_rate >= multi_val_bin_sparse_threshold) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 2d47622c5155..21a4ba79c0b7 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -872,23 +872,35 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va group_feature_start_.resize(num_groups_); group_feature_cnt_.resize(num_groups_); + + // create differential features std::vector> diff_feature_bin_mappers; - if (config.objective == std::string("pairwise_lambdarank")) { - std::vector original_bin_mappers; - for (int i = 0; i < dataset->num_total_features_; ++i) { - const int inner_feature_index = dataset->InnerFeatureIndex(i); - if (inner_feature_index >= 0) { - original_bin_mappers.push_back(dataset->FeatureBinMapper(inner_feature_index)); - } else { - original_bin_mappers.push_back(nullptr); - } + std::vector original_bin_mappers; + for (int i = 0; i < dataset->num_total_features_; ++i) { + const int inner_feature_index = dataset->InnerFeatureIndex(i); + if (inner_feature_index >= 0) { + original_bin_mappers.push_back(dataset->FeatureBinMapper(inner_feature_index)); + } else { + original_bin_mappers.push_back(nullptr); } + } - //CreatePairwiseRankingDifferentialFeatures(sampled_values_, sampled_indices_, original_bin_mappers, num_total_sampled_data_, &diff_feature_bin_mappers, config); + CreatePairwiseRankingDifferentialFeatures(sampled_values_, sampled_indices_, original_bin_mappers, num_total_sampled_data_, &diff_feature_bin_mappers, config); - num_features_ += dataset->num_features_; + std::vector used_diff_features; + for (int diff_feature_index = 0; diff_feature_index < static_cast(diff_feature_bin_mappers.size()); ++diff_feature_index) { + if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { + num_numeric_features_ += 1; + num_features_ += 1; + used_diff_features.push_back(diff_feature_index); + } } + const bool is_use_gpu = config.device_type == std::string("cuda") || config.device_type == std::string("gpu"); + std::vector group_is_multi_val; + std::vector> diff_feature_groups = FindGroups(diff_feature_bin_mappers, used_diff_features, Common::Vector2Ptr(&sampled_indices_).data(), Common::VectorSize(sampled_indices_).data(), static_cast(sampled_indices_.size()), num_total_sampled_data_, num_data_, is_use_gpu, false, &group_is_multi_val); + + int cur_feature_index = 0; for (int i = 0; i < num_groups_; ++i) { int original_group_index = i % dataset->num_groups_; @@ -910,6 +922,27 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index]; } + for (size_t i = 0; i < diff_feature_groups.size(); ++i) { + const std::vector& features_in_group = diff_feature_groups[i]; + group_feature_start_.push_back(cur_feature_index); + int sub_feature_index = 0; + for (size_t j = 0; j < features_in_group.size(); ++j) { + const int diff_feature_index = features_in_group[j]; + if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { + if (diff_feature_bin_mappers[diff_feature_index]->GetDefaultBin() != diff_feature_bin_mappers[diff_feature_index]->GetMostFreqBin()) { + feature_need_push_zeros_.push_back(cur_feature_index); + } + feature2group_.push_back(i + num_groups_); + feature2subfeature_.push_back(sub_feature_index); + ++cur_feature_index; + ++sub_feature_index; + } + } + + group_feature_cnt_.push_back(cur_feature_index - group_feature_start_.back()); + } + + feature_groups_.shrink_to_fit(); used_feature_map_.clear(); @@ -946,6 +979,12 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va forced_bin_bounds_.insert(forced_bin_bounds_.begin() + dataset->forced_bin_bounds_.size(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end()); num_total_features_ = dataset->num_total_features_ * 2; + for (const auto& bin_mapper_ref : diff_feature_bin_mappers) { + if (!bin_mapper_ref->is_trivial()) { + num_total_features_ += 1; + } + } + label_idx_ = dataset->label_idx_; device_type_ = dataset->device_type_; gpu_device_id_ = dataset->gpu_device_id_; diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index 3f7404a7f78f..e24234434d24 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -76,4 +76,24 @@ void PairwiseRankingFeatureGroup::CreateBinData(int num_data, bool is_multi_val, } } +PairwiseRankingDifferentialFeatureGroup::PairwiseRankingDifferentialFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map, std::vector>& diff_feature_bin_mappers): PairwiseRankingFeatureGroup(other, num_original_data, is_first_or_second_in_pairing, num_pairs, paired_ranking_item_index_map) { + for (auto& bin_mapper_ref : diff_feature_bin_mappers) { + diff_feature_bin_mappers_.emplace_back(bin_mapper_ref.release()); + } +} + +void PairwiseRankingDifferentialFeatureGroup::CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { + CHECK(!is_multi_val); // do not support multi-value bin for now + if (force_sparse || + (!force_dense && num_feature_ == 1 && + bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { + is_sparse_ = true; + bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_)); + } else { + is_sparse_ = false; + bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_)); + } + is_multi_val_ = false; +} + } // namespace LightGBM From 3cdfd83e6c2c5c45bf2e8bde5ed4e8c8e88d6420 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 28 Mar 2024 08:59:59 +0000 Subject: [PATCH 36/68] refactor template initializations with macro --- include/LightGBM/bin.h | 6 +- include/LightGBM/dataset.h | 3 +- .../LightGBM/pairwise_ranking_feature_group.h | 3 +- src/io/bin.cpp | 24 +- src/io/dataset.cpp | 23 +- src/io/pairwise_lambdarank_bin.cpp | 1146 +++-------------- src/io/pairwise_lambdarank_bin.hpp | 257 +++- src/io/pairwise_ranking_feature_group.cpp | 9 +- 8 files changed, 447 insertions(+), 1024 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index bec1f5f6bd32..2d77e74eec12 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -509,9 +509,10 @@ class Bin { * \param num_bin Number of bin * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair * \param diff_bin_mappers Bin mappers for differential features in this group + * \param bin_offsets Bin offsets in feature group * \return The bin data object */ - static Bin* CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers); + static Bin* CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets); /*! * \brief Create object for bin data of the differential feature in pair, used for pairwise ranking, for an original sparse bin @@ -519,9 +520,10 @@ class Bin { * \param num_bin Number of bin * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair * \param diff_bin_mappers Bin mappers for differential features in this group + * \param bin_offsets Bin offsets in feature group * \return The bin data object */ - static Bin* CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers); + static Bin* CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets); /*! * \brief Deep copy the bin diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index ff59d76c3254..8d4d745af63a 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -1065,9 +1065,10 @@ class Dataset { void CreatePairwiseRankingDifferentialFeatures( const std::vector>& sample_values, const std::vector>& sample_indices, - const std::vector& bin_mappers, + const std::vector>& bin_mappers, const data_size_t num_total_sample_data, std::vector>* differential_feature_bin_mappers, + std::vector* diff_original_feature_index, const Config& config) const; std::string data_filename_; diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h index d6ba3f123c90..1cca9f1fc3c8 100644 --- a/include/LightGBM/pairwise_ranking_feature_group.h +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -126,7 +126,7 @@ class PairwiseRankingDifferentialFeatureGroup: public PairwiseRankingFeatureGrou * \param is_first_or_second_in_pairing Mark whether features in this group belong to the first or second element in the pairing */ - PairwiseRankingDifferentialFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map, std::vector>& diff_feature_bin_mappers); + PairwiseRankingDifferentialFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map, std::vector>& diff_feature_bin_mappers, std::vector>& ori_feature_bin_mappers); /*! \brief Destructor */ ~PairwiseRankingDifferentialFeatureGroup() {} @@ -135,6 +135,7 @@ class PairwiseRankingDifferentialFeatureGroup: public PairwiseRankingFeatureGrou void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override; std::vector> diff_feature_bin_mappers_; + std::vector> ori_feature_bin_mappers_; }; diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 75142fab03bd..0abeba22eef1 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -677,14 +677,26 @@ namespace LightGBM { } } - Bin* Bin::CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers) { - // TODO(shiyu1994) - return nullptr; + Bin* Bin::CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets) { + if (num_bin <= 16) { + return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + } else if (num_bin <= 256) { + return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + } else if (num_bin <= 65536) { + return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + } else { + return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + } } - Bin* Bin::CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers) { - // TODO(shiyu1994) - return nullptr; + Bin* Bin::CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets) { + if (num_bin <= 256) { + return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + } else if (num_bin <= 65536) { + return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + } else { + return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + } } MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 21a4ba79c0b7..c99eadb82a3f 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -875,17 +875,18 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va // create differential features std::vector> diff_feature_bin_mappers; - std::vector original_bin_mappers; + std::vector> original_bin_mappers; + std::vector diff_original_feature_index; for (int i = 0; i < dataset->num_total_features_; ++i) { const int inner_feature_index = dataset->InnerFeatureIndex(i); if (inner_feature_index >= 0) { - original_bin_mappers.push_back(dataset->FeatureBinMapper(inner_feature_index)); + original_bin_mappers.emplace_back(dataset->FeatureBinMapper(inner_feature_index)); } else { - original_bin_mappers.push_back(nullptr); + original_bin_mappers.emplace_back(nullptr); } } - CreatePairwiseRankingDifferentialFeatures(sampled_values_, sampled_indices_, original_bin_mappers, num_total_sampled_data_, &diff_feature_bin_mappers, config); + CreatePairwiseRankingDifferentialFeatures(sampled_values_, sampled_indices_, original_bin_mappers, num_total_sampled_data_, &diff_feature_bin_mappers, &diff_original_feature_index, config); std::vector used_diff_features; for (int diff_feature_index = 0; diff_feature_index < static_cast(diff_feature_bin_mappers.size()); ++diff_feature_index) { @@ -926,6 +927,8 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va const std::vector& features_in_group = diff_feature_groups[i]; group_feature_start_.push_back(cur_feature_index); int sub_feature_index = 0; + std::vector> ori_bin_mappers; + std::vector> diff_bin_mappers; for (size_t j = 0; j < features_in_group.size(); ++j) { const int diff_feature_index = features_in_group[j]; if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { @@ -936,9 +939,15 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va feature2subfeature_.push_back(sub_feature_index); ++cur_feature_index; ++sub_feature_index; + const int ori_feature_index = dataset->InnerFeatureIndex(diff_original_feature_index[diff_feature_index]); + ori_bin_mappers.emplace_back(new BinMapper(*dataset->FeatureBinMapper(ori_feature_index))); + diff_bin_mappers.emplace_back(new BinMapper(*diff_feature_bin_mappers[diff_feature_index])); } } + FeatureGroup feature_group(sub_feature_index, 0, &ori_bin_mappers, dataset->num_data(), i + num_groups_); + feature_groups_.emplace_back(new PairwiseRankingDifferentialFeatureGroup(feature_group, dataset->num_data(), 2, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_global_index_map(), diff_bin_mappers, ori_bin_mappers)); + group_feature_cnt_.push_back(cur_feature_index - group_feature_start_.back()); } @@ -1937,16 +1946,17 @@ const void* Dataset::GetColWiseData( void Dataset::CreatePairwiseRankingDifferentialFeatures( const std::vector>& sample_values, const std::vector>& sample_indices, - const std::vector& bin_mappers, + const std::vector>& bin_mappers, const data_size_t num_total_sample_data, std::vector>* differential_feature_bin_mappers, + std::vector* diff_original_feature_index, const Config& config) const { const int num_original_features = static_cast(sample_values.size()); const data_size_t filter_cnt = static_cast( static_cast(config.min_data_in_leaf * num_total_sample_data) / num_data_); std::vector numerical_feature_indices; for (int i = 0; i < num_original_features; ++i) { - if (bin_mappers[i] != nullptr && bin_mappers[i]->bin_type() == BinType::NumericalBin) { + if (bin_mappers[i] != nullptr && !bin_mappers[i]->is_trivial() && bin_mappers[i]->bin_type() == BinType::NumericalBin) { numerical_feature_indices.push_back(i); } } @@ -1959,6 +1969,7 @@ void Dataset::CreatePairwiseRankingDifferentialFeatures( #pragma omp parallel for schedule(static) num_threads(num_threads) for (int i = 0; i < num_numerical_features; ++i) { const int feature_index = numerical_feature_indices[i]; + diff_original_feature_index->push_back(feature_index); const data_size_t num_samples_for_feature = static_cast(sample_values[feature_index].size()); if (config.zero_as_missing) { for (int j = 0; j < num_samples_for_feature; ++j) { diff --git a/src/io/pairwise_lambdarank_bin.cpp b/src/io/pairwise_lambdarank_bin.cpp index 6d85179099c2..bade03724443 100644 --- a/src/io/pairwise_lambdarank_bin.cpp +++ b/src/io/pairwise_lambdarank_bin.cpp @@ -13,161 +13,41 @@ void PairwiseRankingBin::InitStreaming(uint32_t num_thr unpaired_bin_->InitStreaming(num_thread, omp_max_threads); } -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); - template class ITERATOR_TYPE> void PairwiseRankingBin::Push(int tid, data_size_t idx, uint32_t value) { unpaired_bin_->Push(tid, idx, value); } -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::Push(int tid, data_size_t idx, uint32_t value); - template class ITERATOR_TYPE> void PairwiseRankingBin::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) { unpaired_bin_->CopySubrow(full_bin, used_indices, num_used_indices); } -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); - template class ITERATOR_TYPE> void PairwiseRankingBin::SaveBinaryToFile(BinaryWriter* writer) const { unpaired_bin_->SaveBinaryToFile(writer); } -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::SaveBinaryToFile(BinaryWriter* writer) const; - template class ITERATOR_TYPE> void PairwiseRankingBin::LoadFromMemory(const void* memory, const std::vector& local_used_indices) { unpaired_bin_->LoadFromMemory(memory, local_used_indices); } -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::LoadFromMemory(const void* memory, const std::vector& local_used_indices); - template class ITERATOR_TYPE> size_t PairwiseRankingBin::SizesInByte() const { return unpaired_bin_->SizesInByte(); } -template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; -template size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::SizesInByte() const; - template class ITERATOR_TYPE> data_size_t PairwiseRankingBin::num_data() const { return unpaired_bin_->num_data(); } -template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingFirstIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; -template data_size_t PairwiseRankingBin, PairwiseRankingSecondIterator>::num_data() const; - template class ITERATOR_TYPE> void PairwiseRankingBin::ReSize(data_size_t num_data) { return unpaired_bin_->ReSize(num_data); } -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingFirstIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); -template void PairwiseRankingBin, PairwiseRankingSecondIterator>::ReSize(data_size_t num_data); - template class ITERATOR_TYPE> template void DensePairwiseRankingBin::ConstructHistogramInner( @@ -283,46 +163,6 @@ void DensePairwiseRankingBin::ConstructHistogram( data_indices, start, end, ordered_gradients, ordered_hessians, out); } -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - hist_t* out) const; - template class ITERATOR_TYPE> void DensePairwiseRankingBin::ConstructHistogram( data_size_t start, data_size_t end, @@ -333,54 +173,6 @@ void DensePairwiseRankingBin::ConstructHistogram( nullptr, start, end, ordered_gradients, ordered_hessians, out); } -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* ordered_hessians, - hist_t* out) const; - template class ITERATOR_TYPE> void DensePairwiseRankingBin::ConstructHistogram( const data_size_t* data_indices, data_size_t start, @@ -390,46 +182,6 @@ void DensePairwiseRankingBin::ConstructHistogram( ordered_gradients, nullptr, out); } -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - template class ITERATOR_TYPE> void DensePairwiseRankingBin::ConstructHistogram( data_size_t start, data_size_t end, @@ -439,46 +191,6 @@ void DensePairwiseRankingBin::ConstructHistogram( nullptr, start, end, ordered_gradients, nullptr, out); } -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogram( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - template class ITERATOR_TYPE> void DensePairwiseRankingBin::ConstructHistogramInt8( const data_size_t* data_indices, data_size_t start, @@ -489,637 +201,109 @@ void DensePairwiseRankingBin::ConstructHistogramI data_indices, start, end, ordered_gradients, out); } -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* /*ordered_hessians*/, - hist_t* out) const; + hist_t* out) const { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} -template void DensePairwiseRankingBin::ConstructHistogramInt8( +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt8( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; + hist_t* out) const { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} -template void DensePairwiseRankingBin::ConstructHistogramInt8( +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt16( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* /*ordered_hessians*/, - hist_t* out) const; + hist_t* out) const { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* /*ordered_hessians*/, - hist_t* out) const; + hist_t* out) const { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} -template void DensePairwiseRankingBin::ConstructHistogramInt8( +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt16( const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; + hist_t* out) const { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} template class ITERATOR_TYPE> -void DensePairwiseRankingBin::ConstructHistogramInt8( +void DensePairwiseRankingBin::ConstructHistogramInt16( data_size_t start, data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, hist_t* out) const { - ConstructHistogramIntInner( + ConstructHistogramIntInner( nullptr, start, end, ordered_gradients, out); } -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, const score_t* /*ordered_hessians*/, - hist_t* out) const; + hist_t* out) const { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} -template void DensePairwiseRankingBin::ConstructHistogramInt8( +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt32( data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* /*ordered_hessians*/, - hist_t* out) const; + hist_t* out) const { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} -template void DensePairwiseRankingBin::ConstructHistogramInt8( +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt32( data_size_t start, data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template class ITERATOR_TYPE> -void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const { - ConstructHistogramIntInner( - data_indices, start, end, ordered_gradients, out); -} - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template class ITERATOR_TYPE> -void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const { - ConstructHistogramIntInner( - nullptr, start, end, ordered_gradients, out); -} - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt8( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template class ITERATOR_TYPE> -void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const { - ConstructHistogramIntInner( - data_indices, start, end, ordered_gradients, out); -} - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template class ITERATOR_TYPE> -void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const { - ConstructHistogramIntInner( - nullptr, start, end, ordered_gradients, out); -} - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template class ITERATOR_TYPE> -void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const { - ConstructHistogramIntInner( - data_indices, start, end, ordered_gradients, out); -} - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template class ITERATOR_TYPE> -void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const { - ConstructHistogramIntInner( - nullptr, start, end, ordered_gradients, out); -} - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt16( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template class ITERATOR_TYPE> -void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const { - ConstructHistogramIntInner( - data_indices, start, end, ordered_gradients, out); -} - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template class ITERATOR_TYPE> -void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const { - ConstructHistogramIntInner( - nullptr, start, end, ordered_gradients, out); -} - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - const score_t* /*ordered_hessians*/, - hist_t* out) const; - -template class ITERATOR_TYPE> -void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const { - ConstructHistogramIntInner( - data_indices, start, end, ordered_gradients, out); -} - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - const data_size_t* data_indices, data_size_t start, - data_size_t end, const score_t* ordered_gradients, - hist_t* out) const; - -template class ITERATOR_TYPE> -void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const { - ConstructHistogramIntInner( - nullptr, start, end, ordered_gradients, out); -} - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; - -template void DensePairwiseRankingBin::ConstructHistogramInt32( - data_size_t start, data_size_t end, - const score_t* ordered_gradients, - hist_t* out) const; + hist_t* out) const { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} template class ITERATOR_TYPE> template ::Split(uint32 #undef ARGUMENTS } -template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, - uint32_t default_bin, uint32_t most_freq_bin, - MissingType missing_type, bool default_left, - uint32_t threshold, const data_size_t* data_indices, - data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; - -template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, - uint32_t default_bin, uint32_t most_freq_bin, - MissingType missing_type, bool default_left, - uint32_t threshold, const data_size_t* data_indices, - data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; - -template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, - uint32_t default_bin, uint32_t most_freq_bin, - MissingType missing_type, bool default_left, - uint32_t threshold, const data_size_t* data_indices, - data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; - -template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, - uint32_t default_bin, uint32_t most_freq_bin, - MissingType missing_type, bool default_left, - uint32_t threshold, const data_size_t* data_indices, - data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; - -template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, - uint32_t default_bin, uint32_t most_freq_bin, - MissingType missing_type, bool default_left, - uint32_t threshold, const data_size_t* data_indices, - data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; - -template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, - uint32_t default_bin, uint32_t most_freq_bin, - MissingType missing_type, bool default_left, - uint32_t threshold, const data_size_t* data_indices, - data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; - -template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, - uint32_t default_bin, uint32_t most_freq_bin, - MissingType missing_type, bool default_left, - uint32_t threshold, const data_size_t* data_indices, - data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; - -template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, - uint32_t default_bin, uint32_t most_freq_bin, - MissingType missing_type, bool default_left, - uint32_t threshold, const data_size_t* data_indices, - data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; - template class ITERATOR_TYPE> data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, uint32_t most_freq_bin, MissingType missing_type, @@ -1327,60 +447,88 @@ data_size_t DensePairwiseRankingBin::Split(uint32 #undef ARGUMENTS } -template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, - uint32_t most_freq_bin, MissingType missing_type, - bool default_left, uint32_t threshold, - const data_size_t* data_indices, data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; -template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, - uint32_t most_freq_bin, MissingType missing_type, - bool default_left, uint32_t threshold, - const data_size_t* data_indices, data_size_t cnt, - data_size_t* lte_indices, +#define REGISTER_BIN_TYPE(BIN_TYPE, ITERATOR_TYPE) \ + template void PairwiseRankingBin::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); \ + template void PairwiseRankingBin::Push(int tid, data_size_t idx, uint32_t value); \ + template void PairwiseRankingBin::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); \ + template void PairwiseRankingBin::SaveBinaryToFile(BinaryWriter* writer) const; \ + template void PairwiseRankingBin::LoadFromMemory(const void* memory, const std::vector& local_used_indices); \ + template size_t PairwiseRankingBin::SizesInByte() const; \ + template data_size_t PairwiseRankingBin::num_data() const; \ + template void PairwiseRankingBin::ReSize(data_size_t num_data); + +#define COMMA , + +#define REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME(DATA_TYPE, USE_4BIT, ITERATOR_TYPE, FUNC_NAME) \ + template void DensePairwiseRankingBin::FUNC_NAME( \ + const data_size_t* data_indices, data_size_t start, data_size_t end, \ + const score_t* ordered_gradients, const score_t* ordered_hessians, \ + hist_t* out) const; \ + template void DensePairwiseRankingBin::FUNC_NAME( \ + data_size_t start, data_size_t end, \ + const score_t* ordered_gradients, \ + const score_t* ordered_hessians, \ + hist_t* out) const; \ + template void DensePairwiseRankingBin::FUNC_NAME( \ + const data_size_t* data_indices, data_size_t start, \ + data_size_t end, const score_t* ordered_gradients, \ + hist_t* out) const; \ + template void DensePairwiseRankingBin::FUNC_NAME( \ + data_size_t start, data_size_t end, \ + const score_t* ordered_gradients, \ + hist_t* out) const; + + +#define REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE(DATA_TYPE, USE_4BIT, ITERATOR_TYPE) \ + REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME(DATA_TYPE, USE_4BIT, ITERATOR_TYPE, ConstructHistogram) \ + REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME(DATA_TYPE, USE_4BIT, ITERATOR_TYPE, ConstructHistogramInt8) \ + REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME(DATA_TYPE, USE_4BIT, ITERATOR_TYPE, ConstructHistogramInt16) \ + REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME(DATA_TYPE, USE_4BIT, ITERATOR_TYPE, ConstructHistogramInt32) \ + template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, \ + uint32_t default_bin, uint32_t most_freq_bin, \ + MissingType missing_type, bool default_left, \ + uint32_t threshold, const data_size_t* data_indices, \ + data_size_t cnt, \ + data_size_t* lte_indices, \ + data_size_t* gt_indices) const; \ + template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, \ + uint32_t most_freq_bin, MissingType missing_type, \ + bool default_left, uint32_t threshold, \ + const data_size_t* data_indices, data_size_t cnt, \ + data_size_t* lte_indices, \ data_size_t* gt_indices) const; -template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, - uint32_t most_freq_bin, MissingType missing_type, - bool default_left, uint32_t threshold, - const data_size_t* data_indices, data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; -template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, - uint32_t most_freq_bin, MissingType missing_type, - bool default_left, uint32_t threshold, - const data_size_t* data_indices, data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; +#define REGISTER_DENSE_TREE_LEARNING_FUNC_FOR_ITERATOR_TYPE(ITERATOR_TYPE) \ + REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE(uint8_t, true, ITERATOR_TYPE) \ + REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE(uint8_t, false, ITERATOR_TYPE) \ + REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE(uint16_t, false, ITERATOR_TYPE) \ + REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE(uint32_t, false, ITERATOR_TYPE) -template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, - uint32_t most_freq_bin, MissingType missing_type, - bool default_left, uint32_t threshold, - const data_size_t* data_indices, data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; -template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, - uint32_t most_freq_bin, MissingType missing_type, - bool default_left, uint32_t threshold, - const data_size_t* data_indices, data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; +#define REGISTER_ITERATOR(ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(DenseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(DenseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(DenseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(DenseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(SparseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(SparseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(SparseBin, ITERATOR_TYPE) \ + REGISTER_DENSE_TREE_LEARNING_FUNC_FOR_ITERATOR_TYPE(ITERATOR_TYPE) -template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, - uint32_t most_freq_bin, MissingType missing_type, - bool default_left, uint32_t threshold, - const data_size_t* data_indices, data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; -template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, - uint32_t most_freq_bin, MissingType missing_type, - bool default_left, uint32_t threshold, - const data_size_t* data_indices, data_size_t cnt, - data_size_t* lte_indices, - data_size_t* gt_indices) const; +REGISTER_ITERATOR(PairwiseRankingFirstIterator) +REGISTER_ITERATOR(PairwiseRankingSecondIterator) +REGISTER_ITERATOR(PairwiseRankingDiffIterator) + + +#undef COMMA +#undef REGISTER_TYPE +#undef REGISTER_BIN_TYPE +#undef REGISTER_DENSE_TREE_LEARNING_FUNC_FOR_ITERATOR_TYPE +#undef REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE +#undef REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME + } // namespace LightGBM diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index 6fd839ed2d78..311a25b588e2 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -22,6 +22,9 @@ class PairwiseRankingFirstBin; template class PairwiseRankingSecondBin; +template +class PairwiseRankingDiffBin; + template class PairwiseRankingFirstIterator: public BinIterator { public: @@ -59,7 +62,8 @@ class PairwiseRankingFirstIterator: public BinIterator { } void Reset(data_size_t idx) { - unpaired_bin_iterator_->Reset(idx); + const data_size_t first_idx = paired_ranking_item_index_map_[idx].first; + unpaired_bin_iterator_->Reset(first_idx); prev_index_ = -1; prev_val_ = 0; } @@ -106,7 +110,8 @@ class PairwiseRankingSecondIterator: public BinIterator { } void Reset(data_size_t idx) { - unpaired_bin_iterator_->Reset(idx); + const data_size_t second_idx = paired_ranking_item_index_map_[idx].second; + unpaired_bin_iterator_->Reset(second_idx); prev_index_ = 0; } @@ -117,6 +122,74 @@ class PairwiseRankingSecondIterator: public BinIterator { data_size_t prev_index_; }; + +template +class PairwiseRankingDiffIterator: public BinIterator { + public: + friend PairwiseRankingDiffBin; + + PairwiseRankingDiffIterator(const BIN_TYPE* unpaired_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin, const BinMapper* original_feature_bin_mapper, const BinMapper* diff_feature_bin_mapper): min_bin_(min_bin), max_bin_(max_bin), offset_(diff_feature_bin_mapper->GetMostFreqBin() == 0) { + unpaired_bin_ = unpaired_bin; + first_unpaired_bin_iterator_.reset(unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin)); + first_unpaired_bin_iterator_->Reset(0); + second_unpaired_bin_iterator_.reset(unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin)); + second_unpaired_bin_iterator_->Reset(0); + paired_ranking_item_index_map_ = paired_ranking_item_index_map; + first_prev_index_ = 0; + second_prev_index_ = 0; + original_feature_bin_mapper_ = original_feature_bin_mapper; + diff_feature_bin_mapper_ = diff_feature_bin_mapper; + } + + ~PairwiseRankingDiffIterator() {} + + uint32_t Get(data_size_t idx) { + const data_size_t first_data_index = paired_ranking_item_index_map_[idx].first; + const data_size_t second_data_index = paired_ranking_item_index_map_[idx].second; + if (second_data_index < second_prev_index_) { + second_unpaired_bin_iterator_->Reset(0); + } + first_prev_index_ = first_data_index; + second_prev_index_ = second_data_index; + const uint32_t first_bin = first_unpaired_bin_iterator_->Get(first_data_index); + const uint32_t second_bin = second_unpaired_bin_iterator_->Get(second_data_index); + // TODO(shiyu1994): better original value + const double first_value = original_feature_bin_mapper_->BinToValue(first_bin); + const double second_value = original_feature_bin_mapper_->BinToValue(second_bin); + const double diff_value = first_value - second_value; + const uint32_t diff_bin = diff_feature_bin_mapper_->ValueToBin(diff_value); + return diff_bin; + } + + uint32_t RawGet(data_size_t idx) { + const uint32_t bin = Get(idx); + return bin + min_bin_ - offset_; + } + + void Reset(data_size_t idx) { + const data_size_t first_idx = paired_ranking_item_index_map_[idx].first; + const data_size_t second_idx = paired_ranking_item_index_map_[idx].second; + first_unpaired_bin_iterator_->Reset(first_idx); + second_unpaired_bin_iterator_->Reset(second_idx); + first_prev_index_ = -1; + second_prev_index_ = 0; + } + + private: + const BIN_TYPE* unpaired_bin_; + std::unique_ptr first_unpaired_bin_iterator_; + std::unique_ptr second_unpaired_bin_iterator_; + const std::pair* paired_ranking_item_index_map_; + const BinMapper* original_feature_bin_mapper_; + const BinMapper* diff_feature_bin_mapper_; + data_size_t first_prev_index_; + data_size_t second_prev_index_; + const uint32_t min_bin_; + const uint32_t max_bin_; + const uint32_t offset_; +}; + + template class ITERATOR_TYPE> class PairwiseRankingBin: public BIN_TYPE { public: @@ -126,10 +199,6 @@ class PairwiseRankingBin: public BIN_TYPE { virtual ~PairwiseRankingBin() {} - BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { - return new ITERATOR_TYPE(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); - } - void InitStreaming(uint32_t num_thread, int32_t omp_max_threads) override; void Push(int tid, data_size_t idx, uint32_t value) override; @@ -193,6 +262,98 @@ class PairwiseRankingBin: public BIN_TYPE { return 0; } + void ConstructHistogram( + const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogram(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt8( + const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt8(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt16( + const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt16(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt32( + const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt32(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogram(const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogram(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt8(const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt8(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt16(const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt16(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt32(const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt32(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + const void* GetColWiseData(uint8_t* /*bit_type*/, bool* /*is_sparse*/, std::vector* /*bin_iterator*/, const int /*num_threads*/) const override { Log::Fatal("Not implemented."); return nullptr; @@ -326,6 +487,11 @@ template class DensePairwiseRankingFirstBin: public DensePairwiseRankingBin { public: DensePairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, DenseBin* unpaired_bin): DensePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return new PairwiseRankingFirstIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); + } + private: data_size_t get_unpaired_index(const data_size_t paired_index) const { return this->paired_ranking_item_index_map_[paired_index].first; @@ -336,16 +502,58 @@ template class DensePairwiseRankingSecondBin: public DensePairwiseRankingBin { public: DensePairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, DenseBin* unpaired_bin): DensePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return new PairwiseRankingSecondIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); + } + private: data_size_t get_unpaired_index(const data_size_t paired_index) const { return this->paired_ranking_item_index_map_[paired_index].second; } }; +template +class DensePairwiseRankingDiffBin: public DensePairwiseRankingBin { + public: + DensePairwiseRankingDiffBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, DenseBin* unpaired_bin, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets): DensePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) { + diff_bin_mappers_ = diff_bin_mappers; + ori_bin_mappers_ = ori_bin_mappers; + bin_offsets_ = bin_offsets; + } + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + int sub_feature_index = -1; + for (int i = 0; i < static_cast(bin_offsets_->size()); ++i) { + if (bin_offsets_->at(i) == min_bin) { + CHECK_GT(i, 0); + sub_feature_index = i - 1; + break; + } + } + CHECK_GE(sub_feature_index, 0); + return new PairwiseRankingDiffIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin, ori_bin_mappers_->at(sub_feature_index).get(), diff_bin_mappers_->at(sub_feature_index).get()); + } + + private: + data_size_t get_unpaired_index(const data_size_t /*paired_index*/) const { + Log::Fatal("get_unpaired_index of DensePairwiseRankingDiffBin should not be called."); + } + + const std::vector* bin_offsets_; + const std::vector>* diff_bin_mappers_; + const std::vector>* ori_bin_mappers_; +}; + template class SparsePairwiseRankingFirstBin: public SparsePairwiseRankingBin { public: SparsePairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin): SparsePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return new PairwiseRankingFirstIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); + } + private: data_size_t get_unpaired_index(const data_size_t paired_index) const { return this->paired_ranking_item_index_map_[paired_index].first; @@ -356,12 +564,49 @@ template class SparsePairwiseRankingSecondBin: public SparsePairwiseRankingBin { public: SparsePairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin): SparsePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return new PairwiseRankingSecondIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); + } + private: data_size_t get_unpaired_index(const data_size_t paired_index) const { return this->paired_ranking_item_index_map_[paired_index].second; } }; +template +class SparsePairwiseRankingDiffBin: public SparsePairwiseRankingBin { + public: + SparsePairwiseRankingDiffBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets): SparsePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) { + bin_offsets_ = bin_offsets; + diff_bin_mappers_ = diff_bin_mappers; + ori_bin_mappers_ = ori_bin_mappers; + } + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + int sub_feature_index = -1; + for (int i = 0; i < static_cast(bin_offsets_->size()); ++i) { + if (bin_offsets_->at(i) == min_bin) { + CHECK_GT(i, 0); + sub_feature_index = i - 1; + break; + } + } + CHECK_GE(sub_feature_index, 0); + return new PairwiseRankingDiffIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin, ori_bin_mappers_->at(sub_feature_index).get(), diff_bin_mappers_->at(sub_feature_index).get()); + } + + private: + data_size_t get_unpaired_index(const data_size_t /*paired_index*/) const { + Log::Fatal("get_unpaired_index of SparsePairwiseRankingDiffBin should not be called."); + } + + const std::vector* bin_offsets_; + const std::vector>* diff_bin_mappers_; + const std::vector>* ori_bin_mappers_; +}; + } // namespace LightGBM diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index e24234434d24..d795db1e257d 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -76,10 +76,13 @@ void PairwiseRankingFeatureGroup::CreateBinData(int num_data, bool is_multi_val, } } -PairwiseRankingDifferentialFeatureGroup::PairwiseRankingDifferentialFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map, std::vector>& diff_feature_bin_mappers): PairwiseRankingFeatureGroup(other, num_original_data, is_first_or_second_in_pairing, num_pairs, paired_ranking_item_index_map) { +PairwiseRankingDifferentialFeatureGroup::PairwiseRankingDifferentialFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map, std::vector>& diff_feature_bin_mappers, std::vector>& ori_feature_bin_mappers): PairwiseRankingFeatureGroup(other, num_original_data, is_first_or_second_in_pairing, num_pairs, paired_ranking_item_index_map) { for (auto& bin_mapper_ref : diff_feature_bin_mappers) { diff_feature_bin_mappers_.emplace_back(bin_mapper_ref.release()); } + for (auto& bin_mapper_ref : ori_feature_bin_mappers) { + ori_feature_bin_mappers_.emplace_back(bin_mapper_ref.release()); + } } void PairwiseRankingDifferentialFeatureGroup::CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { @@ -88,10 +91,10 @@ void PairwiseRankingDifferentialFeatureGroup::CreateBinData(int num_data, bool i (!force_dense && num_feature_ == 1 && bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { is_sparse_ = true; - bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_)); + bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_, &ori_feature_bin_mappers_, &bin_offsets_)); } else { is_sparse_ = false; - bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_)); + bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_, &ori_feature_bin_mappers_, &bin_offsets_)); } is_multi_val_ = false; } From 3703495f574af95645c52fa870c90096a8adf50d Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 28 Mar 2024 12:18:02 +0000 Subject: [PATCH 37/68] tree learning with differential features --- include/LightGBM/dataset.h | 8 ++++ include/LightGBM/dataset_loader.h | 16 -------- src/io/dataset.cpp | 51 ++++++++++++++--------- src/io/dataset_loader.cpp | 65 ------------------------------ src/io/pairwise_lambdarank_bin.cpp | 59 +++++++++++++-------------- src/io/pairwise_lambdarank_bin.hpp | 13 ++++-- 6 files changed, 78 insertions(+), 134 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 8d4d745af63a..c295cb8d3d1b 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -1062,6 +1062,14 @@ class Dataset { void CreateCUDAColumnData(); + /*! \brief Create differential features for pairwise lambdarank + * \param sample_values sampled values from the file + * \param sample_indices sampled data indices from the file + * \param bin_mappers bin mappers of the original features + * \param filter_cnt filter count for bin finding + * \param num_total_sample_data number of all sampled data + * \param differential_feature_bin_mappers output differential feature bin mapppers + */ void CreatePairwiseRankingDifferentialFeatures( const std::vector>& sample_values, const std::vector>& sample_indices, diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 68fba1307bf1..73b8e7bfd071 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -82,22 +82,6 @@ class DatasetLoader { */ void CheckCategoricalFeatureNumBin(const std::vector>& bin_mappers, const int max_bin, const std::vector& max_bin_by_feature) const; - /*! \brief Create differential features for pairwise lambdarank - * \param sample_values sampled values from the file - * \param sample_indices sampled data indices from the file - * \param bin_mappers bin mappers of the original features - * \param filter_cnt filter count for bin finding - * \param num_total_sample_data number of all sampled data - * \param differential_feature_bin_mappers output differential feature bin mapppers - */ - void CreatePairwiseRankingDifferentialFeatures( - const std::vector>& sample_values, - const std::vector>& sample_indices, - const std::vector>& bin_mappers, - const data_size_t filter_cnt, - const data_size_t num_total_sample_data, - std::vector>* differential_feature_bin_mappers) const; - const Config& config_; /*! \brief Random generator*/ Random random_; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c99eadb82a3f..13ba478949a0 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -888,12 +888,28 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va CreatePairwiseRankingDifferentialFeatures(sampled_values_, sampled_indices_, original_bin_mappers, num_total_sampled_data_, &diff_feature_bin_mappers, &diff_original_feature_index, config); + used_feature_map_.clear(); + used_feature_map_.reserve(2 * dataset->used_feature_map_.size()); + used_feature_map_.insert(used_feature_map_.begin(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end()); + + for (int i = 0; i < dataset->num_total_features_; ++i) { + if (dataset->used_feature_map_[i] != -1) { + used_feature_map_.push_back(dataset->used_feature_map_[i] + dataset->num_features_); + } else { + used_feature_map_.push_back(-1); + } + } + std::vector used_diff_features; for (int diff_feature_index = 0; diff_feature_index < static_cast(diff_feature_bin_mappers.size()); ++diff_feature_index) { if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { + used_feature_map_.push_back(num_features_); + numeric_feature_map_.push_back(num_features_); num_numeric_features_ += 1; num_features_ += 1; used_diff_features.push_back(diff_feature_index); + } else { + used_feature_map_.push_back(-1); } } @@ -949,23 +965,14 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va feature_groups_.emplace_back(new PairwiseRankingDifferentialFeatureGroup(feature_group, dataset->num_data(), 2, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_global_index_map(), diff_bin_mappers, ori_bin_mappers)); group_feature_cnt_.push_back(cur_feature_index - group_feature_start_.back()); + num_total_bin += feature_groups_.back()->num_total_bin_; + group_bin_boundaries_.push_back(num_total_bin); } + num_groups_ += static_cast(diff_feature_groups.size()); feature_groups_.shrink_to_fit(); - used_feature_map_.clear(); - used_feature_map_.reserve(2 * dataset->used_feature_map_.size()); - used_feature_map_.insert(used_feature_map_.begin(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end()); - - for (int i = 0; i < dataset->num_total_features_; ++i) { - if (dataset->used_feature_map_[i] != -1) { - used_feature_map_.push_back(dataset->used_feature_map_[i] + dataset->num_features_); - } else { - used_feature_map_.push_back(-1); - } - } - feature_names_.clear(); for (const std::string& feature_name : dataset->feature_names_) { feature_names_.push_back(feature_name + std::string("_i")); @@ -973,6 +980,9 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va for (const std::string& feature_name : dataset->feature_names_) { feature_names_.push_back(feature_name + std::string("_j")); } + for (const int real_feature_index : diff_original_feature_index) { + feature_names_.push_back(dataset->feature_names_[real_feature_index] + std::string("_k")); + } real_feature_idx_.clear(); for (const int idx : dataset->real_feature_idx_) { @@ -981,18 +991,19 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va for (const int idx : dataset->real_feature_idx_) { real_feature_idx_.push_back(idx + dataset->num_total_features_); } + for (const auto& features_in_diff_group : diff_feature_groups) { + for (const int idx : features_in_diff_group) { + real_feature_idx_.push_back(idx + 2 * dataset->num_total_features_); + } + } + + num_total_features_ = dataset->num_total_features_ * 2 + static_cast(diff_feature_bin_mappers.size()); forced_bin_bounds_.clear(); - forced_bin_bounds_.reserve(dataset->forced_bin_bounds_.size() * 2); + forced_bin_bounds_.reserve(2 * dataset->num_total_features_); forced_bin_bounds_.insert(forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end()); forced_bin_bounds_.insert(forced_bin_bounds_.begin() + dataset->forced_bin_bounds_.size(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end()); - - num_total_features_ = dataset->num_total_features_ * 2; - for (const auto& bin_mapper_ref : diff_feature_bin_mappers) { - if (!bin_mapper_ref->is_trivial()) { - num_total_features_ += 1; - } - } + forced_bin_bounds_.resize(num_total_features_); label_idx_ = dataset->label_idx_; device_type_ = dataset->device_type_; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index f7a9311dd476..65ecf38685a1 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -1568,69 +1568,4 @@ void DatasetLoader::CheckCategoricalFeatureNumBin( } } -void DatasetLoader::CreatePairwiseRankingDifferentialFeatures( - const std::vector>& sample_values, - const std::vector>& sample_indices, - const std::vector>& bin_mappers, - const data_size_t filter_cnt, - const data_size_t num_total_sample_data, - std::vector>* differential_feature_bin_mappers) const { - const int num_original_features = static_cast(sample_values.size()); - std::vector numerical_feature_indices; - for (int i = 0; i < num_original_features; ++i) { - if (bin_mappers[i] != nullptr && bin_mappers[i]->bin_type() == BinType::NumericalBin) { - numerical_feature_indices.push_back(i); - } - } - const int num_numerical_features = static_cast(numerical_feature_indices.size()); - std::vector> sampled_differential_values(num_numerical_features); - for (int i = 0; i < num_numerical_features; ++i) { - differential_feature_bin_mappers->push_back(nullptr); - } - const int num_threads = OMP_NUM_THREADS(); - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (int i = 0; i < num_numerical_features; ++i) { - const int feature_index = numerical_feature_indices[i]; - const data_size_t num_samples_for_feature = static_cast(sample_values[feature_index].size()); - if (config_.zero_as_missing) { - for (int j = 0; j < num_samples_for_feature; ++j) { - const double value = sample_values[feature_index][j]; - for (int k = j + 1; k < num_samples_for_feature; ++k) { - const double diff_value = value - sample_values[feature_index][k]; - sampled_differential_values[i].push_back(diff_value); - } - } - } else { - CHECK_GT(sample_indices[feature_index].size(), 0); - int cur_pos_j = 0; - for (int j = 0; j < sample_indices[feature_index].back() + 1; ++j) { - double value_j = 0.0; - if (j == sample_indices[feature_index][cur_pos_j]) { - value_j = sample_values[feature_index][cur_pos_j]; - ++cur_pos_j; - } - int cur_pos_k = 0; - for (int k = 0; k < sample_indices[feature_index].back() + 1; ++k) { - double value_k = 0.0; - if (k == sample_indices[feature_index][cur_pos_k]) { - value_k = sample_values[feature_index][cur_pos_k]; - ++cur_pos_k; - } - const double diff_value = value_j - value_k; - sampled_differential_values[i].push_back(diff_value); - } - } - } - differential_feature_bin_mappers->operator[](i).reset(new BinMapper()); - std::vector forced_upper_bounds; - differential_feature_bin_mappers->operator[](i)->FindBin( - sampled_differential_values[i].data(), - static_cast(sampled_differential_values[i].size()), - static_cast(num_total_sample_data * (num_total_sample_data) / 2), - config_.max_bin, config_.min_data_in_bin, filter_cnt, config_.feature_pre_filter, - BinType::NumericalBin, config_.use_missing, config_.zero_as_missing, forced_upper_bounds - ); - } -} - } // namespace LightGBM diff --git a/src/io/pairwise_lambdarank_bin.cpp b/src/io/pairwise_lambdarank_bin.cpp index bade03724443..b025d7f6341d 100644 --- a/src/io/pairwise_lambdarank_bin.cpp +++ b/src/io/pairwise_lambdarank_bin.cpp @@ -6,8 +6,31 @@ #include "pairwise_lambdarank_bin.hpp" +#include + namespace LightGBM { +template +uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const { + const data_size_t first_data_index = this->paired_ranking_item_index_map_[paired_data_index].first; + const data_size_t second_data_index = this->paired_ranking_item_index_map_[paired_data_index].second; + const uint32_t first_bin = static_cast(this->unpaired_bin_->data(first_data_index)); + const uint32_t second_bin = static_cast(this->unpaired_bin_->data(second_data_index)); + int first_feature_index = static_cast(std::upper_bound(bin_offsets_->begin(), bin_offsets_->end(), first_bin) - bin_offsets_->begin()) - 1; + int second_feature_index = static_cast(std::upper_bound(bin_offsets_->begin(), bin_offsets_->end(), second_bin) - bin_offsets_->begin()) - 1; + // TODO(shiyu1994): better original value, handle nan as missing + const double first_value = first_feature_index >= 0 ? ori_bin_mappers_->at(first_feature_index)->BinToValue(first_bin) : 0.0; + const double second_value = second_feature_index >= 0 ? ori_bin_mappers_->at(second_feature_index)->BinToValue(second_bin) : 0.0; + const double diff_value = first_value - second_value; + const uint32_t diff_bin = diff_bin_mappers_->at(first_feature_index)->ValueToBin(diff_value); + return diff_bin; +} + +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; + template class ITERATOR_TYPE> void PairwiseRankingBin::InitStreaming(uint32_t num_thread, int32_t omp_max_threads) { unpaired_bin_->InitStreaming(num_thread, omp_max_threads); @@ -60,22 +83,12 @@ void DensePairwiseRankingBin::ConstructHistogramI hist_t* grad = out; hist_t* hess = out + 1; hist_cnt_t* cnt = reinterpret_cast(hess); - const VAL_T* base_data_ptr = reinterpret_cast(this->unpaired_bin_->get_data()); if (USE_PREFETCH) { const data_size_t pf_offset = 64 / sizeof(VAL_T); const data_size_t pf_end = end - pf_offset; for (; i < pf_end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; - const auto idx = this->get_unpaired_index(paired_idx); - const auto paired_pf_idx = - USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; - const auto pf_idx = this->get_unpaired_index(paired_pf_idx); - if (IS_4BIT) { - PREFETCH_T0(base_data_ptr + (pf_idx >> 1)); - } else { - PREFETCH_T0(base_data_ptr + pf_idx); - } - const auto ti = static_cast(this->unpaired_bin_->data(idx)) << 1; + const auto ti = GetBinAt(paired_idx) << 1; if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; @@ -87,8 +100,7 @@ void DensePairwiseRankingBin::ConstructHistogramI } for (; i < end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; - const auto idx = this->get_unpaired_index(paired_idx); - const auto ti = static_cast(this->unpaired_bin_->data(idx)) << 1; + const auto ti = GetBinAt(paired_idx) << 1; if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; @@ -109,22 +121,12 @@ void DensePairwiseRankingBin::ConstructHistogramI data_size_t i = start; PACKED_HIST_T* out_ptr = reinterpret_cast(out); const int16_t* gradients_ptr = reinterpret_cast(ordered_gradients); - const VAL_T* data_ptr_base = reinterpret_cast(this->unpaired_bin_->get_data()); if (USE_PREFETCH) { const data_size_t pf_offset = 64 / sizeof(VAL_T); const data_size_t pf_end = end - pf_offset; for (; i < pf_end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; - const auto paired_pf_idx = - USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; - const auto idx = this->get_unpaired_index(paired_idx); - const auto pf_idx = this->get_unpaired_index(paired_pf_idx); - if (IS_4BIT) { - PREFETCH_T0(data_ptr_base + (pf_idx >> 1)); - } else { - PREFETCH_T0(data_ptr_base + pf_idx); - } - const auto ti = static_cast(this->unpaired_bin_->data(idx)); + const auto ti = GetBinAt(paired_idx) << 1; const int16_t gradient_16 = gradients_ptr[i]; if (USE_HESSIAN) { const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : @@ -139,8 +141,7 @@ void DensePairwiseRankingBin::ConstructHistogramI } for (; i < end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; - const auto idx = this->get_unpaired_index(paired_idx); - const auto ti = static_cast(this->unpaired_bin_->data(idx)); + const auto ti = GetBinAt(paired_idx) << 1; const int16_t gradient_16 = gradients_ptr[i]; if (USE_HESSIAN) { const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : @@ -341,8 +342,7 @@ data_size_t DensePairwiseRankingBin::SplitInner(u if (min_bin < max_bin) { for (data_size_t i = 0; i < cnt; ++i) { const data_size_t paired_idx = data_indices[i]; - const data_size_t idx = this->get_unpaired_index(paired_idx); - const auto bin = this->unpaired_bin_->data(idx); + const auto bin = GetBinAt(paired_idx); if ((MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || (MISS_IS_NA && !MFB_IS_NA && bin == maxb)) { missing_default_indices[(*missing_default_count)++] = paired_idx; @@ -368,8 +368,7 @@ data_size_t DensePairwiseRankingBin::SplitInner(u } for (data_size_t i = 0; i < cnt; ++i) { const data_size_t paired_idx = data_indices[i]; - const data_size_t idx = this->get_unpaired_index(paired_idx); - const auto bin = this->unpaired_bin_->data(idx); + const auto bin = GetBinAt(paired_idx); if (MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { missing_default_indices[(*missing_default_count)++] = paired_idx; } else if (bin != maxb) { diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index 311a25b588e2..db9a8f44f484 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -453,7 +453,7 @@ class DensePairwiseRankingBin: public PairwiseRankingBin void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, @@ -475,6 +475,11 @@ class DensePairwiseRankingBin: public PairwiseRankingBinget_unpaired_index(paired_data_index); + return this->unpaired_bin_->data(idx); + } }; template class ITERATOR_TYPE> @@ -527,7 +532,7 @@ class DensePairwiseRankingDiffBin: public DensePairwiseRankingBin(bin_offsets_->size()); ++i) { if (bin_offsets_->at(i) == min_bin) { CHECK_GT(i, 0); - sub_feature_index = i - 1; + sub_feature_index = i; break; } } @@ -540,6 +545,8 @@ class DensePairwiseRankingDiffBin: public DensePairwiseRankingBin* bin_offsets_; const std::vector>* diff_bin_mappers_; const std::vector>* ori_bin_mappers_; @@ -589,7 +596,7 @@ class SparsePairwiseRankingDiffBin: public SparsePairwiseRankingBin(bin_offsets_->size()); ++i) { if (bin_offsets_->at(i) == min_bin) { CHECK_GT(i, 0); - sub_feature_index = i - 1; + sub_feature_index = i; break; } } From 8f55a938c8853f07b972c87b39aa9e2403ac2194 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 28 Mar 2024 12:48:21 +0000 Subject: [PATCH 38/68] avoid copy sampled values --- include/LightGBM/dataset.h | 4 ++-- src/io/dataset.cpp | 31 +++++++++++++++++-------------- src/io/dataset_loader.cpp | 4 ++++ 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index c295cb8d3d1b..8a135c7219d2 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -1136,9 +1136,9 @@ class Dataset { std::string parser_config_str_; /*! \brief stored sampled features, for creating differential features in pairwise lambdarank */ - std::vector> sampled_values_; + std::shared_ptr>> sampled_values_; /*! \brief stored sampled data indices, for creating differential features in pairwise lambdarank */ - std::vector> sampled_indices_; + std::shared_ptr>> sampled_indices_; /*! \brief stored number of totally sampled data, for creating differential features in pairwise lambdarank */ data_size_t num_total_sampled_data_; }; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 13ba478949a0..1822dda30452 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -446,16 +446,18 @@ void Dataset::Construct(std::vector>* bin_mappers, if (io_config.objective == std::string("pairwise_lambdarank")) { // store sampled values for constructing differential features const int num_threads = OMP_NUM_THREADS(); - sampled_values_.resize(static_cast(num_sample_col)); - sampled_indices_.resize(static_cast(num_sample_col)); + sampled_values_.reset(new std::vector>()); + sampled_indices_.reset(new std::vector>()); + sampled_values_->resize(static_cast(num_sample_col)); + sampled_indices_->resize(static_cast(num_sample_col)); #pragma omp parallel for schedule(static) num_threads(num_threads) for (int col_idx = 0; col_idx < num_sample_col; ++col_idx) { const int num_samples_in_col = num_per_col[col_idx]; - sampled_values_[col_idx].reserve(static_cast(num_samples_in_col)); - sampled_indices_[col_idx].reserve(static_cast(num_samples_in_col)); + sampled_values_->at(col_idx).reserve(static_cast(num_samples_in_col)); + sampled_indices_->at(col_idx).reserve(static_cast(num_samples_in_col)); for (int i = 0; i < num_samples_in_col; ++i) { - sampled_values_[col_idx].push_back(sample_values[col_idx][i]); - sampled_indices_[col_idx].push_back(sample_non_zero_indices[col_idx][i]); + sampled_values_->at(col_idx).push_back(sample_values[col_idx][i]); + sampled_indices_->at(col_idx).push_back(sample_non_zero_indices[col_idx][i]); } } num_total_sampled_data_ = static_cast(total_sample_cnt); @@ -872,6 +874,9 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va group_feature_start_.resize(num_groups_); group_feature_cnt_.resize(num_groups_); + sampled_values_ = dataset->sampled_values_; + sampled_indices_ = dataset->sampled_indices_; + num_total_sampled_data_ = dataset->num_total_sampled_data_; // create differential features std::vector> diff_feature_bin_mappers; @@ -880,13 +885,13 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va for (int i = 0; i < dataset->num_total_features_; ++i) { const int inner_feature_index = dataset->InnerFeatureIndex(i); if (inner_feature_index >= 0) { - original_bin_mappers.emplace_back(dataset->FeatureBinMapper(inner_feature_index)); + original_bin_mappers.emplace_back(new BinMapper(*dataset->FeatureBinMapper(inner_feature_index))); } else { original_bin_mappers.emplace_back(nullptr); } } - CreatePairwiseRankingDifferentialFeatures(sampled_values_, sampled_indices_, original_bin_mappers, num_total_sampled_data_, &diff_feature_bin_mappers, &diff_original_feature_index, config); + CreatePairwiseRankingDifferentialFeatures(*sampled_values_, *sampled_indices_, original_bin_mappers, num_total_sampled_data_, &diff_feature_bin_mappers, &diff_original_feature_index, config); used_feature_map_.clear(); used_feature_map_.reserve(2 * dataset->used_feature_map_.size()); @@ -915,7 +920,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va const bool is_use_gpu = config.device_type == std::string("cuda") || config.device_type == std::string("gpu"); std::vector group_is_multi_val; - std::vector> diff_feature_groups = FindGroups(diff_feature_bin_mappers, used_diff_features, Common::Vector2Ptr(&sampled_indices_).data(), Common::VectorSize(sampled_indices_).data(), static_cast(sampled_indices_.size()), num_total_sampled_data_, num_data_, is_use_gpu, false, &group_is_multi_val); + std::vector> diff_feature_groups = FindGroups(diff_feature_bin_mappers, used_diff_features, Common::Vector2Ptr(sampled_indices_.get()).data(), Common::VectorSize(*sampled_indices_).data(), static_cast(sampled_indices_->size()), num_total_sampled_data_, num_data_, is_use_gpu, false, &group_is_multi_val); int cur_feature_index = 0; @@ -1965,13 +1970,12 @@ void Dataset::CreatePairwiseRankingDifferentialFeatures( const int num_original_features = static_cast(sample_values.size()); const data_size_t filter_cnt = static_cast( static_cast(config.min_data_in_leaf * num_total_sample_data) / num_data_); - std::vector numerical_feature_indices; for (int i = 0; i < num_original_features; ++i) { if (bin_mappers[i] != nullptr && !bin_mappers[i]->is_trivial() && bin_mappers[i]->bin_type() == BinType::NumericalBin) { - numerical_feature_indices.push_back(i); + diff_original_feature_index->push_back(i); } } - const int num_numerical_features = static_cast(numerical_feature_indices.size()); + const int num_numerical_features = static_cast(diff_original_feature_index->size()); std::vector> sampled_differential_values(num_numerical_features); for (int i = 0; i < num_numerical_features; ++i) { differential_feature_bin_mappers->push_back(nullptr); @@ -1979,8 +1983,7 @@ void Dataset::CreatePairwiseRankingDifferentialFeatures( const int num_threads = OMP_NUM_THREADS(); #pragma omp parallel for schedule(static) num_threads(num_threads) for (int i = 0; i < num_numerical_features; ++i) { - const int feature_index = numerical_feature_indices[i]; - diff_original_feature_index->push_back(feature_index); + const int feature_index = diff_original_feature_index->at(i); const data_size_t num_samples_for_feature = static_cast(sample_values[feature_index].size()); if (config.zero_as_missing) { for (int j = 0; j < num_samples_for_feature; ++j) { diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 65ecf38685a1..14d509090f64 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -351,6 +351,10 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); + dataset->sampled_values_ = train_data->sampled_values_; + dataset->sampled_indices_ = train_data->sampled_indices_; + dataset->num_total_sampled_data_ = train_data->num_total_sampled_data_; + return dataset.release(); } From 8c3e7be3c6cb0d5b7277c133e3ae81366a6362c1 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 2 Apr 2024 07:55:52 +0000 Subject: [PATCH 39/68] fix sampled indices accelerate differential value sampling --- include/LightGBM/dataset.h | 2 ++ src/io/dataset.cpp | 35 ++++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 8a135c7219d2..439e96876657 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -1075,6 +1075,8 @@ class Dataset { const std::vector>& sample_indices, const std::vector>& bin_mappers, const data_size_t num_total_sample_data, + const data_size_t* query_boundaries, + const data_size_t num_queries, std::vector>* differential_feature_bin_mappers, std::vector* diff_original_feature_index, const Config& config) const; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 1822dda30452..dd10cf57aea1 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -280,6 +280,7 @@ std::vector> FastFeatureBundling( } std::vector> tmp_indices; + std::vector tmp_indices_ptr(num_sample_col, nullptr); std::vector tmp_num_per_col(num_sample_col, 0); for (auto fidx : used_features) { if (fidx >= num_sample_col) { @@ -291,18 +292,19 @@ std::vector> FastFeatureBundling( if (!ret.empty()) { tmp_indices.push_back(ret); tmp_num_per_col[fidx] = static_cast(ret.size()); - sample_indices[fidx] = tmp_indices.back().data(); + tmp_indices_ptr[fidx] = tmp_indices.back().data(); } else { tmp_num_per_col[fidx] = num_per_col[fidx]; + tmp_indices_ptr[fidx] = sample_indices[fidx]; } } std::vector group_is_multi_val, group_is_multi_val2; auto features_in_group = - FindGroups(bin_mappers, used_features, sample_indices, + FindGroups(bin_mappers, used_features, tmp_indices_ptr.data(), tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, is_sparse, &group_is_multi_val); auto group2 = - FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, + FindGroups(bin_mappers, feature_order_by_cnt, tmp_indices_ptr.data(), tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, is_sparse, &group_is_multi_val2); @@ -453,11 +455,11 @@ void Dataset::Construct(std::vector>* bin_mappers, #pragma omp parallel for schedule(static) num_threads(num_threads) for (int col_idx = 0; col_idx < num_sample_col; ++col_idx) { const int num_samples_in_col = num_per_col[col_idx]; - sampled_values_->at(col_idx).reserve(static_cast(num_samples_in_col)); - sampled_indices_->at(col_idx).reserve(static_cast(num_samples_in_col)); + sampled_values_->at(col_idx).resize(num_samples_in_col); + sampled_indices_->at(col_idx).resize(num_samples_in_col); for (int i = 0; i < num_samples_in_col; ++i) { - sampled_values_->at(col_idx).push_back(sample_values[col_idx][i]); - sampled_indices_->at(col_idx).push_back(sample_non_zero_indices[col_idx][i]); + sampled_values_->at(col_idx)[i] = sample_values[col_idx][i]; + sampled_indices_->at(col_idx)[i] = sample_non_zero_indices[col_idx][i]; } } num_total_sampled_data_ = static_cast(total_sample_cnt); @@ -891,7 +893,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va } } - CreatePairwiseRankingDifferentialFeatures(*sampled_values_, *sampled_indices_, original_bin_mappers, num_total_sampled_data_, &diff_feature_bin_mappers, &diff_original_feature_index, config); + CreatePairwiseRankingDifferentialFeatures(*sampled_values_, *sampled_indices_, original_bin_mappers, num_total_sampled_data_, metadata_.query_boundaries(), metadata_.num_queries(), &diff_feature_bin_mappers, &diff_original_feature_index, config); used_feature_map_.clear(); used_feature_map_.reserve(2 * dataset->used_feature_map_.size()); @@ -1964,6 +1966,8 @@ void Dataset::CreatePairwiseRankingDifferentialFeatures( const std::vector>& sample_indices, const std::vector>& bin_mappers, const data_size_t num_total_sample_data, + const data_size_t* query_boundaries, + const data_size_t num_queries, std::vector>* differential_feature_bin_mappers, std::vector* diff_original_feature_index, const Config& config) const { @@ -1988,7 +1992,12 @@ void Dataset::CreatePairwiseRankingDifferentialFeatures( if (config.zero_as_missing) { for (int j = 0; j < num_samples_for_feature; ++j) { const double value = sample_values[feature_index][j]; - for (int k = j + 1; k < num_samples_for_feature; ++k) { + int cur_query = 0; + data_size_t cur_data_index = sample_indices[feature_index][j]; + while (query_boundaries[cur_query + 1] <= cur_data_index) { + ++cur_query; + } + for (int k = j + 1; sample_indices[feature_index][k] < query_boundaries[cur_query + 1]; ++k) { const double diff_value = value - sample_values[feature_index][k]; sampled_differential_values[i].push_back(diff_value); } @@ -1997,13 +2006,17 @@ void Dataset::CreatePairwiseRankingDifferentialFeatures( CHECK_GT(sample_indices[feature_index].size(), 0); int cur_pos_j = 0; for (int j = 0; j < sample_indices[feature_index].back() + 1; ++j) { + int cur_query = 0; + while (query_boundaries[cur_query + 1] <= j) { + ++cur_query; + } double value_j = 0.0; if (j == sample_indices[feature_index][cur_pos_j]) { value_j = sample_values[feature_index][cur_pos_j]; ++cur_pos_j; } - int cur_pos_k = 0; - for (int k = 0; k < sample_indices[feature_index].back() + 1; ++k) { + int cur_pos_k = cur_pos_j; + for (int k = j + 1; k < query_boundaries[cur_query + 1] && k < sample_indices[feature_index].back() + 1; ++k) { double value_k = 0.0; if (k == sample_indices[feature_index][cur_pos_k]) { value_k = sample_values[feature_index][cur_pos_k]; From 5aa2d175c075a6eb91a6c447105e2fb6d476a02c Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 11 Apr 2024 06:07:46 +0000 Subject: [PATCH 40/68] push data into differential features --- docs/Parameters.rst | 8 ++ include/LightGBM/config.h | 4 + src/io/config_auto.cpp | 6 ++ src/io/dataset.cpp | 125 +++++++++++++--------- src/io/pairwise_lambdarank_bin.cpp | 18 +++- src/io/pairwise_lambdarank_bin.hpp | 1 - src/io/pairwise_ranking_feature_group.cpp | 14 +++ src/objective/rank_objective.hpp | 16 ++- 8 files changed, 138 insertions(+), 54 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 341cdd487c71..51bf9b0a09b4 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -121,6 +121,8 @@ Core Parameters - ``rank_xendcg`` is faster than and achieves the similar performance as ``lambdarank`` + - ``pairwise_lambdarank``, pairwise lambdarank algorithm + - label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect) - ``boosting`` :raw-html:`🔗︎`, default = ``gbdt``, type = enum, options: ``gbdt``, ``rf``, ``dart``, aliases: ``boosting_type``, ``boost`` @@ -1139,6 +1141,12 @@ Objective Parameters - *New in version 4.1.0* +- ``use_differential_feature_in_pairwise_ranking`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to use differential features in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + Metric Parameters ----------------- diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index d9ff994027c1..6b00013f117d 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -981,6 +981,10 @@ struct Config { // desc = *New in version 4.1.0* double lambdarank_position_bias_regularization = 0.0; + // desc = whether to use differential features in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool use_differential_feature_in_pairwise_ranking = false; + #ifndef __NVCC__ #pragma endregion diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 394614af3f33..eb5d07c42476 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -305,6 +305,7 @@ const std::unordered_set& Config::parameter_set() { "lambdarank_norm", "label_gain", "lambdarank_position_bias_regularization", + "use_differential_feature_in_pairwise_ranking", "metric", "metric_freq", "is_provide_training_metric", @@ -623,6 +624,8 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet {"lambdarank_norm", {}}, {"label_gain", {}}, {"lambdarank_position_bias_regularization", {}}, + {"use_differential_feature_in_pairwise_ranking", {}}, {"metric", {"metrics", "metric_types"}}, {"metric_freq", {"output_freq"}}, {"is_provide_training_metric", {"training_metric", "is_training_metric", "train_metric"}}, @@ -1048,6 +1053,7 @@ const std::unordered_map& Config::ParameterTypes() { {"lambdarank_norm", "bool"}, {"label_gain", "vector"}, {"lambdarank_position_bias_regularization", "double"}, + {"use_differential_feature_in_pairwise_ranking", "bool"}, {"metric", "vector"}, {"metric_freq", "int"}, {"is_provide_training_metric", "bool"}, diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index dd10cf57aea1..07f532ecd85f 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -884,16 +884,18 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va std::vector> diff_feature_bin_mappers; std::vector> original_bin_mappers; std::vector diff_original_feature_index; - for (int i = 0; i < dataset->num_total_features_; ++i) { - const int inner_feature_index = dataset->InnerFeatureIndex(i); - if (inner_feature_index >= 0) { - original_bin_mappers.emplace_back(new BinMapper(*dataset->FeatureBinMapper(inner_feature_index))); - } else { - original_bin_mappers.emplace_back(nullptr); + if (config.use_differential_feature_in_pairwise_ranking) { + for (int i = 0; i < dataset->num_total_features_; ++i) { + const int inner_feature_index = dataset->InnerFeatureIndex(i); + if (inner_feature_index >= 0) { + original_bin_mappers.emplace_back(new BinMapper(*dataset->FeatureBinMapper(inner_feature_index))); + } else { + original_bin_mappers.emplace_back(nullptr); + } } - } - CreatePairwiseRankingDifferentialFeatures(*sampled_values_, *sampled_indices_, original_bin_mappers, num_total_sampled_data_, metadata_.query_boundaries(), metadata_.num_queries(), &diff_feature_bin_mappers, &diff_original_feature_index, config); + CreatePairwiseRankingDifferentialFeatures(*sampled_values_, *sampled_indices_, original_bin_mappers, num_total_sampled_data_, metadata_.query_boundaries(), metadata_.num_queries(), &diff_feature_bin_mappers, &diff_original_feature_index, config); + } used_feature_map_.clear(); used_feature_map_.reserve(2 * dataset->used_feature_map_.size()); @@ -908,15 +910,17 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va } std::vector used_diff_features; - for (int diff_feature_index = 0; diff_feature_index < static_cast(diff_feature_bin_mappers.size()); ++diff_feature_index) { - if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { - used_feature_map_.push_back(num_features_); - numeric_feature_map_.push_back(num_features_); - num_numeric_features_ += 1; - num_features_ += 1; - used_diff_features.push_back(diff_feature_index); - } else { - used_feature_map_.push_back(-1); + if (config.use_differential_feature_in_pairwise_ranking) { + for (int diff_feature_index = 0; diff_feature_index < static_cast(diff_feature_bin_mappers.size()); ++diff_feature_index) { + if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { + used_feature_map_.push_back(num_features_); + numeric_feature_map_.push_back(num_features_); + num_numeric_features_ += 1; + num_features_ += 1; + used_diff_features.push_back(diff_feature_index); + } else { + used_feature_map_.push_back(-1); + } } } @@ -946,37 +950,58 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index]; } - for (size_t i = 0; i < diff_feature_groups.size(); ++i) { - const std::vector& features_in_group = diff_feature_groups[i]; - group_feature_start_.push_back(cur_feature_index); - int sub_feature_index = 0; - std::vector> ori_bin_mappers; - std::vector> diff_bin_mappers; - for (size_t j = 0; j < features_in_group.size(); ++j) { - const int diff_feature_index = features_in_group[j]; - if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { - if (diff_feature_bin_mappers[diff_feature_index]->GetDefaultBin() != diff_feature_bin_mappers[diff_feature_index]->GetMostFreqBin()) { - feature_need_push_zeros_.push_back(cur_feature_index); + if (config.use_differential_feature_in_pairwise_ranking) { + for (size_t i = 0; i < diff_feature_groups.size(); ++i) { + const std::vector& features_in_group = diff_feature_groups[i]; + group_feature_start_.push_back(cur_feature_index); + int num_features_in_group = 0; + std::vector> ori_bin_mappers; + std::vector> ori_bin_mappers_for_diff; + std::vector> diff_bin_mappers; + for (size_t j = 0; j < features_in_group.size(); ++j) { + const int diff_feature_index = features_in_group[j]; + if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { + if (diff_feature_bin_mappers[diff_feature_index]->GetDefaultBin() != diff_feature_bin_mappers[diff_feature_index]->GetMostFreqBin()) { + feature_need_push_zeros_.push_back(cur_feature_index); + } + feature2group_.push_back(i + num_groups_); + feature2subfeature_.push_back(num_features_in_group); + ++cur_feature_index; + ++num_features_in_group; + const int ori_feature_index = dataset->InnerFeatureIndex(diff_original_feature_index[diff_feature_index]); + ori_bin_mappers.emplace_back(new BinMapper(*dataset->FeatureBinMapper(ori_feature_index))); + ori_bin_mappers_for_diff.emplace_back(new BinMapper(*dataset->FeatureBinMapper(ori_feature_index))); + diff_bin_mappers.emplace_back(new BinMapper(*diff_feature_bin_mappers[diff_feature_index])); + } + } + + FeatureGroup feature_group(num_features_in_group, 0, &ori_bin_mappers, dataset->num_data(), i + num_groups_); + + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int j = 0; j < num_features_in_group; ++j) { + const int tid = omp_get_thread_num(); + const int diff_feature_index = features_in_group[j]; + const int original_feature_index = dataset->InnerFeatureIndex(diff_original_feature_index[diff_feature_index]); + const BinMapper* original_feature_bin_mapper = dataset->FeatureBinMapper(original_feature_index); + BinIterator* original_feature_iterator = dataset->FeatureIterator(original_feature_index); + original_feature_iterator->Reset(0); + for (int k = 0; k < dataset->num_data(); ++k) { + feature_group.PushData(tid, j, k, original_feature_bin_mapper->BinToValue(original_feature_iterator->Get(k))); } - feature2group_.push_back(i + num_groups_); - feature2subfeature_.push_back(sub_feature_index); - ++cur_feature_index; - ++sub_feature_index; - const int ori_feature_index = dataset->InnerFeatureIndex(diff_original_feature_index[diff_feature_index]); - ori_bin_mappers.emplace_back(new BinMapper(*dataset->FeatureBinMapper(ori_feature_index))); - diff_bin_mappers.emplace_back(new BinMapper(*diff_feature_bin_mappers[diff_feature_index])); } - } - FeatureGroup feature_group(sub_feature_index, 0, &ori_bin_mappers, dataset->num_data(), i + num_groups_); - feature_groups_.emplace_back(new PairwiseRankingDifferentialFeatureGroup(feature_group, dataset->num_data(), 2, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_global_index_map(), diff_bin_mappers, ori_bin_mappers)); + feature_group.FinishLoad(); - group_feature_cnt_.push_back(cur_feature_index - group_feature_start_.back()); - num_total_bin += feature_groups_.back()->num_total_bin_; - group_bin_boundaries_.push_back(num_total_bin); - } + feature_groups_.emplace_back(new PairwiseRankingDifferentialFeatureGroup(feature_group, dataset->num_data(), 2, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_global_index_map(), diff_bin_mappers, ori_bin_mappers_for_diff)); - num_groups_ += static_cast(diff_feature_groups.size()); + group_feature_cnt_.push_back(cur_feature_index - group_feature_start_.back()); + num_total_bin += feature_groups_.back()->num_total_bin_; + group_bin_boundaries_.push_back(num_total_bin); + } + + num_groups_ += static_cast(diff_feature_groups.size()); + } feature_groups_.shrink_to_fit(); @@ -987,8 +1012,10 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va for (const std::string& feature_name : dataset->feature_names_) { feature_names_.push_back(feature_name + std::string("_j")); } - for (const int real_feature_index : diff_original_feature_index) { - feature_names_.push_back(dataset->feature_names_[real_feature_index] + std::string("_k")); + if (config.use_differential_feature_in_pairwise_ranking) { + for (const int real_feature_index : diff_original_feature_index) { + feature_names_.push_back(dataset->feature_names_[real_feature_index] + std::string("_k")); + } } real_feature_idx_.clear(); @@ -998,9 +1025,11 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va for (const int idx : dataset->real_feature_idx_) { real_feature_idx_.push_back(idx + dataset->num_total_features_); } - for (const auto& features_in_diff_group : diff_feature_groups) { - for (const int idx : features_in_diff_group) { - real_feature_idx_.push_back(idx + 2 * dataset->num_total_features_); + if (config.use_differential_feature_in_pairwise_ranking) { + for (const auto& features_in_diff_group : diff_feature_groups) { + for (const int idx : features_in_diff_group) { + real_feature_idx_.push_back(idx + 2 * dataset->num_total_features_); + } } } diff --git a/src/io/pairwise_lambdarank_bin.cpp b/src/io/pairwise_lambdarank_bin.cpp index b025d7f6341d..aea196984a35 100644 --- a/src/io/pairwise_lambdarank_bin.cpp +++ b/src/io/pairwise_lambdarank_bin.cpp @@ -18,12 +18,26 @@ uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t const uint32_t second_bin = static_cast(this->unpaired_bin_->data(second_data_index)); int first_feature_index = static_cast(std::upper_bound(bin_offsets_->begin(), bin_offsets_->end(), first_bin) - bin_offsets_->begin()) - 1; int second_feature_index = static_cast(std::upper_bound(bin_offsets_->begin(), bin_offsets_->end(), second_bin) - bin_offsets_->begin()) - 1; + // CHECK_GE(first_feature_index, 0); + // CHECK_GE(second_feature_index, 0); + // CHECK_LT(first_feature_index, diff_bin_mappers_->size()); // TODO(shiyu1994): better original value, handle nan as missing const double first_value = first_feature_index >= 0 ? ori_bin_mappers_->at(first_feature_index)->BinToValue(first_bin) : 0.0; const double second_value = second_feature_index >= 0 ? ori_bin_mappers_->at(second_feature_index)->BinToValue(second_bin) : 0.0; const double diff_value = first_value - second_value; - const uint32_t diff_bin = diff_bin_mappers_->at(first_feature_index)->ValueToBin(diff_value); - return diff_bin; + if (first_feature_index >= 0) { + const uint32_t min_bin = bin_offsets_->at(first_feature_index); + const uint32_t max_bin = bin_offsets_->at(first_feature_index + 1) - 1; + const uint32_t most_freq_bin = diff_bin_mappers_->at(first_feature_index)->GetMostFreqBin(); + const uint32_t diff_bin = diff_bin_mappers_->at(first_feature_index)->ValueToBin(diff_value) + bin_offsets_->at(first_feature_index); + if (diff_bin < min_bin || diff_bin > max_bin) { + return 0; + } else { + return diff_bin + min_bin - static_cast(most_freq_bin == 0); + } + } else { + return 0; + } } template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index db9a8f44f484..49e58e1c1653 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -531,7 +531,6 @@ class DensePairwiseRankingDiffBin: public DensePairwiseRankingBin(bin_offsets_->size()); ++i) { if (bin_offsets_->at(i) == min_bin) { - CHECK_GT(i, 0); sub_feature_index = i; break; } diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index d795db1e257d..81c5ffebfe14 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -83,6 +83,20 @@ PairwiseRankingDifferentialFeatureGroup::PairwiseRankingDifferentialFeatureGroup for (auto& bin_mapper_ref : ori_feature_bin_mappers) { ori_feature_bin_mappers_.emplace_back(bin_mapper_ref.release()); } + + CreateBinData(num_original_data, is_multi_val_, !is_sparse_, is_sparse_); + + Threading::For(0, num_original_data, 512, [this, &other] (int block_index, data_size_t block_start, data_size_t block_end) { + for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { + std::unique_ptr bin_iterator(other.SubFeatureIterator(feature_index)); + bin_iterator->Reset(block_start); + for (data_size_t index = block_start; index < block_end; ++index) { + PushBinData(block_index, feature_index, index, bin_iterator->Get(index)); + } + } + }); + + FinishLoad(); } void PairwiseRankingDifferentialFeatureGroup::CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 51dcf36e1ac6..a2c4b8cfee0b 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -33,6 +33,7 @@ namespace LightGBM { const std::map, data_size_t>& left_right2pair_map, int truncation_level, double sigma, CommonC::SigmoidCache sigmoid_cache) { // get sorted indices for scores + global_timer.Start("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 0"); std::vector sorted_idx(cnt_pointwise); for (data_size_t i = 0; i < cnt_pointwise; ++i) { sorted_idx[i] = i; @@ -45,7 +46,8 @@ namespace LightGBM { for (int i = 0; i < cnt_pointwise; i++) { ranks[sorted_idx.at(i)] = i; } - + global_timer.Stop("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 0"); + global_timer.Start("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 1"); std::vector gradients(cnt_pointwise); std::vector hessians(cnt_pointwise); for (data_size_t i = 0; i < selected_pairs_cnt; i++) { @@ -110,13 +112,15 @@ namespace LightGBM { gradients[indexRight] -= sigma * paired_discount * (p_rl_pointwise - p_rl_pairwise); hessians[indexRight] += sigma * sigma * paired_discount * p_rl_pointwise * p_lr_pointwise; } - + global_timer.Stop("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 1"); + global_timer.Start("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 2"); for (data_size_t i = 0; i < cnt_pointwise; i++) { double delta = 0.3 * gradients[i] / (std::abs(hessians[i]) + 0.001); delta = std::min(delta, 0.3); delta = std::max(delta, -0.3); score_pointwise[i] += delta; } + global_timer.Stop("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 2"); } /*! @@ -600,6 +604,7 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { score_t* hessians_pairwise) const override { #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) for (data_size_t i = 0; i < num_queries_; ++i) { + global_timer.Start("pairwise_lambdarank::GetGradients part 0"); const data_size_t start_pointwise = query_boundaries_[i]; const data_size_t cnt_pointwise = query_boundaries_[i + 1] - query_boundaries_[i]; const data_size_t start_pairwise = query_boundaries_pairwise_[i]; @@ -611,14 +616,19 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { pos_biases_[positions_[start_pointwise + paired_index_map_[start_pairwise + j].second]]); } } + global_timer.Stop("pairwise_lambdarank::GetGradients part 0"); + global_timer.Start("pairwise_lambdarank::GetGradients part 1"); GetGradientsForOneQuery(i, cnt_pointwise, cnt_pairwise, label_ + start_pointwise, scores_pointwise_.data() + start_pointwise, num_position_ids_ > 0 ? score_adjusted_pairwise.data() : score_pairwise + start_pairwise, right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], gradients_pairwise + start_pairwise, hessians_pairwise + start_pairwise); std::vector all_pairs(cnt_pairwise); std::iota(all_pairs.begin(), all_pairs.end(), 0); + global_timer.Stop("pairwise_lambdarank::GetGradients part 1"); + global_timer.Start("pairwise_lambdarank::GetGradients part 2"); UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score_pairwise + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), paired_index_map_ + start_pairwise, right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], truncation_level_, sigmoid_, sigmoid_cache_); - } + global_timer.Stop("pairwise_lambdarank::GetGradients part 2"); + } if (num_position_ids_ > 0) { std::vector gradients_pointwise(num_data_); std::vector hessians_pointwise(num_data_); From 1c319b865a70d87ae0adb273775705ea2ef43a1d Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 17 Apr 2024 15:54:10 +0000 Subject: [PATCH 41/68] fix differential feature bugs --- include/LightGBM/bin.h | 4 +- include/LightGBM/dataset.h | 7 +++ include/LightGBM/feature_group.h | 4 +- .../LightGBM/pairwise_ranking_feature_group.h | 5 ++ src/io/bin.cpp | 18 +++---- src/io/dataset.cpp | 8 +-- src/io/pairwise_lambdarank_bin.cpp | 21 +++----- src/io/pairwise_lambdarank_bin.hpp | 8 ++- src/io/pairwise_ranking_feature_group.cpp | 49 ++++++++++++++++++- src/treelearner/serial_tree_learner.cpp | 35 +++++++++++++ 10 files changed, 125 insertions(+), 34 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 2d77e74eec12..b265180ef9b0 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -512,7 +512,7 @@ class Bin { * \param bin_offsets Bin offsets in feature group * \return The bin data object */ - static Bin* CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets); + static Bin* CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets); /*! * \brief Create object for bin data of the differential feature in pair, used for pairwise ranking, for an original sparse bin @@ -523,7 +523,7 @@ class Bin { * \param bin_offsets Bin offsets in feature group * \return The bin data object */ - static Bin* CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets); + static Bin* CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets); /*! * \brief Deep copy the bin diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 439e96876657..e1bd18fdfe02 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -841,6 +841,13 @@ class Dataset { } } + void PrintGroupFeatureInfo(int group_index) const { + for (int sub_feature = 0; sub_feature < group_feature_cnt_[group_index]; ++sub_feature) { + const BinMapper* bin_mapper = feature_groups_[group_index]->bin_mappers_[sub_feature].get(); + Log::Warning("sub_feature = %d, missing_type = %d, most_freq_bin = %d", sub_feature, bin_mapper->missing_type(), bin_mapper->GetMostFreqBin()); + } + } + inline int FeatureNumBin(int i) const { const int group = feature2group_[i]; const int sub_feature = feature2subfeature_[i]; diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 38f5ab318daf..b492a3031fff 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -343,7 +343,7 @@ class FeatureGroup { num_feature_ += other->num_feature_; } - inline BinIterator* SubFeatureIterator(int sub_feature) const { + virtual inline BinIterator* SubFeatureIterator(int sub_feature) const { uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin(); if (!is_multi_val_) { uint32_t min_bin = bin_offsets_[sub_feature]; @@ -373,7 +373,7 @@ class FeatureGroup { } } - inline BinIterator* FeatureGroupIterator() { + virtual inline BinIterator* FeatureGroupIterator() { if (is_multi_val_) { return nullptr; } diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h index 1cca9f1fc3c8..8879bf1f8c66 100644 --- a/include/LightGBM/pairwise_ranking_feature_group.h +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -128,6 +128,10 @@ class PairwiseRankingDifferentialFeatureGroup: public PairwiseRankingFeatureGrou PairwiseRankingDifferentialFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map, std::vector>& diff_feature_bin_mappers, std::vector>& ori_feature_bin_mappers); + virtual inline BinIterator* SubFeatureIterator(int sub_feature) const override; + + virtual inline BinIterator* FeatureGroupIterator() override; + /*! \brief Destructor */ ~PairwiseRankingDifferentialFeatureGroup() {} @@ -136,6 +140,7 @@ class PairwiseRankingDifferentialFeatureGroup: public PairwiseRankingFeatureGrou std::vector> diff_feature_bin_mappers_; std::vector> ori_feature_bin_mappers_; + std::vector original_bin_offsets_; }; diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 0abeba22eef1..d5062995ed2e 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -677,25 +677,25 @@ namespace LightGBM { } } - Bin* Bin::CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets) { + Bin* Bin::CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets) { if (num_bin <= 16) { - return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); } else if (num_bin <= 256) { - return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); } else if (num_bin <= 65536) { - return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); } else { - return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); } } - Bin* Bin::CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets) { + Bin* Bin::CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets) { if (num_bin <= 256) { - return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); } else if (num_bin <= 65536) { - return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); } else { - return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets); + return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); } } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 07f532ecd85f..bb86de1df0b1 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -913,15 +913,13 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va if (config.use_differential_feature_in_pairwise_ranking) { for (int diff_feature_index = 0; diff_feature_index < static_cast(diff_feature_bin_mappers.size()); ++diff_feature_index) { if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { - used_feature_map_.push_back(num_features_); - numeric_feature_map_.push_back(num_features_); num_numeric_features_ += 1; num_features_ += 1; used_diff_features.push_back(diff_feature_index); - } else { - used_feature_map_.push_back(-1); } } + numeric_feature_map_.resize(num_features_, -1); + used_feature_map_.resize(2 * dataset->num_total_features_ + static_cast(diff_feature_bin_mappers.size()), -1); } const bool is_use_gpu = config.device_type == std::string("cuda") || config.device_type == std::string("gpu"); @@ -966,6 +964,8 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va } feature2group_.push_back(i + num_groups_); feature2subfeature_.push_back(num_features_in_group); + numeric_feature_map_[cur_feature_index] = cur_feature_index; + used_feature_map_[diff_feature_index + dataset->num_total_features_ * 2] = cur_feature_index; ++cur_feature_index; ++num_features_in_group; const int ori_feature_index = dataset->InnerFeatureIndex(diff_original_feature_index[diff_feature_index]); diff --git a/src/io/pairwise_lambdarank_bin.cpp b/src/io/pairwise_lambdarank_bin.cpp index aea196984a35..1447db2c6b8f 100644 --- a/src/io/pairwise_lambdarank_bin.cpp +++ b/src/io/pairwise_lambdarank_bin.cpp @@ -1,5 +1,5 @@ /*! - * Copyright (c) 2016 Microsoft Corporation. All rights reserved. + * Copyright (c) 2024 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ @@ -18,23 +18,18 @@ uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t const uint32_t second_bin = static_cast(this->unpaired_bin_->data(second_data_index)); int first_feature_index = static_cast(std::upper_bound(bin_offsets_->begin(), bin_offsets_->end(), first_bin) - bin_offsets_->begin()) - 1; int second_feature_index = static_cast(std::upper_bound(bin_offsets_->begin(), bin_offsets_->end(), second_bin) - bin_offsets_->begin()) - 1; - // CHECK_GE(first_feature_index, 0); - // CHECK_GE(second_feature_index, 0); - // CHECK_LT(first_feature_index, diff_bin_mappers_->size()); + // TODO(shiyu1994): better original value, handle nan as missing const double first_value = first_feature_index >= 0 ? ori_bin_mappers_->at(first_feature_index)->BinToValue(first_bin) : 0.0; const double second_value = second_feature_index >= 0 ? ori_bin_mappers_->at(second_feature_index)->BinToValue(second_bin) : 0.0; const double diff_value = first_value - second_value; - if (first_feature_index >= 0) { - const uint32_t min_bin = bin_offsets_->at(first_feature_index); - const uint32_t max_bin = bin_offsets_->at(first_feature_index + 1) - 1; + CHECK(first_feature_index >= 0 || first_bin == 0); + if (first_feature_index >= 0 && first_feature_index == second_feature_index) { + const uint32_t min_bin = diff_bin_offsets_->at(first_feature_index); + const uint32_t max_bin = diff_bin_offsets_->at(first_feature_index + 1) - 1; const uint32_t most_freq_bin = diff_bin_mappers_->at(first_feature_index)->GetMostFreqBin(); - const uint32_t diff_bin = diff_bin_mappers_->at(first_feature_index)->ValueToBin(diff_value) + bin_offsets_->at(first_feature_index); - if (diff_bin < min_bin || diff_bin > max_bin) { - return 0; - } else { - return diff_bin + min_bin - static_cast(most_freq_bin == 0); - } + const uint32_t diff_bin = diff_bin_mappers_->at(first_feature_index)->ValueToBin(diff_value); + return diff_bin + min_bin - static_cast(most_freq_bin == 0); } else { return 0; } diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index 49e58e1c1653..81373fc1f799 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -521,10 +521,11 @@ class DensePairwiseRankingSecondBin: public DensePairwiseRankingBin class DensePairwiseRankingDiffBin: public DensePairwiseRankingBin { public: - DensePairwiseRankingDiffBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, DenseBin* unpaired_bin, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets): DensePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) { + DensePairwiseRankingDiffBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, DenseBin* unpaired_bin, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets): DensePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) { diff_bin_mappers_ = diff_bin_mappers; ori_bin_mappers_ = ori_bin_mappers; bin_offsets_ = bin_offsets; + diff_bin_offsets_ = diff_bin_offsets; } BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { @@ -547,6 +548,7 @@ class DensePairwiseRankingDiffBin: public DensePairwiseRankingBin* bin_offsets_; + const std::vector* diff_bin_offsets_; const std::vector>* diff_bin_mappers_; const std::vector>* ori_bin_mappers_; }; @@ -584,8 +586,9 @@ class SparsePairwiseRankingSecondBin: public SparsePairwiseRankingBin class SparsePairwiseRankingDiffBin: public SparsePairwiseRankingBin { public: - SparsePairwiseRankingDiffBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets): SparsePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) { + SparsePairwiseRankingDiffBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets): SparsePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) { bin_offsets_ = bin_offsets; + diff_bin_offsets_ = diff_bin_offsets; diff_bin_mappers_ = diff_bin_mappers; ori_bin_mappers_ = ori_bin_mappers; } @@ -609,6 +612,7 @@ class SparsePairwiseRankingDiffBin: public SparsePairwiseRankingBin* bin_offsets_; + const std::vector* diff_bin_offsets_; const std::vector>* diff_bin_mappers_; const std::vector>* ori_bin_mappers_; }; diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index 81c5ffebfe14..7a28223ca71c 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -97,6 +97,26 @@ PairwiseRankingDifferentialFeatureGroup::PairwiseRankingDifferentialFeatureGroup }); FinishLoad(); + + // calculate diff bin offsets + const int offset = 1; + original_bin_offsets_ = bin_offsets_; + bin_offsets_.clear(); + num_total_bin_ = offset; + bin_offsets_.emplace_back(num_total_bin_); + for (int i = 0; i < num_feature_; ++i) { + auto num_bin = diff_feature_bin_mappers_[i]->num_bin(); + if (diff_feature_bin_mappers_[i]->GetMostFreqBin() == 0) { + num_bin -= offset; + } + num_total_bin_ += num_bin; + bin_offsets_.emplace_back(num_total_bin_); + } + + bin_mappers_.clear(); + for (const auto& bin_mapper : diff_feature_bin_mappers_) { + bin_mappers_.emplace_back(new BinMapper(*bin_mapper.get())); + } } void PairwiseRankingDifferentialFeatureGroup::CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { @@ -105,12 +125,37 @@ void PairwiseRankingDifferentialFeatureGroup::CreateBinData(int num_data, bool i (!force_dense && num_feature_ == 1 && bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { is_sparse_ = true; - bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_, &ori_feature_bin_mappers_, &bin_offsets_)); + bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_, &ori_feature_bin_mappers_, &original_bin_offsets_, &bin_offsets_)); } else { is_sparse_ = false; - bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_, &ori_feature_bin_mappers_, &bin_offsets_)); + bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_, &ori_feature_bin_mappers_, &original_bin_offsets_, &bin_offsets_)); } is_multi_val_ = false; } +inline BinIterator* PairwiseRankingDifferentialFeatureGroup::SubFeatureIterator(int sub_feature) const { + uint32_t most_freq_bin = ori_feature_bin_mappers_[sub_feature]->GetMostFreqBin(); + if (!is_multi_val_) { + uint32_t min_bin = original_bin_offsets_[sub_feature]; + uint32_t max_bin = original_bin_offsets_[sub_feature + 1] - 1; + return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin); + } else { + int addi = most_freq_bin == 0 ? 0 : 1; + uint32_t min_bin = 1; + uint32_t max_bin = ori_feature_bin_mappers_[sub_feature]->num_bin() - 1 + addi; + return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, + most_freq_bin); + } +} + +inline BinIterator* PairwiseRankingDifferentialFeatureGroup::FeatureGroupIterator() { + if (is_multi_val_) { + return nullptr; + } + uint32_t min_bin = original_bin_offsets_[0]; + uint32_t max_bin = original_bin_offsets_.back() - 1; + uint32_t most_freq_bin = 0; + return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin); +} + } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index f3a88bd18679..4eb15148c6ea 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -1097,6 +1097,41 @@ void SerialTreeLearner::CheckSplit(const SplitInfo& best_split_info, const int l CHECK_EQ(sum_right_gradient, static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); CHECK_EQ(sum_right_hessian, static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); Log::Warning("============================ end leaf split info ============================"); + Log::Warning("============================ pass split check ============================"); + } else { + double sum_left_gradient = 0; + double sum_left_hessian = 0; + double sum_right_gradient = 0; + double sum_right_hessian = 0; + + for (data_size_t i = 0; i < num_data_in_left; ++i) { + const data_size_t index = data_indices_in_left[i]; + sum_left_gradient += gradients_[index]; + sum_left_hessian += hessians_[index]; + } + for (data_size_t i = 0; i < num_data_in_right; ++i) { + const data_size_t index = data_indices_in_right[i]; + sum_right_gradient += gradients_[index]; + sum_right_hessian += hessians_[index]; + } + // Log::Warning("============================ start leaf split info ============================"); + // Log::Warning("left_leaf_index = %d, right_leaf_index = %d", left_leaf_index, right_leaf_index); + // Log::Warning("num_data_in_left = %d, num_data_in_right = %d", num_data_in_left, num_data_in_right); + // Log::Warning("sum_left_gradient = %d, best_split_info->left_sum_gradient_and_hessian.gradient = %d", sum_left_gradient, + // static_cast(best_split_info.left_sum_gradient_and_hessian >> 32)); + // Log::Warning("sum_left_hessian = %d, best_split_info->left_sum_gradient_and_hessian.hessian = %d", sum_left_hessian, + // static_cast(best_split_info.left_sum_gradient_and_hessian & 0x00000000ffffffff)); + // Log::Warning("sum_right_gradient = %d, best_split_info->right_sum_gradient_and_hessian.gradient = %d", sum_right_gradient, + // static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); + // Log::Warning("sum_right_hessian = %d, best_split_info->right_sum_gradient_and_hessian.hessian = %d", sum_right_hessian, + // static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); + CHECK_EQ(num_data_in_left, best_split_info.left_count); + CHECK_EQ(num_data_in_right, best_split_info.right_count); + CHECK_LE(std::fabs(sum_left_gradient - best_split_info.left_sum_gradient), 1e-3); + CHECK_LE(std::fabs(sum_left_hessian - best_split_info.left_sum_hessian), 1e-3); + CHECK_LE(std::fabs(sum_right_gradient - best_split_info.right_sum_gradient), 1e-3); + CHECK_LE(std::fabs(sum_right_hessian - best_split_info.right_sum_hessian), 1e-3); + Log::Warning("============================ pass split check ============================"); } } #endif From d8eb68b380061d5192c403138c7a7bc9bd5e8a01 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 17 Apr 2024 15:56:34 +0000 Subject: [PATCH 42/68] clean up debug code --- src/treelearner/serial_tree_learner.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 4eb15148c6ea..6bf420a6aec3 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -1114,17 +1114,6 @@ void SerialTreeLearner::CheckSplit(const SplitInfo& best_split_info, const int l sum_right_gradient += gradients_[index]; sum_right_hessian += hessians_[index]; } - // Log::Warning("============================ start leaf split info ============================"); - // Log::Warning("left_leaf_index = %d, right_leaf_index = %d", left_leaf_index, right_leaf_index); - // Log::Warning("num_data_in_left = %d, num_data_in_right = %d", num_data_in_left, num_data_in_right); - // Log::Warning("sum_left_gradient = %d, best_split_info->left_sum_gradient_and_hessian.gradient = %d", sum_left_gradient, - // static_cast(best_split_info.left_sum_gradient_and_hessian >> 32)); - // Log::Warning("sum_left_hessian = %d, best_split_info->left_sum_gradient_and_hessian.hessian = %d", sum_left_hessian, - // static_cast(best_split_info.left_sum_gradient_and_hessian & 0x00000000ffffffff)); - // Log::Warning("sum_right_gradient = %d, best_split_info->right_sum_gradient_and_hessian.gradient = %d", sum_right_gradient, - // static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); - // Log::Warning("sum_right_hessian = %d, best_split_info->right_sum_gradient_and_hessian.hessian = %d", sum_right_hessian, - // static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); CHECK_EQ(num_data_in_left, best_split_info.left_count); CHECK_EQ(num_data_in_right, best_split_info.right_count); CHECK_LE(std::fabs(sum_left_gradient - best_split_info.left_sum_gradient), 1e-3); From b088236908e0209203e673619e20315ed8b8a1c2 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 18 Apr 2024 05:15:18 +0000 Subject: [PATCH 43/68] fix validation set with differential features --- include/LightGBM/dataset.h | 4 ++++ src/io/dataset.cpp | 24 +++++++++++++++++++++--- src/io/dataset_loader.cpp | 2 ++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e1bd18fdfe02..e4a7c235c8e6 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -1150,6 +1150,10 @@ class Dataset { std::shared_ptr>> sampled_indices_; /*! \brief stored number of totally sampled data, for creating differential features in pairwise lambdarank */ data_size_t num_total_sampled_data_; + /*! \brief stored query boundaries from training dataset, for creating differential features in pairwise lambdarank */ + const data_size_t* train_query_boundaries_; + /*! \brief stored number of queries from training dataset, for creating differential features in pairwise lambdarank */ + data_size_t train_num_queries_; }; } // namespace LightGBM diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index bb86de1df0b1..c63feeacf4cb 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -894,7 +894,15 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va } } - CreatePairwiseRankingDifferentialFeatures(*sampled_values_, *sampled_indices_, original_bin_mappers, num_total_sampled_data_, metadata_.query_boundaries(), metadata_.num_queries(), &diff_feature_bin_mappers, &diff_original_feature_index, config); + if (!is_validation) { + train_query_boundaries_ = metadata_.query_boundaries(); + train_num_queries_ = metadata_.num_queries(); + } else { + train_query_boundaries_ = dataset->train_query_boundaries_; + train_num_queries_ = dataset->train_num_queries_; + } + // TODO(shiyu1994): verify the difference in training and validation results even when they share the same dataset + CreatePairwiseRankingDifferentialFeatures(*sampled_values_, *sampled_indices_, original_bin_mappers, num_total_sampled_data_, train_query_boundaries_, train_num_queries_, &diff_feature_bin_mappers, &diff_original_feature_index, config); } used_feature_map_.clear(); @@ -924,8 +932,18 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va const bool is_use_gpu = config.device_type == std::string("cuda") || config.device_type == std::string("gpu"); std::vector group_is_multi_val; - std::vector> diff_feature_groups = FindGroups(diff_feature_bin_mappers, used_diff_features, Common::Vector2Ptr(sampled_indices_.get()).data(), Common::VectorSize(*sampled_indices_).data(), static_cast(sampled_indices_->size()), num_total_sampled_data_, num_data_, is_use_gpu, false, &group_is_multi_val); - + std::vector> diff_feature_groups = + FindGroups(diff_feature_bin_mappers, used_diff_features, Common::Vector2Ptr(sampled_indices_.get()).data(), Common::VectorSize(*sampled_indices_).data(), static_cast(sampled_indices_->size()), num_total_sampled_data_, num_data_, is_use_gpu, false, &group_is_multi_val); + + if (is_validation) { + std::vector> flatten_feature_groups; + for (const auto& features_in_group : diff_feature_groups) { + for (const int feature_index : features_in_group) { + flatten_feature_groups.push_back(std::vector{feature_index}); + } + } + diff_feature_groups = flatten_feature_groups; + } int cur_feature_index = 0; for (int i = 0; i < num_groups_; ++i) { diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 14d509090f64..81ed028444e0 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -354,6 +354,8 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, dataset->sampled_values_ = train_data->sampled_values_; dataset->sampled_indices_ = train_data->sampled_indices_; dataset->num_total_sampled_data_ = train_data->num_total_sampled_data_; + dataset->train_query_boundaries_ = train_data->metadata().query_boundaries(); + dataset->train_num_queries_ = train_data->metadata().num_queries(); return dataset.release(); } From 2d0989745b5df8fffeebcb0b4739619be69da3bc Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sat, 15 Jun 2024 05:24:14 +0000 Subject: [PATCH 44/68] support row-wise histogram construction with pairwise ranking --- include/LightGBM/bin.h | 6 +- include/LightGBM/dataset.h | 3 +- src/io/bin.cpp | 36 +++++++++--- src/io/dataset.cpp | 28 +++++---- src/io/multi_val_pairwise_lambdarank_bin.hpp | 62 ++++++++++++++++++++ src/treelearner/serial_tree_learner.cpp | 8 +-- 6 files changed, 115 insertions(+), 28 deletions(-) create mode 100644 src/io/multi_val_pairwise_lambdarank_bin.hpp diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index b265180ef9b0..98cd33db6965 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -647,12 +647,12 @@ class MultiValBin { virtual bool IsSparse() = 0; static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin, - int num_feature, double sparse_rate, const std::vector& offsets); + int num_feature, double sparse_rate, const std::vector& offsets, const bool use_pairwise_ranking); static MultiValBin* CreateMultiValDenseBin(data_size_t num_data, int num_bin, - int num_feature, const std::vector& offsets); + int num_feature, const std::vector& offsets, const bool use_pairwise_ranking); - static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row); + static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row, const bool use_pairwise_ranking); static constexpr double multi_val_bin_sparse_threshold = 0.25f; diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e4a7c235c8e6..fa2facfcabf3 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -724,7 +724,8 @@ class Dataset { TrainingShareStates* GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, - bool force_col_wise, bool force_row_wise, const int num_grad_quant_bins) const; + bool force_col_wise, bool force_row_wise, const int num_grad_quant_bins, + const bool use_pairwise_ranking) const; LIGHTGBM_EXPORT void FinishLoad(); diff --git a/src/io/bin.cpp b/src/io/bin.cpp index d5062995ed2e..04ef33614f4b 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -15,6 +15,7 @@ #include "dense_bin.hpp" #include "multi_val_dense_bin.hpp" +#include "multi_val_pairwise_lambdarank_bin.hpp" #include "multi_val_sparse_bin.hpp" #include "sparse_bin.hpp" #include "pairwise_lambdarank_bin.hpp" @@ -700,20 +701,25 @@ namespace LightGBM { } MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, - double sparse_rate, const std::vector& offsets) { + double sparse_rate, const std::vector& offsets, const bool use_pairwise_ranking) { if (sparse_rate >= multi_val_bin_sparse_threshold) { const double average_element_per_row = (1.0 - sparse_rate) * num_feature; + if (use_pairwise_ranking) { + Log::Fatal("Pairwise ranking with sparse row-wse bins is not supported yet.")'' + } return CreateMultiValSparseBin(num_data, num_bin, - average_element_per_row); + average_element_per_row, use_pairwise_ranking); } else { - return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets); + return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets, use_pairwise_ranking); } } MultiValBin* MultiValBin::CreateMultiValDenseBin(data_size_t num_data, int num_bin, int num_feature, - const std::vector& offsets) { + const std::vector& offsets, + const bool use_pairwise_ranking, + const data_size_t* paired_ranking_item_global_index_map) { // calculate max bin of all features to select the int type in MultiValDenseBin int max_bin = 0; for (int i = 0; i < static_cast(offsets.size()) - 1; ++i) { @@ -723,17 +729,31 @@ namespace LightGBM { } } if (max_bin <= 256) { - return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + if (use_pairwise_ranking) { + return new MultiValDensePairwiseLambdarankBin(num_data, num_bin, num_feature, offsets, paired_ranking_item_global_index_map); + } else { + return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + } } else if (max_bin <= 65536) { - return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + if (use_pairwise_ranking) { + return new MultiValDensePairwiseLambdarankBin(num_data, num_bin, num_feature, offsets, paired_ranking_item_global_index_map); + } else { + return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + } } else { - return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + if (use_pairwise_ranking) { + return new MultiValDensePairwiseLambdarankBin(num_data, num_bin, num_feature, offsets, paired_ranking_item_global_index_map); + } else { + return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + } } } MultiValBin* MultiValBin::CreateMultiValSparseBin(data_size_t num_data, int num_bin, - double estimate_element_per_row) { + double estimate_element_per_row, + const bool /*use_pairwise_ranking*/, + const data_size_t* /*paired_ranking_item_global_index_map*/) { size_t estimate_total_entries = static_cast(estimate_element_per_row * 1.1 * num_data); if (estimate_total_entries <= std::numeric_limits::max()) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c63feeacf4cb..94d4f67c0241 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -539,7 +539,7 @@ void PushDataToMultiValBin( } } -MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector& offsets) const { +MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector& offsets, const bool use_pairwise_ranking) const { Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures", global_timer); int multi_group_id = -1; @@ -577,13 +577,13 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector& sum_sparse_rate); std::unique_ptr ret; ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(), - num_feature, sum_sparse_rate, offsets)); + num_feature, sum_sparse_rate, offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map())); PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get()); ret->FinishLoad(); return ret.release(); } -MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& offsets) const { +MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& offsets, const bool use_pairwise_ranking) const { Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures", global_timer); int num_threads = OMP_NUM_THREADS(); @@ -628,7 +628,7 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of 1.0 - sum_dense_ratio); ret.reset(MultiValBin::CreateMultiValBin( num_data_, offsets.back(), static_cast(most_freq_bins.size()), - 1.0 - sum_dense_ratio, offsets)); + 1.0 - sum_dense_ratio, offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map())); PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get()); ret->FinishLoad(); return ret.release(); @@ -639,7 +639,8 @@ TrainingShareStates* Dataset::GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, bool force_col_wise, bool force_row_wise, - const int num_grad_quant_bins) const { + const int num_grad_quant_bins, + const bool use_pairwise_ranking) const { Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod", global_timer); if (force_col_wise && force_row_wise) { @@ -658,7 +659,7 @@ TrainingShareStates* Dataset::GetShareStates( std::vector offsets; share_state->CalcBinOffsets( feature_groups_, &offsets, true); - share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets), + share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets, use_pairwise_ranking), num_data_, feature_groups_, false, true, num_grad_quant_bins); share_state->is_col_wise = true; share_state->is_constant_hessian = is_constant_hessian; @@ -668,7 +669,7 @@ TrainingShareStates* Dataset::GetShareStates( std::vector offsets; share_state->CalcBinOffsets( feature_groups_, &offsets, false); - share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_, + share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets, use_pairwise_ranking), num_data_, feature_groups_, false, false, num_grad_quant_bins); share_state->is_col_wise = false; share_state->is_constant_hessian = is_constant_hessian; @@ -685,14 +686,14 @@ TrainingShareStates* Dataset::GetShareStates( auto start_time = std::chrono::steady_clock::now(); std::vector col_wise_offsets; col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true); - col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_, + col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets, use_pairwise_ranking), num_data_, feature_groups_, false, true, num_grad_quant_bins); col_wise_init_time = std::chrono::steady_clock::now() - start_time; start_time = std::chrono::steady_clock::now(); std::vector row_wise_offsets; row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false); - row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_, + row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets, use_pairwise_ranking), num_data_, feature_groups_, false, false, num_grad_quant_bins); row_wise_init_time = std::chrono::steady_clock::now() - start_time; @@ -753,19 +754,22 @@ template TrainingShareStates* Dataset::GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, bool force_col_wise, bool force_row_wise, - const int num_grad_quant_bins) const; + const int num_grad_quant_bins, + const bool use_pairwise_ranking) const; template TrainingShareStates* Dataset::GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, bool force_col_wise, bool force_row_wise, - const int num_grad_quant_bins) const; + const int num_grad_quant_bins, + const bool use_pairwise_ranking) const; template TrainingShareStates* Dataset::GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, bool force_col_wise, bool force_row_wise, - const int num_grad_quant_bins) const; + const int num_grad_quant_bins, + const bool use_pairwise_ranking) const; void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { feature_groups_.clear(); diff --git a/src/io/multi_val_pairwise_lambdarank_bin.hpp b/src/io/multi_val_pairwise_lambdarank_bin.hpp new file mode 100644 index 000000000000..3a3625e70379 --- /dev/null +++ b/src/io/multi_val_pairwise_lambdarank_bin.hpp @@ -0,0 +1,62 @@ +/*! + * Copyright (c) 2024 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_IO_MULTI_VAL_PAIRWISE_LAMBDARANK_BIN_HPP_ +#define LIGHTGBM_IO_MULTI_VAL_PAIRWISE_LAMBDARANK_BIN_HPP_ + +#include "multi_val_dense_bin.hpp" + +template +class MultiValPairwiseLambdarankBin : public MULTIVAL_BIN_TYPE { + protected: + const std::pair* paired_ranking_item_index_map_; +}; + + +template +class MultiValDensePairwiseLambdarankBin: public MultiValPairwiseLambdarankBin> { + public: + MultiValDensePairwiseLambdarankBin(data_size_t num_data, int num_bin, int num_feature, + const std::vector& offsets, const data_size_t* paired_ranking_item_global_index_map): MultiValDenseBin(num_data, num_bin, num_feature, offsets) { + paired_ranking_item_global_index_map_ = paired_ranking_item_global_index_map; + } + + template + void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, hist_t* out) const { + data_size_t i = start; + hist_t* grad = out; + hist_t* hess = out + 1; + + for (; i < end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const data_size_t first_idx = paired_ranking_item_index_map_[idx].first; + const data_size_t second_idx = paired_ranking_item_index_map_[idx].second; + const auto first_j_start = RowPtr(first_idx); + const VAL_T* first_data_ptr = data_.data() + first_j_start; + const score_t gradient = ORDERED ? gradients[i] : gradients[idx]; + const score_t hessian = ORDERED ? hessians[i] : hessians[idx]; + for (int j = 0; j < num_feature_; ++j) { + const uint32_t bin = static_cast(first_data_ptr[j]); + const auto ti = (bin + offsets_[j]) << 1; + grad[ti] += gradient; + hess[ti] += hessian; + } + + const auto second_j_start = RowPtr(second_idx); + const VAL_T* second_data_ptr = data_.data() + second_j_start; + const auto base_offset = offsets_.back(); + for (int j = 0; j < num_feature_; ++j) { + const uint32_t bin = static_cast(second_data_ptr[j]); + const auto ti = (bin + offsets_[j] + base_offset) << 1; + grad[ti] += gradient; + hess[ti] += hessian; + } + } + } +}; + + +#endif // LIGHTGBM_IO_MULTI_VAL_PAIRWISE_LAMBDARANK_BIN_HPP_ diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 6bf420a6aec3..4471534db7e1 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -86,12 +86,12 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, share_state_.reset(dataset->GetShareStates( reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr, col_sampler_.is_feature_used_bytree(), is_constant_hessian, - config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins)); + config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_ranking"))); } else { share_state_.reset(dataset->GetShareStates( ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(), is_constant_hessian, - config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins)); + config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_ranking"))); } } else { CHECK_NOTNULL(share_state_); @@ -100,12 +100,12 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, share_state_.reset(dataset->GetShareStates( reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr, col_sampler_.is_feature_used_bytree(), is_constant_hessian, - share_state_->is_col_wise, !share_state_->is_col_wise, config_->num_grad_quant_bins)); + share_state_->is_col_wise, !share_state_->is_col_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_ranking"))); } else { share_state_.reset(dataset->GetShareStates( ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(), is_constant_hessian, share_state_->is_col_wise, - !share_state_->is_col_wise, config_->num_grad_quant_bins)); + !share_state_->is_col_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_ranking"))); } } CHECK_NOTNULL(share_state_); From 406d0c1b70de0244b591f5e542314a9cef8edc97 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 20 Jun 2024 02:52:19 +0000 Subject: [PATCH 45/68] fix row wise in pairwise ranking --- include/LightGBM/bin.h | 8 ++-- include/LightGBM/dataset.h | 4 +- src/io/bin.cpp | 12 +++--- src/io/multi_val_pairwise_lambdarank_bin.hpp | 41 +++++++++++--------- 4 files changed, 36 insertions(+), 29 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 98cd33db6965..88244a2d4829 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -647,12 +647,14 @@ class MultiValBin { virtual bool IsSparse() = 0; static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin, - int num_feature, double sparse_rate, const std::vector& offsets, const bool use_pairwise_ranking); + int num_feature, double sparse_rate, const std::vector& offsets, const bool use_pairwise_ranking, + const std::pair* paired_ranking_item_global_index_map); static MultiValBin* CreateMultiValDenseBin(data_size_t num_data, int num_bin, - int num_feature, const std::vector& offsets, const bool use_pairwise_ranking); + int num_feature, const std::vector& offsets, const bool use_pairwise_ranking, + const std::pair* paired_ranking_item_global_index_map); - static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row, const bool use_pairwise_ranking); + static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row, const bool use_pairwise_ranking, const std::pair* paired_ranking_item_global_index_map); static constexpr double multi_val_bin_sparse_threshold = 0.25f; diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index fa2facfcabf3..a2bf2ccf9208 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -716,9 +716,9 @@ class Dataset { void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data); - MultiValBin* GetMultiBinFromSparseFeatures(const std::vector& offsets) const; + MultiValBin* GetMultiBinFromSparseFeatures(const std::vector& offsets, const bool use_pairwise_ranking) const; - MultiValBin* GetMultiBinFromAllFeatures(const std::vector& offsets) const; + MultiValBin* GetMultiBinFromAllFeatures(const std::vector& offsets, const bool use_pairwise_ranking) const; template TrainingShareStates* GetShareStates( diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 04ef33614f4b..10c63a898d84 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -701,16 +701,16 @@ namespace LightGBM { } MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, - double sparse_rate, const std::vector& offsets, const bool use_pairwise_ranking) { + double sparse_rate, const std::vector& offsets, const bool use_pairwise_ranking, const std::pair* paired_ranking_item_global_index_map) { if (sparse_rate >= multi_val_bin_sparse_threshold) { const double average_element_per_row = (1.0 - sparse_rate) * num_feature; if (use_pairwise_ranking) { - Log::Fatal("Pairwise ranking with sparse row-wse bins is not supported yet.")'' + Log::Fatal("Pairwise ranking with sparse row-wse bins is not supported yet."); } return CreateMultiValSparseBin(num_data, num_bin, - average_element_per_row, use_pairwise_ranking); + average_element_per_row, use_pairwise_ranking, paired_ranking_item_global_index_map); } else { - return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets, use_pairwise_ranking); + return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets, use_pairwise_ranking, paired_ranking_item_global_index_map); } } @@ -719,7 +719,7 @@ namespace LightGBM { int num_feature, const std::vector& offsets, const bool use_pairwise_ranking, - const data_size_t* paired_ranking_item_global_index_map) { + const std::pair* paired_ranking_item_global_index_map) { // calculate max bin of all features to select the int type in MultiValDenseBin int max_bin = 0; for (int i = 0; i < static_cast(offsets.size()) - 1; ++i) { @@ -753,7 +753,7 @@ namespace LightGBM { int num_bin, double estimate_element_per_row, const bool /*use_pairwise_ranking*/, - const data_size_t* /*paired_ranking_item_global_index_map*/) { + const std::pair* /*paired_ranking_item_global_index_map*/) { size_t estimate_total_entries = static_cast(estimate_element_per_row * 1.1 * num_data); if (estimate_total_entries <= std::numeric_limits::max()) { diff --git a/src/io/multi_val_pairwise_lambdarank_bin.hpp b/src/io/multi_val_pairwise_lambdarank_bin.hpp index 3a3625e70379..e09742bc40a6 100644 --- a/src/io/multi_val_pairwise_lambdarank_bin.hpp +++ b/src/io/multi_val_pairwise_lambdarank_bin.hpp @@ -8,19 +8,23 @@ #include "multi_val_dense_bin.hpp" -template -class MultiValPairwiseLambdarankBin : public MULTIVAL_BIN_TYPE { +namespace LightGBM { + +template class MULTI_VAL_BIN_TYPE> +class MultiValPairwiseLambdarankBin : public MULTI_VAL_BIN_TYPE { + public: + MultiValPairwiseLambdarankBin(data_size_t num_data, int num_bin, int num_feature, const std::vector& offsets): MULTI_VAL_BIN_TYPE(num_data, num_bin, num_feature, offsets) {} protected: - const std::pair* paired_ranking_item_index_map_; + const std::pair* paired_ranking_item_global_index_map_; }; -template -class MultiValDensePairwiseLambdarankBin: public MultiValPairwiseLambdarankBin> { +template +class MultiValDensePairwiseLambdarankBin: public MultiValPairwiseLambdarankBin { public: MultiValDensePairwiseLambdarankBin(data_size_t num_data, int num_bin, int num_feature, - const std::vector& offsets, const data_size_t* paired_ranking_item_global_index_map): MultiValDenseBin(num_data, num_bin, num_feature, offsets) { - paired_ranking_item_global_index_map_ = paired_ranking_item_global_index_map; + const std::vector& offsets, const std::pair* paired_ranking_item_global_index_map): MultiValPairwiseLambdarankBin(num_data, num_bin, num_feature, offsets) { + this->paired_ranking_item_global_index_map_ = paired_ranking_item_global_index_map; } template @@ -32,25 +36,25 @@ class MultiValDensePairwiseLambdarankBin: public MultiValPairwiseLambdarankBinpaired_ranking_item_global_index_map_[idx].first; + const data_size_t second_idx = this->paired_ranking_item_global_index_map_[idx].second; + const auto first_j_start = this->RowPtr(first_idx); + const BIN_TYPE* first_data_ptr = this->data_.data() + first_j_start; const score_t gradient = ORDERED ? gradients[i] : gradients[idx]; const score_t hessian = ORDERED ? hessians[i] : hessians[idx]; - for (int j = 0; j < num_feature_; ++j) { + for (int j = 0; j < this->num_feature_; ++j) { const uint32_t bin = static_cast(first_data_ptr[j]); - const auto ti = (bin + offsets_[j]) << 1; + const auto ti = (bin + this->offsets_[j]) << 1; grad[ti] += gradient; hess[ti] += hessian; } - const auto second_j_start = RowPtr(second_idx); - const VAL_T* second_data_ptr = data_.data() + second_j_start; - const auto base_offset = offsets_.back(); - for (int j = 0; j < num_feature_; ++j) { + const auto second_j_start = this->RowPtr(second_idx); + const BIN_TYPE* second_data_ptr = this->data_.data() + second_j_start; + const auto base_offset = this->offsets_.back(); + for (int j = 0; j < this->num_feature_; ++j) { const uint32_t bin = static_cast(second_data_ptr[j]); - const auto ti = (bin + offsets_[j] + base_offset) << 1; + const auto ti = (bin + this->offsets_[j] + base_offset) << 1; grad[ti] += gradient; hess[ti] += hessian; } @@ -58,5 +62,6 @@ class MultiValDensePairwiseLambdarankBin: public MultiValPairwiseLambdarankBin Date: Thu, 20 Jun 2024 05:21:57 +0000 Subject: [PATCH 46/68] save for debug --- src/io/bin.cpp | 3 +- src/io/dataset.cpp | 45 +++++++++++++++----- src/io/multi_val_dense_bin.hpp | 2 +- src/io/multi_val_pairwise_lambdarank_bin.hpp | 24 ++++++++++- src/main.cpp | 12 +++++- src/treelearner/serial_tree_learner.cpp | 8 ++-- 6 files changed, 75 insertions(+), 19 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 10c63a898d84..a6140053c709 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -705,7 +705,8 @@ namespace LightGBM { if (sparse_rate >= multi_val_bin_sparse_threshold) { const double average_element_per_row = (1.0 - sparse_rate) * num_feature; if (use_pairwise_ranking) { - Log::Fatal("Pairwise ranking with sparse row-wse bins is not supported yet."); + Log::Warning("Pairwise ranking with sparse row-wse bins is not supported yet."); + return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets, use_pairwise_ranking, paired_ranking_item_global_index_map); } return CreateMultiValSparseBin(num_data, num_bin, average_element_per_row, use_pairwise_ranking, paired_ranking_item_global_index_map); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 94d4f67c0241..6cc6b24fa109 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -496,6 +496,7 @@ void PushDataToMultiValBin( Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin", global_timer); if (ret->IsSparse()) { + Log::Fatal("pairwise ranking with sparse multi val bin is not supported."); Threading::For( 0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) { std::vector cur_data; @@ -524,16 +525,17 @@ void PushDataToMultiValBin( Threading::For( 0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) { std::vector cur_data(most_freq_bins.size(), 0); - for (size_t j = 0; j < most_freq_bins.size(); ++j) { - (*iters)[tid][j]->Reset(start); - } + // for (size_t j = 0; j < most_freq_bins.size(); ++j) { + // Log::Warning("(*iters)[%d].size() = %d, j = %d, start = %d", tid, (*iters)[tid].size(), j, start); + // (*iters)[tid][j]->Reset(start); + // } for (data_size_t i = start; i < end; ++i) { for (size_t j = 0; j < most_freq_bins.size(); ++j) { // for dense multi value bin, the feature bin values without offsets are used - auto cur_bin = (*iters)[tid][j]->Get(i); - cur_data[j] = cur_bin; + // auto cur_bin = (*iters)[tid][j]->Get(i); + // cur_data[j] = cur_bin; } - ret->PushOneRow(tid, i, cur_data); + // ret->PushOneRow(tid, i, cur_data); } }); } @@ -626,10 +628,33 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of CHECK(static_cast(most_freq_bins.size()) == ncol); Log::Debug("Dataset::GetMultiBinFromAllFeatures: sparse rate %f", 1.0 - sum_dense_ratio); - ret.reset(MultiValBin::CreateMultiValBin( - num_data_, offsets.back(), static_cast(most_freq_bins.size()), - 1.0 - sum_dense_ratio, offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map())); - PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get()); + if (use_pairwise_ranking) { + + for (size_t i = 0; i < iters.size(); ++i) { + for (size_t j = 0; j < iters[i].size(); ++j) { + Log::Warning("i = %d, j = %d, iters[i][j] = %d", static_cast(iters[i][j] == nullptr)); + } + } + + const int num_original_features = static_cast(most_freq_bins.size()) / 2; + std::vector original_most_freq_bins; + std::vector original_offsets; + for (int i = 0; i < num_original_features; ++i) { + original_most_freq_bins.push_back(most_freq_bins[i]); + original_offsets.push_back(offsets[i]); + } + original_offsets.push_back(offsets[num_original_features]); + const data_size_t num_original_data = metadata_.query_boundaries()[metadata_.num_queries()]; + ret.reset(MultiValBin::CreateMultiValBin( + num_original_data, original_offsets.back(), num_original_features, + 1.0 - sum_dense_ratio, original_offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map())); + PushDataToMultiValBin(num_original_features, original_most_freq_bins, original_offsets, &iters, ret.get()); + } else { + ret.reset(MultiValBin::CreateMultiValBin( + num_data_, offsets.back(), static_cast(most_freq_bins.size()), + 1.0 - sum_dense_ratio, offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map())); + PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get()); + } ret->FinishLoad(); return ret.release(); } diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 2bab45f044cd..60df38acfb0f 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -336,7 +336,7 @@ class MultiValDenseBin : public MultiValBin { uint8_t* data_ptr_bit_type) const override; #endif // USE_CUDA - private: + protected: data_size_t num_data_; int num_bin_; int num_feature_; diff --git a/src/io/multi_val_pairwise_lambdarank_bin.hpp b/src/io/multi_val_pairwise_lambdarank_bin.hpp index e09742bc40a6..9b9a6333f54e 100644 --- a/src/io/multi_val_pairwise_lambdarank_bin.hpp +++ b/src/io/multi_val_pairwise_lambdarank_bin.hpp @@ -27,13 +27,35 @@ class MultiValDensePairwiseLambdarankBin: public MultiValPairwiseLambdarankBinpaired_ranking_item_global_index_map_ = paired_ranking_item_global_index_map; } + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* hessians, hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, + gradients, hessians, out); + } + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner( + nullptr, start, end, gradients, hessians, out); + } + + void ConstructHistogramOrdered(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, + gradients, hessians, out); + } + template void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* gradients, const score_t* hessians, hist_t* out) const { data_size_t i = start; hist_t* grad = out; hist_t* hess = out + 1; - for (; i < end; ++i) { const auto idx = USE_INDICES ? data_indices[i] : i; const data_size_t first_idx = this->paired_ranking_item_global_index_map_[idx].first; diff --git a/src/main.cpp b/src/main.cpp index ecd8dd77ed02..ea05febd156b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,10 +10,18 @@ #include "network/linkers.h" #endif -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { bool success = false; try { - LightGBM::Application app(argc, argv); + + std::string config_str = std::string("config=train.conf"); + char* argv = new char[config_str.size() + 1]; + for (size_t i = 0; i < config_str.size(); ++i) { + argv[i] = config_str[i]; + } + argv[config_str.size()] = '\0'; + + LightGBM::Application app(2, &argv - 1); app.Run(); #ifdef USE_MPI diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 4471534db7e1..b6760b46bca7 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -86,12 +86,12 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, share_state_.reset(dataset->GetShareStates( reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr, col_sampler_.is_feature_used_bytree(), is_constant_hessian, - config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_ranking"))); + config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_lambdarank"))); } else { share_state_.reset(dataset->GetShareStates( ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(), is_constant_hessian, - config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_ranking"))); + config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_lambdarank"))); } } else { CHECK_NOTNULL(share_state_); @@ -100,12 +100,12 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, share_state_.reset(dataset->GetShareStates( reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr, col_sampler_.is_feature_used_bytree(), is_constant_hessian, - share_state_->is_col_wise, !share_state_->is_col_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_ranking"))); + share_state_->is_col_wise, !share_state_->is_col_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_lambdarank"))); } else { share_state_.reset(dataset->GetShareStates( ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(), is_constant_hessian, share_state_->is_col_wise, - !share_state_->is_col_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_ranking"))); + !share_state_->is_col_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_lambdarank"))); } } CHECK_NOTNULL(share_state_); From 773891556531252122b2e536c83838ea459c3def Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 28 Jun 2024 03:46:17 +0000 Subject: [PATCH 47/68] update code for debug --- include/LightGBM/bin.h | 6 +++ .../LightGBM/pairwise_ranking_feature_group.h | 9 +++- src/io/bin.cpp | 4 +- src/io/dataset.cpp | 41 +++++++++++++++---- src/io/multi_val_dense_bin.hpp | 12 ++++++ src/io/pairwise_lambdarank_bin.hpp | 4 ++ 6 files changed, 63 insertions(+), 13 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 88244a2d4829..2adf3cf0ff81 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -306,6 +306,10 @@ class Bin { */ virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const = 0; + virtual BinIterator* GetUnpairedIterator(uint32_t /* min_bin */, uint32_t /* max_bin */, uint32_t /* most_freq_bin */) const { + return nullptr; + } + /*! * \brief Save binary data to file * \param file File want to write @@ -554,6 +558,8 @@ class MultiValBin { const data_size_t* used_indices, data_size_t num_used_indices) = 0; + virtual void DumpContent() const {} + virtual MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double estimate_element_per_row, diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h index 8879bf1f8c66..b08b3c8bbf7d 100644 --- a/include/LightGBM/pairwise_ranking_feature_group.h +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -76,8 +76,13 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { } inline BinIterator* FeatureGroupIterator() { - // TODO(shiyu1994) - return nullptr; + if (is_multi_val_) { + return nullptr; + } + uint32_t min_bin = bin_offsets_[0]; + uint32_t max_bin = bin_offsets_.back() - 1; + uint32_t most_freq_bin = 0; + return bin_data_->GetUnpairedIterator(min_bin, max_bin, most_freq_bin); } /*! diff --git a/src/io/bin.cpp b/src/io/bin.cpp index a6140053c709..a1eafad72dbf 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -704,10 +704,10 @@ namespace LightGBM { double sparse_rate, const std::vector& offsets, const bool use_pairwise_ranking, const std::pair* paired_ranking_item_global_index_map) { if (sparse_rate >= multi_val_bin_sparse_threshold) { const double average_element_per_row = (1.0 - sparse_rate) * num_feature; - if (use_pairwise_ranking) { + // if (use_pairwise_ranking) { Log::Warning("Pairwise ranking with sparse row-wse bins is not supported yet."); return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets, use_pairwise_ranking, paired_ranking_item_global_index_map); - } + // } return CreateMultiValSparseBin(num_data, num_bin, average_element_per_row, use_pairwise_ranking, paired_ranking_item_global_index_map); } else { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 6cc6b24fa109..36be4446f717 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -18,6 +18,8 @@ #include #include +#include + namespace LightGBM { const int Dataset::kSerializedReferenceVersionLength = 2; @@ -496,7 +498,7 @@ void PushDataToMultiValBin( Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin", global_timer); if (ret->IsSparse()) { - Log::Fatal("pairwise ranking with sparse multi val bin is not supported."); + // Log::Fatal("pairwise ranking with sparse multi val bin is not supported."); Threading::For( 0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) { std::vector cur_data; @@ -525,17 +527,17 @@ void PushDataToMultiValBin( Threading::For( 0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) { std::vector cur_data(most_freq_bins.size(), 0); - // for (size_t j = 0; j < most_freq_bins.size(); ++j) { - // Log::Warning("(*iters)[%d].size() = %d, j = %d, start = %d", tid, (*iters)[tid].size(), j, start); - // (*iters)[tid][j]->Reset(start); - // } + for (size_t j = 0; j < most_freq_bins.size(); ++j) { + //Log::Warning("(*iters)[%d].size() = %d, j = %d, start = %d", tid, (*iters)[tid].size(), j, start); + (*iters)[tid][j]->Reset(start); + } for (data_size_t i = start; i < end; ++i) { for (size_t j = 0; j < most_freq_bins.size(); ++j) { // for dense multi value bin, the feature bin values without offsets are used - // auto cur_bin = (*iters)[tid][j]->Get(i); - // cur_data[j] = cur_bin; + auto cur_bin = (*iters)[tid][j]->Get(i); + cur_data[j] = cur_bin; } - // ret->PushOneRow(tid, i, cur_data); + ret->PushOneRow(tid, i, cur_data); } }); } @@ -632,7 +634,7 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of for (size_t i = 0; i < iters.size(); ++i) { for (size_t j = 0; j < iters[i].size(); ++j) { - Log::Warning("i = %d, j = %d, iters[i][j] = %d", static_cast(iters[i][j] == nullptr)); + Log::Warning("i = %ld, j = %ld, iters[i][j] = %d", i, j, static_cast(iters[i][j] == nullptr)); } } @@ -644,6 +646,16 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of original_offsets.push_back(offsets[i]); } original_offsets.push_back(offsets[num_original_features]); + std::ofstream fout("mutli_val_bin_meta_info_pairwise.txt"); + fout << "original_most_freq_bins" << std::endl; + for (size_t i = 0; i < original_most_freq_bins.size(); ++i) { + fout << original_most_freq_bins[i] << std::endl; + } + fout << "original_offsets" << std::endl; + for (size_t i = 0; i < original_offsets.size(); ++i) { + fout << original_offsets[i] << std::endl; + } + fout.close(); const data_size_t num_original_data = metadata_.query_boundaries()[metadata_.num_queries()]; ret.reset(MultiValBin::CreateMultiValBin( num_original_data, original_offsets.back(), num_original_features, @@ -654,8 +666,19 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of num_data_, offsets.back(), static_cast(most_freq_bins.size()), 1.0 - sum_dense_ratio, offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map())); PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get()); + std::ofstream fout("mutli_val_bin_meta_info_no_pairwise.txt"); + fout << "original_most_freq_bins" << std::endl; + for (size_t i = 0; i < most_freq_bins.size(); ++i) { + fout << most_freq_bins[i] << std::endl; + } + fout << "original_offsets" << std::endl; + for (size_t i = 0; i < offsets.size(); ++i) { + fout << offsets[i] << std::endl; + } + fout.close(); } ret->FinishLoad(); + ret->DumpContent(); return ret.release(); } diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 60df38acfb0f..899de8322d66 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace LightGBM { @@ -26,6 +27,17 @@ class MultiValDenseBin : public MultiValBin { data_.resize(static_cast(num_data_) * num_feature_, static_cast(0)); } + void DumpContent() const override { + std::ofstream fout("multi_val_bin.txt"); + for (data_size_t i = 0; i < num_data_; ++i) { + for (data_size_t j = 0; j < num_feature_; ++j) { + fout << static_cast(data_[i * num_feature_ + j]) << " "; + } + fout << std::endl; + } + fout.close(); + } + ~MultiValDenseBin() { } diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index 81373fc1f799..6695694d9703 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -222,6 +222,10 @@ class PairwiseRankingBin: public BIN_TYPE { return unpaired_bin_->get_data(); } + BinIterator* GetUnpairedIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin); + } + void ReSize(data_size_t num_data) override; data_size_t Split(uint32_t /*min_bin*/, uint32_t /*max_bin*/, From d6c16df856109b64f00c0f743004fe0d24b2a588 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 4 Jul 2024 02:35:18 +0000 Subject: [PATCH 48/68] save changes --- src/io/dataset.cpp | 4 ++++ src/io/pairwise_lambdarank_bin.hpp | 6 ++++++ src/main.cpp | 2 +- src/treelearner/serial_tree_learner.cpp | 2 +- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 36be4446f717..4ebf114fafd4 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -366,6 +366,8 @@ void Dataset::Construct(std::vector>* bin_mappers, is_sparse = false; } + is_sparse = false; + std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); @@ -707,6 +709,7 @@ TrainingShareStates* Dataset::GetShareStates( std::vector offsets; share_state->CalcBinOffsets( feature_groups_, &offsets, true); + Log::Warning("feature_groups_.size() = %ld, offsets.size() = %ld", feature_groups_.size(), offsets.size()); share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets, use_pairwise_ranking), num_data_, feature_groups_, false, true, num_grad_quant_bins); share_state->is_col_wise = true; @@ -717,6 +720,7 @@ TrainingShareStates* Dataset::GetShareStates( std::vector offsets; share_state->CalcBinOffsets( feature_groups_, &offsets, false); + Log::Warning("feature_groups_.size() = %ld, offsets.size() = %ld", feature_groups_.size(), offsets.size()); share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets, use_pairwise_ranking), num_data_, feature_groups_, false, false, num_grad_quant_bins); share_state->is_col_wise = false; diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index 6695694d9703..0e481c78f37f 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -621,6 +621,12 @@ class SparsePairwiseRankingDiffBin: public SparsePairwiseRankingBin>* ori_bin_mappers_; }; +template +class MultiValPairwiseBin : public MULTI_VAL_BIN_TYPE { + public: + +}; + } // namespace LightGBM diff --git a/src/main.cpp b/src/main.cpp index ea05febd156b..10ccb3d82b6a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ int main(int /*argc*/, char** /*argv*/) { bool success = false; try { - std::string config_str = std::string("config=train.conf"); + std::string config_str = std::string("config=train_pairwise_lambdarank.conf"); char* argv = new char[config_str.size() + 1]; for (size_t i = 0; i < config_str.size(); ++i) { argv[i] = config_str[i]; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index b6760b46bca7..35dcc6bc4156 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -126,7 +126,7 @@ void SerialTreeLearner::ResetTrainingDataInner(const Dataset* train_data, data_partition_->ResetNumData(num_data_); if (reset_multi_val_bin) { col_sampler_.SetTrainingData(train_data_); - GetShareStates(train_data_, is_constant_hessian, false); + GetShareStates(train_data_, is_constant_hessian, config_->objective == std::string("pairwise_lambdarank")); } // initialize ordered gradients and hessians From 0d572d7a550a0f113e8d392726a405530dc92cc0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 8 Jul 2024 16:45:37 +0000 Subject: [PATCH 49/68] save changes for debug --- include/LightGBM/bin.h | 2 + include/LightGBM/feature_group.h | 26 ++++----- src/io/bin.cpp | 5 +- src/io/dataset.cpp | 14 +++-- src/io/multi_val_pairwise_lambdarank_bin.hpp | 14 ++++- src/io/pairwise_lambdarank_bin.cpp | 6 ++ src/io/pairwise_ranking_feature_group.cpp | 60 ++++++++++---------- src/io/sparse_bin.hpp | 1 + src/treelearner/feature_histogram.hpp | 16 ++++++ src/treelearner/serial_tree_learner.cpp | 25 +++++--- src/treelearner/serial_tree_learner.h | 4 +- 11 files changed, 111 insertions(+), 62 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 2adf3cf0ff81..8374e1e57b3a 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -537,6 +537,8 @@ class Bin { virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const = 0; virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0; + + int group_index_ = -1; }; diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index b492a3031fff..0cbe901e35dc 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -587,26 +587,26 @@ class FeatureGroup { multi_bin_data_.clear(); for (int i = 0; i < num_feature_; ++i) { int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1; - if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { - multi_bin_data_.emplace_back(Bin::CreateSparseBin( - num_data, bin_mappers_[i]->num_bin() + addi)); - } else { + // if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { + // multi_bin_data_.emplace_back(Bin::CreateSparseBin( + // num_data, bin_mappers_[i]->num_bin() + addi)); + // } else { multi_bin_data_.emplace_back( Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi)); - } + // } } is_multi_val_ = true; } else { - if (force_sparse || - (!force_dense && num_feature_ == 1 && - bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { - is_sparse_ = true; - bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_)); - } else { + // if (force_sparse || + // (!force_dense && num_feature_ == 1 && + // bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { + // is_sparse_ = true; + // bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_)); + // } else { is_sparse_ = false; bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_)); - } - is_multi_val_ = false; + // } + // is_multi_val_ = false; } } diff --git a/src/io/bin.cpp b/src/io/bin.cpp index a1eafad72dbf..326df0ec5f79 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -707,9 +707,10 @@ namespace LightGBM { // if (use_pairwise_ranking) { Log::Warning("Pairwise ranking with sparse row-wse bins is not supported yet."); return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets, use_pairwise_ranking, paired_ranking_item_global_index_map); + // } else { + // return CreateMultiValSparseBin(num_data, num_bin, + // average_element_per_row, use_pairwise_ranking, paired_ranking_item_global_index_map); // } - return CreateMultiValSparseBin(num_data, num_bin, - average_element_per_row, use_pairwise_ranking, paired_ranking_item_global_index_map); } else { return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets, use_pairwise_ranking, paired_ranking_item_global_index_map); } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 4ebf114fafd4..0fbbea24b407 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -499,6 +499,7 @@ void PushDataToMultiValBin( MultiValBin* ret) { Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin", global_timer); + Log::Warning("num_data = %d", num_data); if (ret->IsSparse()) { // Log::Fatal("pairwise ranking with sparse multi val bin is not supported."); Threading::For( @@ -634,11 +635,11 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of 1.0 - sum_dense_ratio); if (use_pairwise_ranking) { - for (size_t i = 0; i < iters.size(); ++i) { - for (size_t j = 0; j < iters[i].size(); ++j) { - Log::Warning("i = %ld, j = %ld, iters[i][j] = %d", i, j, static_cast(iters[i][j] == nullptr)); - } - } + // for (size_t i = 0; i < iters.size(); ++i) { + // for (size_t j = 0; j < iters[i].size(); ++j) { + // Log::Warning("i = %ld, j = %ld, iters[i][j] = %d", i, j, static_cast(iters[i][j] == nullptr)); + // } + // } const int num_original_features = static_cast(most_freq_bins.size()) / 2; std::vector original_most_freq_bins; @@ -662,7 +663,7 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of ret.reset(MultiValBin::CreateMultiValBin( num_original_data, original_offsets.back(), num_original_features, 1.0 - sum_dense_ratio, original_offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map())); - PushDataToMultiValBin(num_original_features, original_most_freq_bins, original_offsets, &iters, ret.get()); + PushDataToMultiValBin(num_original_data, original_most_freq_bins, original_offsets, &iters, ret.get()); } else { ret.reset(MultiValBin::CreateMultiValBin( num_data_, offsets.back(), static_cast(most_freq_bins.size()), @@ -1632,6 +1633,7 @@ void Dataset::ConstructHistogramsInner( OMP_LOOP_EX_BEGIN(); int group = used_dense_group[gi]; const int num_bin = feature_groups_[group]->num_total_bin_; + feature_groups_[group]->bin_data_->group_index_ = gi; if (USE_QUANT_GRAD) { if (HIST_BITS == 16) { auto data_ptr = reinterpret_cast(reinterpret_cast(hist_data) + group_bin_boundaries_[group]); diff --git a/src/io/multi_val_pairwise_lambdarank_bin.hpp b/src/io/multi_val_pairwise_lambdarank_bin.hpp index 9b9a6333f54e..599593b56dbb 100644 --- a/src/io/multi_val_pairwise_lambdarank_bin.hpp +++ b/src/io/multi_val_pairwise_lambdarank_bin.hpp @@ -13,7 +13,9 @@ namespace LightGBM { template class MULTI_VAL_BIN_TYPE> class MultiValPairwiseLambdarankBin : public MULTI_VAL_BIN_TYPE { public: - MultiValPairwiseLambdarankBin(data_size_t num_data, int num_bin, int num_feature, const std::vector& offsets): MULTI_VAL_BIN_TYPE(num_data, num_bin, num_feature, offsets) {} + MultiValPairwiseLambdarankBin(data_size_t num_data, int num_bin, int num_feature, const std::vector& offsets): MULTI_VAL_BIN_TYPE(num_data, num_bin, num_feature, offsets) { + this->num_bin_ = num_bin * 2; + } protected: const std::pair* paired_ranking_item_global_index_map_; }; @@ -66,6 +68,13 @@ class MultiValDensePairwiseLambdarankBin: public MultiValPairwiseLambdarankBinnum_feature_; ++j) { const uint32_t bin = static_cast(first_data_ptr[j]); + // if (bin != 0) { + // Log::Warning("first bin = %d, num_feature_ = %d", bin, this->num_feature_); + // } + if (j == 0) { + Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", j, bin, gradient, hessian); + } + const auto ti = (bin + this->offsets_[j]) << 1; grad[ti] += gradient; hess[ti] += hessian; @@ -76,6 +85,9 @@ class MultiValDensePairwiseLambdarankBin: public MultiValPairwiseLambdarankBinoffsets_.back(); for (int j = 0; j < this->num_feature_; ++j) { const uint32_t bin = static_cast(second_data_ptr[j]); + // if (bin != 0) { + // Log::Warning("second bin = %d, num_feature_ = %d", bin, this->num_feature_); + // } const auto ti = (bin + this->offsets_[j] + base_offset) << 1; grad[ti] += gradient; hess[ti] += hessian; diff --git a/src/io/pairwise_lambdarank_bin.cpp b/src/io/pairwise_lambdarank_bin.cpp index 1447db2c6b8f..87d61b8e7e4c 100644 --- a/src/io/pairwise_lambdarank_bin.cpp +++ b/src/io/pairwise_lambdarank_bin.cpp @@ -98,6 +98,9 @@ void DensePairwiseRankingBin::ConstructHistogramI for (; i < pf_end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; const auto ti = GetBinAt(paired_idx) << 1; + if (this->group_index_ == 0) { + Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", this->group_index_, ti / 2, ordered_gradients[i], ordered_hessians[i]); + } if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; @@ -110,6 +113,9 @@ void DensePairwiseRankingBin::ConstructHistogramI for (; i < end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; const auto ti = GetBinAt(paired_idx) << 1; + if (this->group_index_ == 0) { + Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", this->group_index_, ti / 2, ordered_gradients[i], ordered_hessians[i]); + } if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index 7a28223ca71c..8a02e31a8d56 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -35,43 +35,43 @@ void PairwiseRankingFeatureGroup::CreateBinData(int num_data, bool is_multi_val, multi_bin_data_.clear(); for (int i = 0; i < num_feature_; ++i) { int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1; - if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { - if (is_first_or_second_in_pairing_ == 0) { - multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingFirstBin( - num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); - } else { - multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingSecondBin( - num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); - } + // if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { + // if (is_first_or_second_in_pairing_ == 0) { + // multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingFirstBin( + // num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + // } else { + // multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingSecondBin( + // num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + // } + // } else { + if (is_first_or_second_in_pairing_ == 0) { + multi_bin_data_.emplace_back( + Bin::CreateDensePairwiseRankingFirstBin(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); } else { - if (is_first_or_second_in_pairing_ == 0) { - multi_bin_data_.emplace_back( - Bin::CreateDensePairwiseRankingFirstBin(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); - } else { - multi_bin_data_.emplace_back( - Bin::CreateDensePairwiseRankingSecondBin(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); - } + multi_bin_data_.emplace_back( + Bin::CreateDensePairwiseRankingSecondBin(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); } + // } } is_multi_val_ = true; } else { - if (force_sparse || - (!force_dense && num_feature_ == 1 && - bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { - is_sparse_ = true; - if (is_first_or_second_in_pairing_ == 0) { - bin_data_.reset(Bin::CreateSparsePairwiseRankingFirstBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); - } else { - bin_data_.reset(Bin::CreateSparsePairwiseRankingSecondBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); - } + // if (force_sparse || + // (!force_dense && num_feature_ == 1 && + // bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { + // is_sparse_ = true; + // if (is_first_or_second_in_pairing_ == 0) { + // bin_data_.reset(Bin::CreateSparsePairwiseRankingFirstBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); + // } else { + // bin_data_.reset(Bin::CreateSparsePairwiseRankingSecondBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); + // } + // } else { + is_sparse_ = false; + if (is_first_or_second_in_pairing_ == 0) { + bin_data_.reset(Bin::CreateDensePairwiseRankingFirstBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); } else { - is_sparse_ = false; - if (is_first_or_second_in_pairing_ == 0) { - bin_data_.reset(Bin::CreateDensePairwiseRankingFirstBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); - } else { - bin_data_.reset(Bin::CreateDensePairwiseRankingSecondBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); - } + bin_data_.reset(Bin::CreateDensePairwiseRankingSecondBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); } + // } is_multi_val_ = false; } } diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index f7137d29ffd9..da89fd013e10 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -77,6 +77,7 @@ class SparseBin : public Bin { explicit SparseBin(data_size_t num_data) : num_data_(num_data) { int num_threads = OMP_NUM_THREADS(); push_buffers_.resize(num_threads); + Log::Warning("sparse bin is created !!!"); } ~SparseBin() {} diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 70dd0fb5436f..8eb458c83680 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -20,6 +20,8 @@ #include "monotone_constraints.hpp" #include "split_info.hpp" +#include + namespace LightGBM { class FeatureMetainfo { @@ -1501,6 +1503,7 @@ class HistogramPool { } OMP_THROW_EX(); } + offsets_ = offsets; } void ResetConfig(const Dataset* train_data, const Config* config) { @@ -1522,6 +1525,18 @@ class HistogramPool { } } + void DumpContent() const { + std::ofstream fout("historam_wise.txt"); + int cur_offsets_ptr = 0; + for (int i = 0; i < data_[0].size() / 2; ++i) { + if (i == offsets_[cur_offsets_ptr]) { + fout << "offset " << cur_offsets_ptr << " " << offsets_[cur_offsets_ptr] << " " << feature_metas_[cur_offsets_ptr].num_bin << " " << static_cast(feature_metas_[cur_offsets_ptr].offset) << std::endl; + ++cur_offsets_ptr; + } + fout << i << " " << data_[0][2 * i] << " " << data_[0][2 * i + 1] << std::endl; + } + } + /*! * \brief Get data for the specific index * \param idx which index want to get @@ -1591,6 +1606,7 @@ class HistogramPool { std::vector inverse_mapper_; std::vector last_used_time_; int cur_time_ = 0; + std::vector offsets_; }; } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 35dcc6bc4156..8a3e799cf3cb 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -756,6 +756,9 @@ std::set SerialTreeLearner::FindAllForceFeatures(Json force_split_leaf_sett void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf, bool update_cnt) { Common::FunctionTimer fun_timer("SerialTreeLearner::SplitInner", global_timer); + + histogram_pool_.DumpContent(); + SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); @@ -843,7 +846,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, // init the leaves that used on next iteration if (!config_->use_quantized_grad) { if (best_split_info.left_count < best_split_info.right_count) { - CHECK_GT(best_split_info.left_count, 0); + // CHECK_GT(best_split_info.left_count, 0); smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, @@ -853,7 +856,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, best_split_info.right_sum_hessian, best_split_info.right_output); } else { - CHECK_GT(best_split_info.right_count, 0); + // CHECK_GT(best_split_info.right_count, 0); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, @@ -865,7 +868,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, } } else { if (best_split_info.left_count < best_split_info.right_count) { - CHECK_GT(best_split_info.left_count, 0); + // CHECK_GT(best_split_info.left_count, 0); smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, @@ -877,7 +880,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, best_split_info.right_sum_gradient_and_hessian, best_split_info.right_output); } else { - CHECK_GT(best_split_info.right_count, 0); + // CHECK_GT(best_split_info.right_count, 0); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, @@ -896,9 +899,9 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, data_partition_->leaf_count(*right_leaf)); } - #ifdef DEBUG + // #ifdef DEBUG CheckSplit(best_split_info, *left_leaf, *right_leaf); - #endif + // #endif auto leaves_need_update = constraints_->Update( is_numerical_split, *left_leaf, *right_leaf, @@ -1057,7 +1060,7 @@ std::vector node_used_features = col_sampler_.GetByNode(tree, leaf); *split = bests[best_idx]; } -#ifdef DEBUG +// #ifdef DEBUG void SerialTreeLearner::CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index) { data_size_t num_data_in_left = 0; data_size_t num_data_in_right = 0; @@ -1114,6 +1117,12 @@ void SerialTreeLearner::CheckSplit(const SplitInfo& best_split_info, const int l sum_right_gradient += gradients_[index]; sum_right_hessian += hessians_[index]; } + Log::Warning("num_data_in_left = %d, best_split_info.left_count = %d", num_data_in_left, best_split_info.left_count); + Log::Warning("num_data_in_right = %d, best_split_info.right_count = %d", num_data_in_right, best_split_info.right_count); + Log::Warning("sum_left_gradient = %f, best_split_info.left_sum_gradient = %f", sum_left_gradient, best_split_info.left_sum_gradient); + Log::Warning("sum_left_hessian = %f, best_split_info.sum_left_hessian = %f", sum_left_hessian, best_split_info.left_sum_hessian); + Log::Warning("sum_right_gradient = %f, best_split_info.sum_right_gradient = %f", sum_right_gradient, best_split_info.right_sum_gradient); + Log::Warning("sum_right_hessian = %f, best_split_info.sum_right_hessian = %f", sum_right_hessian, best_split_info.right_sum_hessian); CHECK_EQ(num_data_in_left, best_split_info.left_count); CHECK_EQ(num_data_in_right, best_split_info.right_count); CHECK_LE(std::fabs(sum_left_gradient - best_split_info.left_sum_gradient), 1e-3); @@ -1123,6 +1132,6 @@ void SerialTreeLearner::CheckSplit(const SplitInfo& best_split_info, const int l Log::Warning("============================ pass split check ============================"); } } -#endif +// #endif } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 43ff6a4b1e13..e1ec2100ddea 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -171,9 +171,9 @@ class SerialTreeLearner: public TreeLearner { std::set FindAllForceFeatures(Json force_split_leaf_setting); - #ifdef DEBUG + // #ifdef DEBUG void CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index); - #endif + // #endif /*! * \brief Get the number of data in a leaf From 1f59f8525baaf5bb8626490b407c96d72ebcc2f6 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 21 Aug 2024 12:51:09 +0000 Subject: [PATCH 50/68] save changes --- include/LightGBM/feature_group.h | 3 +++ include/LightGBM/objective_function.h | 5 +++++ src/boosting/bagging.hpp | 5 ++++- src/boosting/gbdt.cpp | 3 ++- src/io/dataset.cpp | 19 +++++++++++++------ src/io/dense_bin.hpp | 4 ++++ src/io/multi_val_pairwise_lambdarank_bin.hpp | 6 +++--- src/io/pairwise_lambdarank_bin.cpp | 13 +++++++------ src/io/sparse_bin.hpp | 2 ++ src/io/train_share_states.cpp | 7 +++++-- src/treelearner/serial_tree_learner.cpp | 4 ++-- 11 files changed, 50 insertions(+), 21 deletions(-) diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 0cbe901e35dc..53a501cd149b 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -286,7 +286,10 @@ class FeatureGroup { } inline void CopySubrowByCol(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices, int fidx) { + Log::Warning("in CopySubrowByCol"); if (!is_multi_val_) { + Log::Warning("is not multi val"); + Log::Warning("full_feature->bin_data_.get() = %ld", full_feature->bin_data_.get()); bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices); } else { multi_bin_data_[fidx]->CopySubrow(full_feature->multi_bin_data_[fidx].get(), used_indices, num_used_indices); diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 203b3e158a36..8a6de72a276b 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -108,6 +108,11 @@ class ObjectiveFunction { virtual bool NeedConvertOutputCUDA () const { return false; } #endif // USE_CUDA + + virtual void SetDataIndices(const data_size_t* used_data_indices) const { used_data_indices_ = used_data_indices; } + + private: + mutable const data_size_t* used_data_indices_ = nullptr; }; void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointwise, const double* score_pairwise, data_size_t cnt_pointwise, diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 4c2c81553e7c..f61a14b82ab8 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -60,6 +60,7 @@ class BaggingSampleStrategy : public SampleStrategy { } else { // get subset tmp_subset_->ReSize(bag_data_cnt_); + Log::Warning("bag_data_indices_.size() = %ld, bag_data_cnt_ = %d", bag_data_indices_.size(), bag_data_cnt_); tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); #ifdef USE_CUDA @@ -119,8 +120,10 @@ class BaggingSampleStrategy : public SampleStrategy { (static_cast(bag_data_cnt_) / num_data_) / config_->bagging_freq; is_use_subset_ = false; if (config_->device_type != std::string("cuda")) { - const int group_threshold_usesubset = 100; + const int group_threshold_usesubset = 200; const double average_bag_rate_threshold = 0.5; + Log::Warning("train_data_->num_feature_groups() = %d", train_data_->num_feature_groups()); + Log::Warning("average_bag_rate = %f", average_bag_rate); if (average_bag_rate <= average_bag_rate_threshold && (train_data_->num_feature_groups() < group_threshold_usesubset)) { if (tmp_subset_ == nullptr || is_change_dataset) { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index b75adab6d971..9005615932d2 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -339,6 +339,8 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); } + data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); + objective_function_->SetDataIndices(data_sample_strategy_->bag_data_indices().data()); Boosting(); gradients = gradients_pointer_; hessians = hessians_pointer_; @@ -361,7 +363,6 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } // bagging logic - data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); const bool is_use_subset = data_sample_strategy_->is_use_subset(); const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 0fbbea24b407..a3621bdc1615 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1140,6 +1140,7 @@ void Dataset::CopySubrow(const Dataset* fullset, data_size_t num_used_indices, bool need_meta_data) { CHECK_EQ(num_used_indices, num_data_); + Log::Warning("copy subrow here !!!!"); std::vector group_ids, subfeature_ids; group_ids.reserve(num_features_); subfeature_ids.reserve(num_features_); @@ -1155,20 +1156,24 @@ void Dataset::CopySubrow(const Dataset* fullset, subfeature_ids.emplace_back(-1); } } + Log::Warning("copy subrow step 0 !!!!"); int num_copy_tasks = static_cast(group_ids.size()); - - OMP_INIT_EX(); - #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(dynamic) + // OMP_INIT_EX(); + // #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(dynamic) for (int task_id = 0; task_id < num_copy_tasks; ++task_id) { - OMP_LOOP_EX_BEGIN(); + // OMP_LOOP_EX_BEGIN(); + Log::Warning("before copy sub row by col 0"); int group = group_ids[task_id]; int subfeature = subfeature_ids[task_id]; + Log::Warning("before copy sub row by col 1"); feature_groups_[group]->CopySubrowByCol(fullset->feature_groups_[group].get(), used_indices, num_used_indices, subfeature); - OMP_LOOP_EX_END(); + Log::Warning("after copy sub row by col"); + // OMP_LOOP_EX_END(); } - OMP_THROW_EX(); + // OMP_THROW_EX(); + Log::Warning("copy subrow step 1 !!!!"); if (need_meta_data) { metadata_.Init(fullset->metadata_, used_indices, num_used_indices); } @@ -1188,6 +1193,8 @@ void Dataset::CopySubrow(const Dataset* fullset, device_type_ = fullset->device_type_; gpu_device_id_ = fullset->gpu_device_id_; + Log::Warning("copy subrow step 2 !!!!"); + #ifdef USE_CUDA if (device_type_ == std::string("cuda")) { if (cuda_column_data_ == nullptr) { diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index c84d618ab0f1..dafdd92a419a 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -566,8 +566,11 @@ class DenseBin : public Bin { void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { + Log::Warning("is dense"); auto other_bin = dynamic_cast*>(full_bin); + Log::Warning("other bin created"); if (IS_4BIT) { + Log::Warning("is 4 bit"); const data_size_t rest = num_used_indices & 1; for (int i = 0; i < num_used_indices - rest; i += 2) { data_size_t idx = used_indices[i]; @@ -586,6 +589,7 @@ class DenseBin : public Bin { } } else { for (int i = 0; i < num_used_indices; ++i) { + CHECK_LT(used_indices[i], data_.size()); data_[i] = other_bin->data_[used_indices[i]]; } } diff --git a/src/io/multi_val_pairwise_lambdarank_bin.hpp b/src/io/multi_val_pairwise_lambdarank_bin.hpp index 599593b56dbb..4102bd5ad6da 100644 --- a/src/io/multi_val_pairwise_lambdarank_bin.hpp +++ b/src/io/multi_val_pairwise_lambdarank_bin.hpp @@ -71,9 +71,9 @@ class MultiValDensePairwiseLambdarankBin: public MultiValPairwiseLambdarankBinnum_feature_); // } - if (j == 0) { - Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", j, bin, gradient, hessian); - } + // if (j == 0) { + // Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", j, bin, gradient, hessian); + // } const auto ti = (bin + this->offsets_[j]) << 1; grad[ti] += gradient; diff --git a/src/io/pairwise_lambdarank_bin.cpp b/src/io/pairwise_lambdarank_bin.cpp index 87d61b8e7e4c..6f5cfd8cbad9 100644 --- a/src/io/pairwise_lambdarank_bin.cpp +++ b/src/io/pairwise_lambdarank_bin.cpp @@ -52,6 +52,7 @@ void PairwiseRankingBin::Push(int tid, data_size_t idx, template class ITERATOR_TYPE> void PairwiseRankingBin::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) { + Log::Warning("copy subrow in pairwie ranking bin"); unpaired_bin_->CopySubrow(full_bin, used_indices, num_used_indices); } @@ -98,9 +99,9 @@ void DensePairwiseRankingBin::ConstructHistogramI for (; i < pf_end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; const auto ti = GetBinAt(paired_idx) << 1; - if (this->group_index_ == 0) { - Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", this->group_index_, ti / 2, ordered_gradients[i], ordered_hessians[i]); - } + // if (this->group_index_ == 0) { + // Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", this->group_index_, ti / 2, ordered_gradients[i], ordered_hessians[i]); + // } if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; @@ -113,9 +114,9 @@ void DensePairwiseRankingBin::ConstructHistogramI for (; i < end; ++i) { const auto paired_idx = USE_INDICES ? data_indices[i] : i; const auto ti = GetBinAt(paired_idx) << 1; - if (this->group_index_ == 0) { - Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", this->group_index_, ti / 2, ordered_gradients[i], ordered_hessians[i]); - } + // if (this->group_index_ == 0) { + // Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", this->group_index_, ti / 2, ordered_gradients[i], ordered_hessians[i]); + // } if (USE_HESSIAN) { grad[ti] += ordered_gradients[i]; hess[ti] += ordered_hessians[i]; diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index da89fd013e10..842994f2e8b3 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -745,9 +745,11 @@ class SparseBin : public Bin { void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { + Log::Warning("is sparse"); auto other_bin = dynamic_cast*>(full_bin); deltas_.clear(); vals_.clear(); + Log::Warning("is sparse"); data_size_t start = 0; if (num_used_indices > 0) { start = used_indices[0]; diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index ec7581e504c4..26deaaa70981 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -374,6 +374,7 @@ void TrainingShareStates::CalcBinOffsets(const std::vectorclear(); feature_hist_offsets_.clear(); if (in_is_col_wise) { + // Log::Fatal("not supported 0"); uint32_t cur_num_bin = 0; uint32_t hist_cur_num_bin = 0; for (int group = 0; group < static_cast(feature_groups.size()); ++group) { @@ -438,9 +439,10 @@ void TrainingShareStates::CalcBinOffsets(const std::vector= - MultiValBin::multi_val_bin_sparse_threshold ? 1 : 0; + const bool is_sparse_row_wise = false; //(1.0f - sum_dense_ratio) >= + // MultiValBin::multi_val_bin_sparse_threshold ? 1 : 0; if (is_sparse_row_wise) { + // Log::Fatal("not supported 1"); int cur_num_bin = 1; uint32_t hist_cur_num_bin = 1; for (int group = 0; group < static_cast(feature_groups.size()); ++group) { @@ -474,6 +476,7 @@ void TrainingShareStates::CalcBinOffsets(const std::vector(feature_groups.size()); ++group) { const std::unique_ptr& feature_group = feature_groups[group]; if (feature_group->is_multi_val_) { + Log::Fatal("not supported 2"); for (int i = 0; i < feature_group->num_feature_; ++i) { const std::unique_ptr& bin_mapper = feature_group->bin_mappers_[i]; if (group == 0 && i == 0 && bin_mapper->GetMostFreqBin() > 0) { diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 8a3e799cf3cb..1f19429ddddb 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -757,7 +757,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf, bool update_cnt) { Common::FunctionTimer fun_timer("SerialTreeLearner::SplitInner", global_timer); - histogram_pool_.DumpContent(); + // histogram_pool_.DumpContent(); SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; const int inner_feature_index = @@ -900,7 +900,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, } // #ifdef DEBUG - CheckSplit(best_split_info, *left_leaf, *right_leaf); + // CheckSplit(best_split_info, *left_leaf, *right_leaf); // #endif auto leaves_need_update = constraints_->Update( From 0618bb2595e82dd6176fb4f6d6ed063e271f3701 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 27 Aug 2024 03:10:54 +0000 Subject: [PATCH 51/68] add bagging by query for lambdarank --- docs/Parameters.rst | 4 + include/LightGBM/config.h | 3 + include/LightGBM/objective_function.h | 11 +++ include/LightGBM/sample_strategy.h | 4 + src/boosting/bagging.hpp | 94 +++++++++++++++++++++++- src/boosting/gbdt.cpp | 14 +++- src/io/config.cpp | 5 ++ src/io/config_auto.cpp | 6 ++ src/objective/rank_objective.hpp | 18 +++-- tests/python_package_test/test_engine.py | 24 ++++++ 10 files changed, 170 insertions(+), 13 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index e7b6fb1cded3..b7edae0110ab 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -366,6 +366,10 @@ Learning Control Parameters - random seed for bagging +- ``bagging_by_query`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to do bagging sample by query + - ``feature_fraction`` :raw-html:`🔗︎`, default = ``1.0``, type = double, aliases: ``sub_feature``, ``colsample_bytree``, constraints: ``0.0 < feature_fraction <= 1.0`` - LightGBM will randomly select a subset of features on each iteration (tree) if ``feature_fraction`` is smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index a682c47d30e8..dc62f2f90cba 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -358,6 +358,9 @@ struct Config { // desc = random seed for bagging int bagging_seed = 3; + // desc = whether to do bagging sample by query + bool bagging_by_query = false; + // alias = sub_feature, colsample_bytree // check = >0.0 // check = <=1.0 diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index ad188dc39676..57fab68befab 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -37,6 +37,17 @@ class ObjectiveFunction { virtual void GetGradients(const double* score, score_t* gradients, score_t* hessians) const = 0; + /*! + * \brief calculating first order derivative of loss function, used only for baggin by query in lambdarank + * \param score prediction score in this round + * \param num_sampled_queries number of in-bag queries + * \param sampled_query_indices indices of in-bag queries + * \gradients Output gradients + * \hessians Output hessians + */ + virtual void GetGradients(const double* score, const data_size_t /*num_sampled_queries*/, const data_size_t* /*sampled_query_indices*/, + score_t* gradients, score_t* hessians) const { GetGradients(score, gradients, hessians); } + virtual const char* GetName() const = 0; virtual bool IsConstantHessian() const { return false; } diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index 4ea5cfc5f436..d2c26877c8ee 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -55,6 +55,10 @@ class SampleStrategy { bool NeedResizeGradients() const { return need_resize_gradients_; } + virtual data_size_t num_sampled_queries() const { return 0; } + + virtual const data_size_t* sampled_query_indices() const { return nullptr; } + protected: const Config* config_; const Dataset* train_data_; diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 4c2c81553e7c..e3a8cc5bbcf9 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -17,8 +17,11 @@ class BaggingSampleStrategy : public SampleStrategy { config_ = config; train_data_ = train_data; num_data_ = train_data->num_data(); + num_queries_ = train_data->metadata().num_queries(); + query_boundaries_ = train_data->metadata().query_boundaries(); objective_function_ = objective_function; num_tree_per_iteration_ = num_tree_per_iteration; + num_threads_ = OMP_NUM_THREADS(); } ~BaggingSampleStrategy() {} @@ -27,9 +30,10 @@ class BaggingSampleStrategy : public SampleStrategy { Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || - need_re_bagging_) { + need_re_bagging_) { need_re_bagging_ = false; - auto left_cnt = bagging_runner_.Run( + if (!config_->bagging_by_query) { + auto left_cnt = bagging_runner_.Run( num_data_, [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, data_size_t*) { @@ -43,7 +47,60 @@ class BaggingSampleStrategy : public SampleStrategy { return cur_left_count; }, bag_data_indices_.data()); - bag_data_cnt_ = left_cnt; + bag_data_cnt_ = left_cnt; + } else { + num_sampled_queries_ = bagging_runner_.Run( + num_queries_, + [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, + data_size_t*) { + data_size_t cur_left_count = 0; + cur_left_count = BaggingHelper(cur_start, cur_cnt, left); + return cur_left_count; + }, bag_query_indices_.data()); + + sampled_query_boundaries_[0] = 0; + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (data_size_t i = 0; i < num_sampled_queries_; ++i) { + OMP_LOOP_EX_BEGIN(); + sampled_query_boundaries_[i + 1] = query_boundaries_[bag_query_indices_[i] + 1] - query_boundaries_[bag_query_indices_[i]]; + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + + const int num_blocks = Threading::For(0, num_sampled_queries_ + 1, 128, [this](int thread_index, data_size_t start_index, data_size_t end_index) { + for (data_size_t i = start_index + 1; i < end_index; ++i) { + sampled_query_boundaries_[i] += sampled_query_boundaries_[i - 1]; + } + sampled_query_boundaires_thread_buffer_[thread_index] = sampled_query_boundaries_[end_index - 1]; + }); + + for (int thread_index = 1; thread_index < num_blocks; ++thread_index) { + sampled_query_boundaires_thread_buffer_[thread_index] += sampled_query_boundaires_thread_buffer_[thread_index - 1]; + } + + Threading::For(0, num_sampled_queries_ + 1, 128, [this](int thread_index, data_size_t start_index, data_size_t end_index) { + if (thread_index > 0) { + for (data_size_t i = start_index; i < end_index; ++i) { + sampled_query_boundaries_[i] += sampled_query_boundaires_thread_buffer_[thread_index - 1]; + } + } + }); + + bag_data_cnt_ = sampled_query_boundaries_[num_sampled_queries_]; + + Threading::For(0, num_sampled_queries_, 1, [this](int /*thread_index*/, data_size_t start_index, data_size_t end_index) { + for (data_size_t sampled_query_id = start_index; sampled_query_id < end_index; ++sampled_query_id) { + const data_size_t query_index = bag_query_indices_[sampled_query_id]; + const data_size_t data_index_start = query_boundaries_[query_index]; + const data_size_t data_index_end = query_boundaries_[query_index + 1]; + const data_size_t sampled_query_start = sampled_query_boundaries_[sampled_query_id]; + for (data_size_t i = data_index_start; i < data_index_end; ++i) { + bag_data_indices_[sampled_query_start + i - data_index_start] = i; + } + } + }); + } Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner if (!is_use_subset_) { @@ -108,7 +165,14 @@ class BaggingSampleStrategy : public SampleStrategy { cuda_bag_data_indices_.Resize(num_data_); } #endif // USE_CUDA - bagging_runner_.ReSize(num_data_); + if (!config_->bagging_by_query) { + bagging_runner_.ReSize(num_data_); + } else { + bagging_runner_.ReSize(num_queries_); + sampled_query_boundaries_.resize(num_queries_ + 1, 0); + sampled_query_boundaires_thread_buffer_.resize(num_threads_, 0); + bag_query_indices_.resize(num_data_); + } bagging_rands_.clear(); for (int i = 0; i < (num_data_ + bagging_rand_block_ - 1) / bagging_rand_block_; ++i) { @@ -153,6 +217,14 @@ class BaggingSampleStrategy : public SampleStrategy { return false; } + data_size_t num_sampled_queries() const override { + return num_sampled_queries_; + } + + const data_size_t* sampled_query_indices() const override { + return bag_query_indices_.data(); + } + private: data_size_t BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { if (cnt <= 0) { @@ -202,6 +274,20 @@ class BaggingSampleStrategy : public SampleStrategy { /*! \brief whether need restart bagging in continued training */ bool need_re_bagging_; + /*! \brief number of threads */ + int num_threads_; + /*! \brief query boundaries of the in-bag queries */ + std::vector sampled_query_boundaries_; + /*! \brief buffer for calculating sampled_query_boundaries_ */ + std::vector sampled_query_boundaires_thread_buffer_; + /*! \brief in-bag query indices */ + std::vector> bag_query_indices_; + /*! \brief number of queries in the training dataset */ + data_size_t num_queries_; + /*! \brief number of in-bag queries */ + data_size_t num_sampled_queries_; + /*! \brief query boundaries of the whole training dataset */ + const data_size_t* query_boundaries_; }; } // namespace LightGBM diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 937b44fcc8aa..a58f34e98c5c 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -224,8 +224,14 @@ void GBDT::Boosting() { } // objective function will calculate gradients and hessians int64_t num_score = 0; - objective_function_-> - GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_); + if (config_->bagging_by_query) { + data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); + objective_function_-> + GetGradients(GetTrainingScore(&num_score), data_sample_strategy_->num_sampled_queries(), data_sample_strategy_->sampled_query_indices(), gradients_pointer_, hessians_pointer_); + } else { + objective_function_-> + GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_); + } } void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { @@ -366,7 +372,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } // bagging logic - data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); + if (!config_->bagging_by_query) { + data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); + } const bool is_use_subset = data_sample_strategy_->is_use_subset(); const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); diff --git a/src/io/config.cpp b/src/io/config.cpp index c63de70fc16b..20d327ca2edb 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -466,6 +466,11 @@ void Config::CheckParamConflict(const std::unordered_map& Config::parameter_set() { "neg_bagging_fraction", "bagging_freq", "bagging_seed", + "bagging_by_query", "feature_fraction", "feature_fraction_bynode", "feature_fraction_seed", @@ -377,6 +378,8 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet {"neg_bagging_fraction", {"neg_sub_row", "neg_subsample", "neg_bagging"}}, {"bagging_freq", {"subsample_freq"}}, {"bagging_seed", {"bagging_fraction_seed"}}, + {"bagging_by_query", {}}, {"feature_fraction", {"sub_feature", "colsample_bytree"}}, {"feature_fraction_bynode", {"sub_feature_bynode", "colsample_bynode"}}, {"feature_fraction_seed", {}}, @@ -957,6 +962,7 @@ const std::unordered_map& Config::ParameterTypes() { {"neg_bagging_fraction", "double"}, {"bagging_freq", "int"}, {"bagging_seed", "int"}, + {"bagging_by_query", "bool"}, {"feature_fraction", "double"}, {"feature_fraction_bynode", "double"}, {"feature_fraction_seed", "int"}, diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index ae3b74651759..a0710804baae 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -56,19 +56,21 @@ class RankingObjective : public ObjectiveFunction { pos_biases_.resize(num_position_ids_, 0.0); } - void GetGradients(const double* score, score_t* gradients, - score_t* hessians) const override { + void GetGradients(const double* score, const data_size_t num_sampled_queries, const data_size_t* sampled_query_indices, + score_t* gradients, score_t* hessians) const override { + const data_size_t num_queries = (sampled_query_indices == nullptr ? num_queries_ : num_sampled_queries); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) - for (data_size_t i = 0; i < num_queries_; ++i) { - const data_size_t start = query_boundaries_[i]; - const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i]; + for (data_size_t i = 0; i < num_queries; ++i) { + const data_size_t query_index = (sampled_query_indices == nullptr ? i : sampled_query_indices[i]); + const data_size_t start = query_boundaries_[query_index]; + const data_size_t cnt = query_boundaries_[query_index + 1] - query_boundaries_[query_index]; std::vector score_adjusted; if (num_position_ids_ > 0) { for (data_size_t j = 0; j < cnt; ++j) { score_adjusted.push_back(score[start + j] + pos_biases_[positions_[start + j]]); } } - GetGradientsForOneQuery(i, cnt, label_ + start, num_position_ids_ > 0 ? score_adjusted.data() : score + start, + GetGradientsForOneQuery(query_index, cnt, label_ + start, num_position_ids_ > 0 ? score_adjusted.data() : score + start, gradients + start, hessians + start); if (weights_ != nullptr) { for (data_size_t j = 0; j < cnt; ++j) { @@ -84,6 +86,10 @@ class RankingObjective : public ObjectiveFunction { } } + void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { + GetGradients(score, num_queries_, nullptr, gradients, hessians); + } + virtual void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt, const label_t* label, const double* score, score_t* lambdas, diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 8e5fddd01d48..30846c133b96 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -4399,3 +4399,27 @@ def test_quantized_training(): quant_bst = lgb.train(bst_params, ds, num_boost_round=10) quant_rmse = np.sqrt(np.mean((quant_bst.predict(X) - y) ** 2)) assert quant_rmse < rmse + 6.0 + + +def test_bagging_by_query_in_lambdarank(): + rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank" + X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train")) + q_train = np.loadtxt(str(rank_example_dir / "rank.train.query")) + X_test, y_test = load_svmlight_file(str(rank_example_dir / "rank.test")) + q_test = np.loadtxt(str(rank_example_dir / "rank.test.query")) + params = {"objective": "lambdarank", "verbose": -1, "metric": "ndcg", "ndcg_eval_at": "5"} + lgb_train = lgb.Dataset(X_train, y_train, group=q_train, params=params) + lgb_test = lgb.Dataset(X_test, y_test, group=q_test, params=params) + gbm = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=[lgb_test]) + ndcg_score = gbm.best_score['valid_0']['ndcg@5'] + + params.update({"bagging_by_query": True, "bagging_fraction": 0.1, "bagging_freq": 1}) + gbm_bagging_by_query = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=[lgb_test]) + ndcg_score_bagging_by_query = gbm_bagging_by_query.best_score['valid_0']['ndcg@5'] + + params.update({"bagging_by_query": False, "bagging_fraction": 0.1, "bagging_freq": 1}) + gbm_no_bagging_by_query = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=[lgb_test]) + ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score['valid_0']['ndcg@5'] + + assert ndcg_score_bagging_by_query >= ndcg_score + assert ndcg_score_no_bagging_by_query <= ndcg_score From 38fa4c2fb5d73ff1f0fe4136b1a3b27e424c51a1 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 27 Aug 2024 03:30:07 +0000 Subject: [PATCH 52/68] fix pre-commit --- tests/python_package_test/test_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 30846c133b96..5d12114391ea 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -4411,15 +4411,15 @@ def test_bagging_by_query_in_lambdarank(): lgb_train = lgb.Dataset(X_train, y_train, group=q_train, params=params) lgb_test = lgb.Dataset(X_test, y_test, group=q_test, params=params) gbm = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=[lgb_test]) - ndcg_score = gbm.best_score['valid_0']['ndcg@5'] + ndcg_score = gbm.best_score["valid_0"]["ndcg@5"] params.update({"bagging_by_query": True, "bagging_fraction": 0.1, "bagging_freq": 1}) gbm_bagging_by_query = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=[lgb_test]) - ndcg_score_bagging_by_query = gbm_bagging_by_query.best_score['valid_0']['ndcg@5'] + ndcg_score_bagging_by_query = gbm_bagging_by_query.best_score["valid_0"]["ndcg@5"] params.update({"bagging_by_query": False, "bagging_fraction": 0.1, "bagging_freq": 1}) gbm_no_bagging_by_query = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=[lgb_test]) - ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score['valid_0']['ndcg@5'] + ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"] assert ndcg_score_bagging_by_query >= ndcg_score assert ndcg_score_no_bagging_by_query <= ndcg_score From 9e2a322e4ad69d22d2b879aa1dfafb79f9c50b28 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 29 Aug 2024 11:53:17 +0000 Subject: [PATCH 53/68] fix bagging by query with cuda --- include/LightGBM/cuda/cuda_objective_function.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp index 465ed334156c..a7877361cd09 100644 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -49,6 +49,11 @@ class CUDAObjectiveInterface: public HOST_OBJECTIVE { SynchronizeCUDADevice(__FILE__, __LINE__); } + void GetGradients(const double* scores, const data_size_t /*num_sampled_queries*/, const data_size_t* /*sampled_query_indices*/, score_t* gradients, score_t* hessians) const override { + LaunchGetGradientsKernel(scores, gradients, hessians); + SynchronizeCUDADevice(__FILE__, __LINE__); + } + void RenewTreeOutputCUDA(const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override { global_timer.Start("CUDAObjectiveInterface::LaunchRenewTreeOutputCUDAKernel"); From 666c51ef65f84e9d7387a7a604ac280ad7e49bb9 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 30 Aug 2024 05:32:33 +0000 Subject: [PATCH 54/68] fix bagging by query test case --- tests/python_package_test/test_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 5d12114391ea..956cb77520be 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -4420,6 +4420,6 @@ def test_bagging_by_query_in_lambdarank(): params.update({"bagging_by_query": False, "bagging_fraction": 0.1, "bagging_freq": 1}) gbm_no_bagging_by_query = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=[lgb_test]) ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"] - - assert ndcg_score_bagging_by_query >= ndcg_score - assert ndcg_score_no_bagging_by_query <= ndcg_score + print(ndcg_score_bagging_by_query, ndcg_score, ndcg_score_no_bagging_by_query) + assert ndcg_score_bagging_by_query >= ndcg_score - 0.03 + assert ndcg_score_no_bagging_by_query <= ndcg_score_bagging_by_query From 9e2c33882462ec8ad1089a5df5422eaf71a22e00 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 30 Aug 2024 05:52:59 +0000 Subject: [PATCH 55/68] fix bagging by query test case --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 956cb77520be..4f216fa970c5 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -4422,4 +4422,4 @@ def test_bagging_by_query_in_lambdarank(): ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"] print(ndcg_score_bagging_by_query, ndcg_score, ndcg_score_no_bagging_by_query) assert ndcg_score_bagging_by_query >= ndcg_score - 0.03 - assert ndcg_score_no_bagging_by_query <= ndcg_score_bagging_by_query + assert ndcg_score_no_bagging_by_query >= ndcg_score - 0.1 From 3abbc11b842806c56cb4ac37c804a6ca10b96fcd Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 30 Aug 2024 06:08:56 +0000 Subject: [PATCH 56/68] fix bagging by query test case --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4f216fa970c5..308d3fc7b69e 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -4421,5 +4421,5 @@ def test_bagging_by_query_in_lambdarank(): gbm_no_bagging_by_query = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=[lgb_test]) ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"] print(ndcg_score_bagging_by_query, ndcg_score, ndcg_score_no_bagging_by_query) - assert ndcg_score_bagging_by_query >= ndcg_score - 0.03 + assert ndcg_score_bagging_by_query >= ndcg_score - 0.1 assert ndcg_score_no_bagging_by_query >= ndcg_score - 0.1 From 13fa0a3e173b1fd320e998818ab9be4493701cff Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 30 Aug 2024 06:24:01 +0000 Subject: [PATCH 57/68] add #include --- src/boosting/bagging.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index e3a8cc5bbcf9..7a66b5696425 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -7,6 +7,7 @@ #define LIGHTGBM_BOOSTING_BAGGING_HPP_ #include +#include namespace LightGBM { From 0258f07044c764c7673db79e123aedb7c3b1105f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 4 Sep 2024 02:50:38 +0000 Subject: [PATCH 58/68] update CMakeLists.txt --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e1ae37eb8f0..78d9bc17fb55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -438,6 +438,8 @@ set( src/io/parser.cpp src/io/train_share_states.cpp src/io/tree.cpp + src/io/pairwise_lambdarank_bin.cpp + src/io/pairwise_ranking_feature_group.cpp src/metric/dcg_calculator.cpp src/metric/metric.cpp src/network/linker_topo.cpp From 90a95fa57a6472ab4bee29d014b924e0cfb4c543 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 20 Sep 2024 05:03:08 +0000 Subject: [PATCH 59/68] fix bagging by query with pairwise lambdarank --- src/boosting/bagging.hpp | 14 +++++++++----- src/io/dataset.cpp | 15 ++++++++------- src/objective/rank_objective.hpp | 29 ++++++++++++++++------------- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 89544a4a1c5b..1a9f72ab7370 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -19,7 +19,11 @@ class BaggingSampleStrategy : public SampleStrategy { train_data_ = train_data; num_data_ = train_data->num_data(); num_queries_ = train_data->metadata().num_queries(); - query_boundaries_ = train_data->metadata().query_boundaries(); + if (config->objective == std::string("pairwise_lambdarank")) { + query_boundaries_ = train_data->metadata().pairwise_query_boundaries(); + } else { + query_boundaries_ = train_data->metadata().query_boundaries(); + } objective_function_ = objective_function; num_tree_per_iteration_ = num_tree_per_iteration; num_threads_ = OMP_NUM_THREADS(); @@ -62,14 +66,14 @@ class BaggingSampleStrategy : public SampleStrategy { sampled_query_boundaries_[0] = 0; OMP_INIT_EX(); #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (data_size_t i = 0; i < num_sampled_queries_; ++i) { + for (data_size_t i = 0; i < num_queries_; ++i) { OMP_LOOP_EX_BEGIN(); sampled_query_boundaries_[i + 1] = query_boundaries_[bag_query_indices_[i] + 1] - query_boundaries_[bag_query_indices_[i]]; OMP_LOOP_EX_END(); } OMP_THROW_EX(); - const int num_blocks = Threading::For(0, num_sampled_queries_ + 1, 128, [this](int thread_index, data_size_t start_index, data_size_t end_index) { + const int num_blocks = Threading::For(0, num_queries_ + 1, 128, [this](int thread_index, data_size_t start_index, data_size_t end_index) { for (data_size_t i = start_index + 1; i < end_index; ++i) { sampled_query_boundaries_[i] += sampled_query_boundaries_[i - 1]; } @@ -80,7 +84,7 @@ class BaggingSampleStrategy : public SampleStrategy { sampled_query_boundaires_thread_buffer_[thread_index] += sampled_query_boundaires_thread_buffer_[thread_index - 1]; } - Threading::For(0, num_sampled_queries_ + 1, 128, [this](int thread_index, data_size_t start_index, data_size_t end_index) { + Threading::For(0, num_queries_ + 1, 128, [this](int thread_index, data_size_t start_index, data_size_t end_index) { if (thread_index > 0) { for (data_size_t i = start_index; i < end_index; ++i) { sampled_query_boundaries_[i] += sampled_query_boundaires_thread_buffer_[thread_index - 1]; @@ -90,7 +94,7 @@ class BaggingSampleStrategy : public SampleStrategy { bag_data_cnt_ = sampled_query_boundaries_[num_sampled_queries_]; - Threading::For(0, num_sampled_queries_, 1, [this](int /*thread_index*/, data_size_t start_index, data_size_t end_index) { + Threading::For(0, num_queries_, 1, [this](int /*thread_index*/, data_size_t start_index, data_size_t end_index) { for (data_size_t sampled_query_id = start_index; sampled_query_id < end_index; ++sampled_query_id) { const data_size_t query_index = bag_query_indices_[sampled_query_id]; const data_size_t data_index_start = query_boundaries_[query_index]; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index a3621bdc1615..37c0a213bd6f 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -917,14 +917,15 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va feature2subfeature_.clear(); has_raw_ = dataset->has_raw(); numeric_feature_map_ = dataset->numeric_feature_map_; - for (const int feature_index : dataset->numeric_feature_map_) { - if (feature_index != -1) { - numeric_feature_map_.push_back(feature_index + dataset->num_features_); + num_numeric_features_ = dataset->num_numeric_features_; + for (const int nuermic_feature_index : dataset->numeric_feature_map_) { + if (nuermic_feature_index != -1) { + numeric_feature_map_.push_back(num_numeric_features_); + ++num_numeric_features_; } else { numeric_feature_map_.push_back(-1); } } - num_numeric_features_ = dataset->num_numeric_features_ * 2; // copy feature bin mapper data feature_need_push_zeros_.clear(); group_bin_boundaries_.clear(); @@ -2102,9 +2103,9 @@ void Dataset::CreatePairwiseRankingDifferentialFeatures( const int feature_index = diff_original_feature_index->at(i); const data_size_t num_samples_for_feature = static_cast(sample_values[feature_index].size()); if (config.zero_as_missing) { + int cur_query = 0; for (int j = 0; j < num_samples_for_feature; ++j) { const double value = sample_values[feature_index][j]; - int cur_query = 0; data_size_t cur_data_index = sample_indices[feature_index][j]; while (query_boundaries[cur_query + 1] <= cur_data_index) { ++cur_query; @@ -2117,8 +2118,8 @@ void Dataset::CreatePairwiseRankingDifferentialFeatures( } else { CHECK_GT(sample_indices[feature_index].size(), 0); int cur_pos_j = 0; + int cur_query = 0; for (int j = 0; j < sample_indices[feature_index].back() + 1; ++j) { - int cur_query = 0; while (query_boundaries[cur_query + 1] <= j) { ++cur_query; } @@ -2144,7 +2145,7 @@ void Dataset::CreatePairwiseRankingDifferentialFeatures( differential_feature_bin_mappers->operator[](i)->FindBin( sampled_differential_values[i].data(), static_cast(sampled_differential_values[i].size()), - static_cast(num_total_sample_data * (num_total_sample_data) / 2), + static_cast(num_total_sample_data * (num_total_sample_data + 1) / 2), config.max_bin, config.min_data_in_bin, filter_cnt, config.feature_pre_filter, BinType::NumericalBin, config.use_missing, config.zero_as_missing, forced_upper_bounds ); diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index f37fc7db0f90..86df38566f5c 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -606,15 +606,17 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { } } - void GetGradients(const double* score_pairwise, score_t* gradients_pairwise, - score_t* hessians_pairwise) const override { + void GetGradients(const double* score_pairwise, const data_size_t num_sampled_queries, const data_size_t* sampled_query_indices, + score_t* gradients_pairwise, score_t* hessians_pairwise) const override { + const data_size_t num_queries = (sampled_query_indices == nullptr ? num_queries_ : num_sampled_queries); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) - for (data_size_t i = 0; i < num_queries_; ++i) { + for (data_size_t i = 0; i < num_queries; ++i) { global_timer.Start("pairwise_lambdarank::GetGradients part 0"); - const data_size_t start_pointwise = query_boundaries_[i]; - const data_size_t cnt_pointwise = query_boundaries_[i + 1] - query_boundaries_[i]; - const data_size_t start_pairwise = query_boundaries_pairwise_[i]; - const data_size_t cnt_pairwise = query_boundaries_pairwise_[i + 1] - query_boundaries_pairwise_[i]; + const data_size_t query_index = (sampled_query_indices == nullptr ? i : sampled_query_indices[i]); + const data_size_t start_pointwise = query_boundaries_[query_index]; + const data_size_t cnt_pointwise = query_boundaries_[query_index + 1] - query_boundaries_[query_index]; + const data_size_t start_pairwise = query_boundaries_pairwise_[query_index]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[query_index + 1] - query_boundaries_pairwise_[query_index]; std::vector score_adjusted_pairwise; if (num_position_ids_ > 0) { for (data_size_t j = 0; j < cnt_pairwise; ++j) { @@ -624,15 +626,15 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { } global_timer.Stop("pairwise_lambdarank::GetGradients part 0"); global_timer.Start("pairwise_lambdarank::GetGradients part 1"); - GetGradientsForOneQuery(i, cnt_pointwise, cnt_pairwise, label_ + start_pointwise, scores_pointwise_.data() + start_pointwise, num_position_ids_ > 0 ? score_adjusted_pairwise.data() : score_pairwise + start_pairwise, - right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], + GetGradientsForOneQuery(query_index, cnt_pointwise, cnt_pairwise, label_ + start_pointwise, scores_pointwise_.data() + start_pointwise, num_position_ids_ > 0 ? score_adjusted_pairwise.data() : score_pairwise + start_pairwise, + right2left_map_byquery_[query_index], left2right_map_byquery_[query_index], left_right2pair_map_byquery_[query_index], gradients_pairwise + start_pairwise, hessians_pairwise + start_pairwise); std::vector all_pairs(cnt_pairwise); std::iota(all_pairs.begin(), all_pairs.end(), 0); global_timer.Stop("pairwise_lambdarank::GetGradients part 1"); global_timer.Start("pairwise_lambdarank::GetGradients part 2"); UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score_pairwise + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), - paired_index_map_ + start_pairwise, right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], truncation_level_, sigmoid_, sigmoid_cache_); + paired_index_map_ + start_pairwise, right2left_map_byquery_[query_index], left2right_map_byquery_[query_index], left_right2pair_map_byquery_[query_index], truncation_level_, sigmoid_, sigmoid_cache_); global_timer.Stop("pairwise_lambdarank::GetGradients part 2"); } if (num_position_ids_ > 0) { @@ -640,9 +642,10 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { std::vector hessians_pointwise(num_data_); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) for (data_size_t i = 0; i < num_queries_; ++i) { - const data_size_t cnt_pointwise = query_boundaries_[i + 1] - query_boundaries_[i]; - const data_size_t cnt_pairwise = query_boundaries_pairwise_[i + 1] - query_boundaries_pairwise_[i]; - TransformGradientsPairwiseIntoPointwiseForOneQuery(i, cnt_pointwise, cnt_pairwise, gradients_pairwise, hessians_pairwise, gradients_pointwise.data(), hessians_pointwise.data()); + const data_size_t query_index = (sampled_query_indices == nullptr ? i : sampled_query_indices[i]); + const data_size_t cnt_pointwise = query_boundaries_[query_index + 1] - query_boundaries_[query_index]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[query_index + 1] - query_boundaries_pairwise_[query_index]; + TransformGradientsPairwiseIntoPointwiseForOneQuery(query_index, cnt_pointwise, cnt_pairwise, gradients_pairwise, hessians_pairwise, gradients_pointwise.data(), hessians_pointwise.data()); } UpdatePositionBiasFactors(gradients_pointwise.data(), hessians_pointwise.data()); } From b69913d674888a9393685de3b5b23327ded86edc Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 10 Oct 2024 05:00:45 +0000 Subject: [PATCH 60/68] fix compilation error C3200 with visual studio --- src/io/multi_val_pairwise_lambdarank_bin.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/multi_val_pairwise_lambdarank_bin.hpp b/src/io/multi_val_pairwise_lambdarank_bin.hpp index 4102bd5ad6da..bb75deea9407 100644 --- a/src/io/multi_val_pairwise_lambdarank_bin.hpp +++ b/src/io/multi_val_pairwise_lambdarank_bin.hpp @@ -25,7 +25,7 @@ template class MultiValDensePairwiseLambdarankBin: public MultiValPairwiseLambdarankBin { public: MultiValDensePairwiseLambdarankBin(data_size_t num_data, int num_bin, int num_feature, - const std::vector& offsets, const std::pair* paired_ranking_item_global_index_map): MultiValPairwiseLambdarankBin(num_data, num_bin, num_feature, offsets) { + const std::vector& offsets, const std::pair* paired_ranking_item_global_index_map): MultiValPairwiseLambdarankBin(num_data, num_bin, num_feature, offsets) { this->paired_ranking_item_global_index_map_ = paired_ranking_item_global_index_map; } From 6dba1cfa47e5a850b5842c1ca4f6dc2a7269d819 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 11 Oct 2024 10:50:23 +0000 Subject: [PATCH 61/68] clean up main.cpp --- src/main.cpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 10ccb3d82b6a..b7f4ff0bdf9c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,18 +10,10 @@ #include "network/linkers.h" #endif -int main(int /*argc*/, char** /*argv*/) { +int main(int argc, char** argv) { bool success = false; try { - - std::string config_str = std::string("config=train_pairwise_lambdarank.conf"); - char* argv = new char[config_str.size() + 1]; - for (size_t i = 0; i < config_str.size(); ++i) { - argv[i] = config_str[i]; - } - argv[config_str.size()] = '\0'; - - LightGBM::Application app(2, &argv - 1); + LightGBM::Application app(argc, argv); app.Run(); #ifdef USE_MPI @@ -49,4 +41,4 @@ int main(int /*argc*/, char** /*argv*/) { exit(-1); } -} +} \ No newline at end of file From 3b2e29d328304146afa0f826cd5901399462cd86 Mon Sep 17 00:00:00 2001 From: Pavel Metrikov Date: Fri, 18 Oct 2024 00:37:55 -0700 Subject: [PATCH 62/68] Exposing configuration parameters for pairwise ranking --- include/LightGBM/config.h | 20 +++++++++++ include/LightGBM/objective_function.h | 3 +- src/io/config_auto.cpp | 30 ++++++++++++++++ src/metric/rank_metric.hpp | 16 +++++++-- src/objective/rank_objective.hpp | 49 +++++++++++++++++---------- 5 files changed, 96 insertions(+), 22 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 086acd8ddf48..af35018ad7e3 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -1008,6 +1008,26 @@ struct Config { // desc = used only in ``pairwise_lambdarank`` application bool use_differential_feature_in_pairwise_ranking = false; + // desc = whether to additionaly perform indirect document comparison in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool pairwise_lambdarank_model_indirect_comparison = false; + + // desc = whether to model conditional document relevance (given documents ranked above) in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool pairwise_lambdarank_model_conditional_rel = false; + + // desc = whether to limit the indirect document comparison to only auxilliary documents ranked above in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool pairwise_lambdarank_indirect_comparison_above_only = true; + + // desc = whether to use logarithmic discounts when converting pairwise scores into pointwise in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool pairwise_lambdarank_logarithmic_discounts = true; + + // desc = whether to use hard pairwise preference when converting pairwise scores into pointwise in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool pairwise_lambdarank_hard_pairwise_preference = false; + #ifndef __NVCC__ #pragma endregion diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 9b34488e005d..5e764f34e9f0 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -130,7 +130,8 @@ void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointw data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, const std::multimap& right2left_map, const std::multimap < data_size_t, data_size_t>& left2right_map, const std::map, data_size_t>& left_right2pair_map, - int truncation_level, double sigma, CommonC::SigmoidCache sigmoid_cache); + int truncation_level, double sigma, CommonC::SigmoidCache sigmoid_cache, bool model_indirect_comparison, bool model_conditional_rel, + bool indirect_comparison_above_only, bool logarithmic_discounts, bool hard_pairwise_preference); } // namespace LightGBM diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 8ec187a32918..7700c00b46d3 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -308,6 +308,11 @@ const std::unordered_set& Config::parameter_set() { "label_gain", "lambdarank_position_bias_regularization", "use_differential_feature_in_pairwise_ranking", + "pairwise_lambdarank_model_indirect_comparison", + "pairwise_lambdarank_model_conditional_rel", + "pairwise_lambdarank_indirect_comparison_above_only", + "pairwise_lambdarank_logarithmic_discounts", + "pairwise_lambdarank_hard_pairwise_preference", "metric", "metric_freq", "is_provide_training_metric", @@ -633,6 +638,16 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet {"label_gain", {}}, {"lambdarank_position_bias_regularization", {}}, {"use_differential_feature_in_pairwise_ranking", {}}, + {"pairwise_lambdarank_model_indirect_comparison", {} }, + {"pairwise_lambdarank_model_conditional_rel", {} }, + {"pairwise_lambdarank_indirect_comparison_above_only", {} }, + {"pairwise_lambdarank_logarithmic_discounts", {} }, + {"pairwise_lambdarank_hard_pairwise_preference", {} }, {"metric", {"metrics", "metric_types"}}, {"metric_freq", {"output_freq"}}, {"is_provide_training_metric", {"training_metric", "is_training_metric", "train_metric"}}, @@ -1067,6 +1092,11 @@ const std::unordered_map& Config::ParameterTypes() { {"label_gain", "vector"}, {"lambdarank_position_bias_regularization", "double"}, {"use_differential_feature_in_pairwise_ranking", "bool"}, + {"pairwise_lambdarank_model_indirect_comparison", "bool" }, + {"pairwise_lambdarank_model_conditional_rel", "bool" }, + {"pairwise_lambdarank_indirect_comparison_above_only", "bool" }, + {"pairwise_lambdarank_logarithmic_discounts", "bool" }, + {"pairwise_lambdarank_hard_pairwise_preference", "bool" }, {"metric", "vector"}, {"metric_freq", "int"}, {"is_provide_training_metric", "bool"}, diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index ca2c75058278..f7e51dd33526 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -30,7 +30,12 @@ class NDCGMetric:public Metric { DCGCalculator::Init(label_gain); pairwise_scores_ = config.objective == std::string("pairwise_lambdarank"); sigmoid_ = config.sigmoid; - truncation_level_ = config.lambdarank_truncation_level; + truncation_level_ = config.lambdarank_truncation_level; + model_indirect_comparison_ = config.pairwise_lambdarank_model_indirect_comparison; + model_conditional_rel_ = config.pairwise_lambdarank_model_conditional_rel; + indirect_comparison_above_only_ = config.pairwise_lambdarank_indirect_comparison_above_only; + logarithmic_discounts_ = config.pairwise_lambdarank_logarithmic_discounts; + hard_pairwise_preference_ = config.pairwise_lambdarank_hard_pairwise_preference; } ~NDCGMetric() { @@ -145,7 +150,7 @@ class NDCGMetric:public Metric { std::iota(all_pairs.begin(), all_pairs.end(), 0); UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), paired_index_map_ + start_pairwise, right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], truncation_level_, - sigmoid_, sigmoid_cache_); + sigmoid_, sigmoid_cache_, model_indirect_comparison_, model_conditional_rel_, indirect_comparison_above_only_, logarithmic_discounts_, hard_pairwise_preference_); } // calculate DCG @@ -177,7 +182,7 @@ class NDCGMetric:public Metric { std::iota(all_pairs.begin(), all_pairs.end(), 0); UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), paired_index_map_ + start_pairwise, right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], truncation_level_, - sigmoid_, sigmoid_cache_); + sigmoid_, sigmoid_cache_, model_indirect_comparison_, model_conditional_rel_, indirect_comparison_above_only_, logarithmic_discounts_, hard_pairwise_preference_); } // calculate DCG DCGCalculator::CalDCG(eval_at_, label_ + query_boundaries_[i], @@ -233,6 +238,11 @@ class NDCGMetric:public Metric { /*! \brief Number of data */ data_size_t num_data_pairwise_; const data_size_t* query_boundaries_pairwise_; + bool model_indirect_comparison_; + bool model_conditional_rel_; + bool indirect_comparison_above_only_; + bool logarithmic_discounts_; + bool hard_pairwise_preference_; }; } // namespace LightGBM diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 86df38566f5c..f3b95b911554 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -6,11 +6,11 @@ #ifndef LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ -#define model_indirect_comparisons_ false -#define model_conditional_rel_ true -#define indirect_comparisons_above_only true -#define logarithmic_discounts true -#define hard_pairwise_preference false +//#define model_indirect_comparison_ false +//#define model_conditional_rel_ true +//#define indirect_comparison_above_only_ true +//#define logarithmic_discounts_ true +//#define hard_pairwise_preference_ false #include #include @@ -31,7 +31,8 @@ namespace LightGBM { data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, const std::multimap& right2left_map, const std::multimap < data_size_t, data_size_t>& left2right_map, const std::map, data_size_t>& left_right2pair_map, - int truncation_level, double sigma, CommonC::SigmoidCache sigmoid_cache) { + int truncation_level, double sigma, CommonC::SigmoidCache sigmoid_cache, bool model_indirect_comparison, bool model_conditional_rel, + bool indirect_comparison_above_only, bool logarithmic_discounts, bool hard_pairwise_preference) { // get sorted indices for scores global_timer.Start("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 0"); std::vector sorted_idx(cnt_pointwise); @@ -64,12 +65,12 @@ namespace LightGBM { delta_score -= score_pairwise[current_pair_inverse]; comparisons++; } - if (model_indirect_comparisons_) { + if (model_indirect_comparison) { auto indexHead_range = right2left_map.equal_range(indexLeft); for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { data_size_t indexHead = indexHead_it->second; if (left_right2pair_map.count(std::make_pair(indexHead, indexRight)) > 0 && - (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { + (!(indirect_comparison_above_only || model_conditional_rel) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { data_size_t indexHeadLeft = left_right2pair_map.at(std::make_pair(indexHead, indexLeft)); data_size_t indexHeadRight = left_right2pair_map.at(std::make_pair(indexHead, indexRight)); delta_score += score_pairwise[indexHeadRight] - score_pairwise[indexHeadLeft]; @@ -80,8 +81,8 @@ namespace LightGBM { for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { data_size_t indexTail = indexTail_it->second; if (left_right2pair_map.count(std::make_pair(indexRight, indexTail)) > 0 && - (!indirect_comparisons_above_only || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && - (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { + (!indirect_comparison_above_only || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && + (!model_conditional_rel || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { data_size_t indexLeftTail = left_right2pair_map.at(std::make_pair(indexLeft, indexTail)); data_size_t indexRightTail = left_right2pair_map.at(std::make_pair(indexRight, indexTail)); delta_score += score_pairwise[indexLeftTail] - score_pairwise[indexRightTail]; @@ -564,7 +565,13 @@ class RankXENDCG : public RankingObjective { class PairwiseLambdarankNDCG: public LambdarankNDCG { public: - explicit PairwiseLambdarankNDCG(const Config& config): LambdarankNDCG(config) {} + explicit PairwiseLambdarankNDCG(const Config& config): LambdarankNDCG(config) { + model_indirect_comparison_ = config.pairwise_lambdarank_model_indirect_comparison; + model_conditional_rel_ = config.pairwise_lambdarank_model_conditional_rel; + indirect_comparison_above_only_ = config.pairwise_lambdarank_indirect_comparison_above_only; + logarithmic_discounts_ = config.pairwise_lambdarank_logarithmic_discounts; + hard_pairwise_preference_ = config.pairwise_lambdarank_hard_pairwise_preference; + } explicit PairwiseLambdarankNDCG(const std::vector& strs): LambdarankNDCG(strs) {} @@ -634,7 +641,8 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { global_timer.Stop("pairwise_lambdarank::GetGradients part 1"); global_timer.Start("pairwise_lambdarank::GetGradients part 2"); UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score_pairwise + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), - paired_index_map_ + start_pairwise, right2left_map_byquery_[query_index], left2right_map_byquery_[query_index], left_right2pair_map_byquery_[query_index], truncation_level_, sigmoid_, sigmoid_cache_); + paired_index_map_ + start_pairwise, right2left_map_byquery_[query_index], left2right_map_byquery_[query_index], left_right2pair_map_byquery_[query_index], truncation_level_, sigmoid_, sigmoid_cache_, + model_indirect_comparison_, model_conditional_rel_, indirect_comparison_above_only_, logarithmic_discounts_, hard_pairwise_preference_); global_timer.Stop("pairwise_lambdarank::GetGradients part 2"); } if (num_position_ids_ > 0) { @@ -736,12 +744,12 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { delta_score -= score_pairwise[i_inverse]; comparisons++; } - if (model_indirect_comparisons_) { + if (model_indirect_comparison_) { auto indexHead_range = right2left_map.equal_range(indexLeft); for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { data_size_t indexHead = indexHead_it->second; if (left_right2pair_map.count(std::make_pair(indexHead, indexRight)) > 0 && - (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { + (!(indirect_comparison_above_only_ || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { data_size_t indexHeadLeft = left_right2pair_map.at(std::make_pair(indexHead, indexLeft)); data_size_t indexHeadRight = left_right2pair_map.at(std::make_pair(indexHead, indexRight)); delta_score += score_pairwise[indexHeadRight] - score_pairwise[indexHeadLeft]; @@ -752,7 +760,7 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { data_size_t indexTail = indexTail_it->second; if (left_right2pair_map.count(std::make_pair(indexRight, indexTail)) > 0 && - (!indirect_comparisons_above_only || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && + (!indirect_comparison_above_only_ || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { data_size_t indexLeftTail = left_right2pair_map.at(std::make_pair(indexLeft, indexTail)); data_size_t indexRightTail = left_right2pair_map.at(std::make_pair(indexRight, indexTail)); @@ -791,12 +799,12 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { lambdas_pairwise[i_inverse] -= static_cast(p_lambda / comparisons); hessians_pairwise[i_inverse] += static_cast(p_hessian / comparisons); } - if (model_indirect_comparisons_) { + if (model_indirect_comparison_) { auto indexHead_range = right2left_map.equal_range(indexLeft); for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { data_size_t indexHead = indexHead_it->second; if (left_right2pair_map.count(std::make_pair(indexHead, indexRight)) > 0 && - (!(indirect_comparisons_above_only || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { + (!(indirect_comparison_above_only_ || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { data_size_t indexHeadLeft = left_right2pair_map.at(std::make_pair(indexHead, indexLeft)); data_size_t indexHeadRight = left_right2pair_map.at(std::make_pair(indexHead, indexRight)); lambdas_pairwise[indexHeadRight] += static_cast(p_lambda / comparisons); @@ -809,7 +817,7 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { data_size_t indexTail = indexTail_it->second; if (left_right2pair_map.count(std::make_pair(indexRight, indexTail)) > 0 && - (!indirect_comparisons_above_only || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && + (!indirect_comparison_above_only_ || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { data_size_t indexLeftTail = left_right2pair_map.at(std::make_pair(indexLeft, indexTail)); data_size_t indexRightTail = left_right2pair_map.at(std::make_pair(indexRight, indexTail)); @@ -856,6 +864,11 @@ class PairwiseLambdarankNDCG: public LambdarankNDCG { /*! \brief Number of pairwise data */ data_size_t num_data_pairwise_; mutable std::vector scores_pointwise_; + bool model_indirect_comparison_; + bool model_conditional_rel_; + bool indirect_comparison_above_only_; + bool logarithmic_discounts_; + bool hard_pairwise_preference_; private: const std::pair* paired_index_map_; From f1c32d395ae7cc6ae63028657cb5d04768641ea8 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 8 Nov 2024 05:13:50 +0000 Subject: [PATCH 63/68] fix bugs and pass by reference for SigmoidCache& --- include/LightGBM/objective_function.h | 2 +- include/LightGBM/utils/common.h | 2 +- src/boosting/bagging.hpp | 2 +- src/objective/rank_objective.hpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 5e764f34e9f0..88e96e463adb 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -130,7 +130,7 @@ void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointw data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, const std::multimap& right2left_map, const std::multimap < data_size_t, data_size_t>& left2right_map, const std::map, data_size_t>& left_right2pair_map, - int truncation_level, double sigma, CommonC::SigmoidCache sigmoid_cache, bool model_indirect_comparison, bool model_conditional_rel, + int truncation_level, double sigma, const CommonC::SigmoidCache& sigmoid_cache, bool model_indirect_comparison, bool model_conditional_rel, bool indirect_comparison_above_only, bool logarithmic_discounts, bool hard_pairwise_preference); } // namespace LightGBM diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 14948be45d1b..309eba7979ad 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -1275,7 +1275,7 @@ class SigmoidCache { } } - double compute(double score) { + double compute(double score) const { if (score <= min_sigmoid_input_) { // too small, use lower bound return sigmoid_table_[0]; diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 1a9f72ab7370..f50c7f3160c5 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -188,7 +188,7 @@ class BaggingSampleStrategy : public SampleStrategy { double average_bag_rate = (static_cast(bag_data_cnt_) / num_data_) / config_->bagging_freq; is_use_subset_ = false; - if (config_->device_type != std::string("cuda")) { + if (config_->device_type != std::string("cuda") && !config_->bagging_by_query) { const int group_threshold_usesubset = 200; const double average_bag_rate_threshold = 0.5; Log::Warning("train_data_->num_feature_groups() = %d", train_data_->num_feature_groups()); diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index f3b95b911554..878b4e8234e3 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -31,7 +31,7 @@ namespace LightGBM { data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, const std::multimap& right2left_map, const std::multimap < data_size_t, data_size_t>& left2right_map, const std::map, data_size_t>& left_right2pair_map, - int truncation_level, double sigma, CommonC::SigmoidCache sigmoid_cache, bool model_indirect_comparison, bool model_conditional_rel, + int truncation_level, double sigma, const CommonC::SigmoidCache& sigmoid_cache, bool model_indirect_comparison, bool model_conditional_rel, bool indirect_comparison_above_only, bool logarithmic_discounts, bool hard_pairwise_preference) { // get sorted indices for scores global_timer.Start("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 0"); From 51693e209c4738a75b8ae4ef36e00564b08f6f0a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 8 Nov 2024 05:56:51 +0000 Subject: [PATCH 64/68] add pairing approach --- include/LightGBM/config.h | 12 ++++++++++++ include/LightGBM/dataset.h | 3 ++- src/io/dataset.cpp | 3 ++- src/io/metadata.cpp | 4 ++-- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index af35018ad7e3..26762af8a46f 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -1028,6 +1028,18 @@ struct Config { // desc = used only in ``pairwise_lambdarank`` application bool pairwise_lambdarank_hard_pairwise_preference = false; + // desc = pairing appraoch for training dataset + // desc = used only in ``pairwise_lambdarank`` application + // desc = with ``relevance``, only consider pairs with difference relevance score + // desc = with ``all``, all pairs will be used + std::string pairwise_lambdarank_train_pairing_approach = std::string("relevance"); + + // desc = pairing appraoch for validation dataset + // desc = used only in ``pairwise_lambdarank`` application + // desc = with ``relevance``, only consider pairs with difference relevance score + // desc = with ``all``, all pairs will be used + std::string pairwise_lambdarank_valid_pairing_approach = std::string("relevance"); + #ifndef __NVCC__ #pragma endregion diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index a2bf2ccf9208..fcb627b14efd 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -208,9 +208,10 @@ class Metadata { * \brief Build metadata for ranking with pairwise features from metadata of an existing ranking dataset * \param metadata Reference to metadata of the existing ranking dataset * \param is_validation Whether the dataset is a validation set + * \param pairing_approach The pairing approach of this dataset * \return The number of paired data */ - data_size_t BuildPairwiseFeatureRanking(const Metadata& metadata, const bool is_validation); + data_size_t BuildPairwiseFeatureRanking(const Metadata& metadata, const bool is_validation, const std::string& pairing_approach); /*! * \brief Perform any extra operations after all data has been loaded diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 37c0a213bd6f..04e39fca366e 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -903,7 +903,8 @@ void Dataset::CreateValid(const Dataset* dataset) { } void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation, const Config& config) { - num_data_ = metadata_.BuildPairwiseFeatureRanking(dataset->metadata(), is_validation); + const std::string& pairing_approach = is_validation ? config.pairwise_lambdarank_valid_pairing_approach : config.pairwise_lambdarank_train_pairing_approach; + num_data_ = metadata_.BuildPairwiseFeatureRanking(dataset->metadata(), is_validation, pairing_approach); feature_groups_.clear(); num_features_ = dataset->num_features_ * 2; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 6c3d4ea9f047..dac9479d6e05 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -853,7 +853,7 @@ size_t Metadata::SizesInByte() const { return size; } -data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata, const bool is_validation) { +data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata, const bool is_validation, const std::string& pairing_approach) { num_queries_ = metadata.num_queries(); label_.clear(); positions_.clear(); @@ -928,7 +928,7 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata, cons continue; } const label_t label_j = label_[item_index_j]; - if (label_i != label_j || is_validation) { + if ((pairing_approach == std::string("all")) || (label_i != label_j)) { paired_ranking_item_index_map_.push_back(std::pair{item_index_i - query_start, item_index_j - query_start}); paired_ranking_item_global_index_map_.push_back(std::pair{item_index_i, item_index_j}); ++num_data_; From 5071842ffe07cbee34d91884e0e5d8415de3709c Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 8 Nov 2024 06:19:21 +0000 Subject: [PATCH 65/68] add at_least_one_relevant --- include/LightGBM/config.h | 10 ++++++---- include/LightGBM/dataset.h | 3 +-- src/io/dataset.cpp | 2 +- src/io/metadata.cpp | 6 ++++-- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 26762af8a46f..a83339a9104f 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -1030,15 +1030,17 @@ struct Config { // desc = pairing appraoch for training dataset // desc = used only in ``pairwise_lambdarank`` application - // desc = with ``relevance``, only consider pairs with difference relevance score + // desc = with ``different_relevance``, only consider pairs with difference relevance score + // desc = with ``at_least_one_relevant``, only consider pairs with at least one relevant item // desc = with ``all``, all pairs will be used - std::string pairwise_lambdarank_train_pairing_approach = std::string("relevance"); + std::string pairwise_lambdarank_train_pairing_approach = std::string("different_relevance"); // desc = pairing appraoch for validation dataset // desc = used only in ``pairwise_lambdarank`` application - // desc = with ``relevance``, only consider pairs with difference relevance score + // desc = with ``different_relevance``, only consider pairs with difference relevance score + // desc = with ``at_least_one_relevant``, only consider pairs with at least one relevant item // desc = with ``all``, all pairs will be used - std::string pairwise_lambdarank_valid_pairing_approach = std::string("relevance"); + std::string pairwise_lambdarank_valid_pairing_approach = std::string("different_relevance"); #ifndef __NVCC__ #pragma endregion diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index fcb627b14efd..129dad4e1cba 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -207,11 +207,10 @@ class Metadata { /*! * \brief Build metadata for ranking with pairwise features from metadata of an existing ranking dataset * \param metadata Reference to metadata of the existing ranking dataset - * \param is_validation Whether the dataset is a validation set * \param pairing_approach The pairing approach of this dataset * \return The number of paired data */ - data_size_t BuildPairwiseFeatureRanking(const Metadata& metadata, const bool is_validation, const std::string& pairing_approach); + data_size_t BuildPairwiseFeatureRanking(const Metadata& metadata, const std::string& pairing_approach); /*! * \brief Perform any extra operations after all data has been loaded diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 04e39fca366e..61a4a77883fe 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -904,7 +904,7 @@ void Dataset::CreateValid(const Dataset* dataset) { void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation, const Config& config) { const std::string& pairing_approach = is_validation ? config.pairwise_lambdarank_valid_pairing_approach : config.pairwise_lambdarank_train_pairing_approach; - num_data_ = metadata_.BuildPairwiseFeatureRanking(dataset->metadata(), is_validation, pairing_approach); + num_data_ = metadata_.BuildPairwiseFeatureRanking(dataset->metadata(), pairing_approach); feature_groups_.clear(); num_features_ = dataset->num_features_ * 2; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index dac9479d6e05..cc2c26803be3 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -853,7 +853,7 @@ size_t Metadata::SizesInByte() const { return size; } -data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata, const bool is_validation, const std::string& pairing_approach) { +data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata, const std::string& pairing_approach) { num_queries_ = metadata.num_queries(); label_.clear(); positions_.clear(); @@ -928,7 +928,9 @@ data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata, cons continue; } const label_t label_j = label_[item_index_j]; - if ((pairing_approach == std::string("all")) || (label_i != label_j)) { + if ((pairing_approach == std::string("all")) || + (pairing_approach == std::string("different_relevance") && label_i != label_j) || + (pairing_approach == std::string("at_least_one_relevant") && (label_i > 0 || label_j > 0))) { paired_ranking_item_index_map_.push_back(std::pair{item_index_i - query_start, item_index_j - query_start}); paired_ranking_item_global_index_map_.push_back(std::pair{item_index_i, item_index_j}); ++num_data_; From 598764b5ca06f0e1f31b1188c17e721a0129552c Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 21 Nov 2024 12:55:20 +0000 Subject: [PATCH 66/68] fix num bin for row wise in pairwise ranking --- include/LightGBM/dataset.h | 4 +++ src/boosting/gbdt.cpp | 6 +++++ src/io/dataset.cpp | 13 ++++++++-- src/io/multi_val_pairwise_lambdarank_bin.hpp | 3 ++- src/treelearner/col_sampler.hpp | 4 +++ src/treelearner/serial_tree_learner.cpp | 27 ++++++++++++++++++-- 6 files changed, 52 insertions(+), 5 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 129dad4e1cba..418aa560aff2 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -1155,6 +1155,10 @@ class Dataset { const data_size_t* train_query_boundaries_; /*! \brief stored number of queries from training dataset, for creating differential features in pairwise lambdarank */ data_size_t train_num_queries_; + /*! \brief stored number of differential features used in training dataset, for creating differential features in pairwise lambdarank */ + data_size_t num_used_differential_features_; + /*! \brief stored number of differential feature groups used in training dataset, for creating differential features in pairwise lambdarank */ + data_size_t num_used_differential_groups_; }; } // namespace LightGBM diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 770d29e7a811..c801ee4014d3 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -490,7 +490,9 @@ bool GBDT::EvalAndCheckEarlyStopping() { void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer); // update training score + Log::Warning("before update score 0"); if (!data_sample_strategy_->is_use_subset()) { + Log::Warning("before update score 1"); train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id); const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); @@ -506,16 +508,20 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { } #endif // USE_CUDA } + Log::Warning("before update score 2"); } else { + Log::Warning("before update score 3"); train_score_updater_->AddScore(tree, cur_tree_id); } + Log::Warning("before update score 4"); // update validation score for (auto& score_updater : valid_score_updater_) { score_updater->AddScore(tree, cur_tree_id); } + Log::Warning("before update score 5"); } #ifdef USE_CUDA diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 61a4a77883fe..358e4c54a8b3 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -641,7 +641,9 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of // } // } - const int num_original_features = static_cast(most_freq_bins.size()) / 2; + Log::Warning("most_freq_bins.size() = %d, num_groups_ = %d, num_used_differential_features_ = %d, num_used_differential_groups_ = %d, ncol = %d", static_cast(most_freq_bins.size()), num_groups_, num_used_differential_features_, num_used_differential_groups_, ncol); + + const int num_original_features = (static_cast(most_freq_bins.size()) - num_used_differential_groups_) / 2; std::vector original_most_freq_bins; std::vector original_offsets; for (int i = 0; i < num_original_features; ++i) { @@ -661,7 +663,7 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of fout.close(); const data_size_t num_original_data = metadata_.query_boundaries()[metadata_.num_queries()]; ret.reset(MultiValBin::CreateMultiValBin( - num_original_data, original_offsets.back(), num_original_features, + num_original_data, offsets.back(), num_original_features, 1.0 - sum_dense_ratio, original_offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map())); PushDataToMultiValBin(num_original_data, original_most_freq_bins, original_offsets, &iters, ret.get()); } else { @@ -1025,6 +1027,10 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index]; } + Log::Warning("cur_feature_index = %d", cur_feature_index); + + num_used_differential_features_ = 0; + num_used_differential_groups_ = static_cast(diff_feature_groups.size()); if (config.use_differential_feature_in_pairwise_ranking) { for (size_t i = 0; i < diff_feature_groups.size(); ++i) { const std::vector& features_in_group = diff_feature_groups[i]; @@ -1045,6 +1051,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va used_feature_map_[diff_feature_index + dataset->num_total_features_ * 2] = cur_feature_index; ++cur_feature_index; ++num_features_in_group; + ++num_used_differential_features_; const int ori_feature_index = dataset->InnerFeatureIndex(diff_original_feature_index[diff_feature_index]); ori_bin_mappers.emplace_back(new BinMapper(*dataset->FeatureBinMapper(ori_feature_index))); ori_bin_mappers_for_diff.emplace_back(new BinMapper(*dataset->FeatureBinMapper(ori_feature_index))); @@ -1080,6 +1087,8 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_va num_groups_ += static_cast(diff_feature_groups.size()); } + Log::Warning("cur_feature_index = %d", cur_feature_index); + feature_groups_.shrink_to_fit(); feature_names_.clear(); diff --git a/src/io/multi_val_pairwise_lambdarank_bin.hpp b/src/io/multi_val_pairwise_lambdarank_bin.hpp index bb75deea9407..ced631100b94 100644 --- a/src/io/multi_val_pairwise_lambdarank_bin.hpp +++ b/src/io/multi_val_pairwise_lambdarank_bin.hpp @@ -14,7 +14,8 @@ template class MULTI_VAL_BIN_TYPE> class MultiValPairwiseLambdarankBin : public MULTI_VAL_BIN_TYPE { public: MultiValPairwiseLambdarankBin(data_size_t num_data, int num_bin, int num_feature, const std::vector& offsets): MULTI_VAL_BIN_TYPE(num_data, num_bin, num_feature, offsets) { - this->num_bin_ = num_bin * 2; + this->num_bin_ = num_bin; + Log::Warning("num_bin = %d", num_bin); } protected: const std::pair* paired_ranking_item_global_index_map_; diff --git a/src/treelearner/col_sampler.hpp b/src/treelearner/col_sampler.hpp index c70b07e50efa..7ac646ed6bb4 100644 --- a/src/treelearner/col_sampler.hpp +++ b/src/treelearner/col_sampler.hpp @@ -89,6 +89,7 @@ class ColSampler { } std::vector GetByNode(const Tree* tree, int leaf) { + // Log::Warning("GetByNode step 0"); // get interaction constraints for current branch std::unordered_set allowed_features; if (!interaction_constraints_.empty()) { @@ -110,6 +111,7 @@ class ColSampler { } } + // Log::Warning("GetByNode step 1"); std::vector ret(train_data_->num_features(), 0); if (fraction_bynode_ >= 1.0f) { if (interaction_constraints_.empty()) { @@ -124,6 +126,7 @@ class ColSampler { return ret; } } + // Log::Warning("GetByNode step 2"); if (need_reset_bytree_) { auto used_feature_cnt = GetCnt(used_feature_indices_.size(), fraction_bynode_); std::vector* allowed_used_feature_indices; @@ -175,6 +178,7 @@ class ColSampler { ret[inner_feature_index] = 1; } } + // Log::Warning("GetByNode step 3"); return ret; } diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 1f19429ddddb..2e1aefc51772 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -68,7 +68,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian GetShareStates(train_data_, is_constant_hessian, true); histogram_pool_.DynamicChangeSize(train_data_, - share_state_->num_hist_total_bin(), + share_state_->num_hist_total_bin() * 2, share_state_->feature_hist_offsets(), config_, max_cache_size, config_->num_leaves); Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_); @@ -320,6 +320,8 @@ void SerialTreeLearner::BeforeTrain() { } } + // Log::Warning("smaller_leaf_splits_->leaf_index() = %d before train", smaller_leaf_splits_->leaf_index()); + larger_leaf_splits_->Init(); if (cegb_ != nullptr) { @@ -391,8 +393,12 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set* fo } bool use_subtract = parent_leaf_histogram_array_ != nullptr; + // Log::Warning("before ConstructHistograms"); ConstructHistograms(is_feature_used, use_subtract); + // Log::Warning("after ConstructHistograms"); + // Log::Warning("before FindBestSplitsFromHistograms"); FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree); + // Log::Warning("after FindBestSplitsFromHistograms"); } void SerialTreeLearner::ConstructHistograms( @@ -466,14 +472,19 @@ void SerialTreeLearner::ConstructHistograms( void SerialTreeLearner::FindBestSplitsFromHistograms( const std::vector& is_feature_used, bool use_subtract, const Tree* tree) { + // Log::Warning("FindBestSplitsFromHistograms step 0"); Common::FunctionTimer fun_timer( "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer); + // Log::Warning("FindBestSplitsFromHistograms step 0.1"); std::vector smaller_best(share_state_->num_threads); std::vector larger_best(share_state_->num_threads); + // Log::Warning("smaller_leaf_splits_->leaf_index() = %d", smaller_leaf_splits_->leaf_index()); std::vector smaller_node_used_features = col_sampler_.GetByNode(tree, smaller_leaf_splits_->leaf_index()); std::vector larger_node_used_features; + // Log::Warning("FindBestSplitsFromHistograms step 0.2"); double smaller_leaf_parent_output = GetParentOutput(tree, smaller_leaf_splits_.get()); double larger_leaf_parent_output = 0; + // Log::Warning("FindBestSplitsFromHistograms step 0.3"); if (larger_leaf_splits_ != nullptr && larger_leaf_splits_->leaf_index() >= 0) { larger_leaf_parent_output = GetParentOutput(tree, larger_leaf_splits_.get()); } @@ -481,6 +492,8 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( larger_node_used_features = col_sampler_.GetByNode(tree, larger_leaf_splits_->leaf_index()); } + // Log::Warning("FindBestSplitsFromHistograms step 1"); + if (use_subtract && config_->use_quantized_grad) { const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index()); const uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode(parent_index); @@ -500,15 +513,18 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } } + // Log::Warning("FindBestSplitsFromHistograms step 2"); + OMP_INIT_EX(); // find splits -#pragma omp parallel for schedule(static) num_threads(share_state_->num_threads) +// #pragma omp parallel for schedule(static) num_threads(share_state_->num_threads) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); if (!is_feature_used[feature_index]) { continue; } const int tid = omp_get_thread_num(); + // Log::Warning("FindBestSplitsFromHistograms step 2.1"); if (config_->use_quantized_grad) { const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf(smaller_leaf_splits_->leaf_index()); const int64_t int_sum_gradient_and_hessian = smaller_leaf_splits_->int_sum_gradients_and_hessians(); @@ -529,6 +545,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } int real_fidx = train_data_->RealFeatureIndex(feature_index); + // Log::Warning("FindBestSplitsFromHistograms step 2.2"); ComputeBestSplitForFeature(smaller_leaf_histogram_array_, feature_index, real_fidx, smaller_node_used_features[feature_index], @@ -542,6 +559,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( continue; } + // Log::Warning("FindBestSplitsFromHistograms step 2.3"); if (use_subtract) { if (config_->use_quantized_grad) { const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index()); @@ -589,6 +607,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } } + // Log::Warning("FindBestSplitsFromHistograms step 2.4"); ComputeBestSplitForFeature(larger_leaf_histogram_array_, feature_index, real_fidx, larger_node_used_features[feature_index], @@ -599,6 +618,10 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( OMP_LOOP_EX_END(); } OMP_THROW_EX(); + + + // Log::Warning("FindBestSplitsFromHistograms step 3"); + auto smaller_best_idx = ArrayArgs::ArgMax(smaller_best); int leaf = smaller_leaf_splits_->leaf_index(); best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; From f7deab47c69bc990f1e5849cc93b947202b89f83 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 17 Dec 2024 03:33:35 +0000 Subject: [PATCH 67/68] save for debug --- src/boosting/gbdt.cpp | 13 +++++++++++++ src/io/multi_val_dense_bin.hpp | 5 +++++ src/io/train_share_states.cpp | 21 +++++++++++++++++++++ src/treelearner/serial_tree_learner.cpp | 21 +++++++++++++++++++++ 4 files changed, 60 insertions(+) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index c801ee4014d3..c22ecfc561ba 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -343,6 +343,7 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer); + Log::Warning("TrainOneIter step -10"); std::vector init_scores(num_tree_per_iteration_, 0.0); // boosting first if (gradients == nullptr || hessians == nullptr) { @@ -373,6 +374,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } } + Log::Warning("TrainOneIter step -9"); // bagging logic if (!config_->bagging_by_query) { data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); @@ -385,6 +387,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { ResetGradientBuffers(); } + Log::Warning("TrainOneIter step -8"); bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { const size_t offset = static_cast(cur_tree_id) * num_data_; @@ -402,9 +405,12 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { hess = hessians_pointer_ + offset; } bool is_first_tree = models_.size() < static_cast(num_tree_per_iteration_); + Log::Warning("TrainOneIter step -7"); new_tree.reset(tree_learner_->Train(grad, hess, is_first_tree)); } + Log::Warning("TrainOneIter step 0"); + if (new_tree->num_leaves() > 1) { should_continue = true; auto score_ptr = train_score_updater_->score() + offset; @@ -424,18 +430,24 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { if (objective_function_ != nullptr && !config_->boost_from_average && !train_score_updater_->has_init_score()) { init_scores[cur_tree_id] = ObtainAutomaticInitialScore(objective_function_, cur_tree_id); // updates scores + Log::Warning("TrainOneIter step 0.1"); train_score_updater_->AddScore(init_scores[cur_tree_id], cur_tree_id); + Log::Warning("TrainOneIter step 0.2"); for (auto& score_updater : valid_score_updater_) { score_updater->AddScore(init_scores[cur_tree_id], cur_tree_id); } + Log::Warning("TrainOneIter step 0.3"); } new_tree->AsConstantTree(init_scores[cur_tree_id]); + Log::Warning("TrainOneIter step 0.4"); } } // add model models_.push_back(std::move(new_tree)); } + Log::Warning("TrainOneIter step 1"); + if (!should_continue) { Log::Warning("Stopped training because there are no more leaves that meet the split requirements"); if (models_.size() > static_cast(num_tree_per_iteration_)) { @@ -446,6 +458,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { return true; } + Log::Warning("TrainOneIter step 2"); ++iter_; return false; } diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index a2dd286eb9f0..cabff9dcae25 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -260,14 +260,19 @@ class MultiValDenseBin : public MultiValBin { void ReSize(data_size_t num_data, int num_bin, int num_feature, double, const std::vector& offsets) override { + Log::Warning("ReSize step 0"); num_data_ = num_data; num_bin_ = num_bin; num_feature_ = num_feature; offsets_ = offsets; + Log::Warning("ReSize step 1"); + Log::Warning("data_.size() = %ld", data_.size()); size_t new_size = static_cast(num_feature_) * num_data_; + Log::Warning("new_size = %ld", new_size); if (data_.size() < new_size) { data_.resize(new_size, 0); } + Log::Warning("ReSize step 2"); } template diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index 26deaaa70981..d5573533b65b 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -31,11 +31,14 @@ void MultiValBinWrapper::InitTrain(const std::vector& group_feature_start, if (multi_val_bin_ == nullptr) { return; } + Log::Warning("MultiValBinWrapper::InitTrain step 0"); CopyMultiValBinSubset(group_feature_start, feature_groups, is_feature_used, bagging_use_indices, bagging_indices_cnt); + Log::Warning("MultiValBinWrapper::InitTrain step 1"); const auto cur_multi_val_bin = (is_use_subcol_ || is_use_subrow_) ? multi_val_bin_subset_.get() : multi_val_bin_.get(); + Log::Warning("MultiValBinWrapper::InitTrain step 2"); if (cur_multi_val_bin != nullptr) { num_bin_ = cur_multi_val_bin->num_bin(); num_bin_aligned_ = (num_bin_ + kAlignedSize - 1) / kAlignedSize * kAlignedSize; @@ -44,6 +47,7 @@ void MultiValBinWrapper::InitTrain(const std::vector& group_feature_start, (num_element_per_row + kZeroThreshold)) + 1, 1024); min_block_size_ = std::max(min_block_size_, 32); } + Log::Warning("MultiValBinWrapper::InitTrain step 3"); } template @@ -227,6 +231,7 @@ void MultiValBinWrapper::CopyMultiValBinSubset( int num_used = 0; int total = 0; std::vector used_feature_index; + Log::Warning("CopyMultiValBinSubset step 0"); for (int i : feature_groups_contained_) { int f_start = group_feature_start[i]; if (feature_groups[i]->is_multi_val_) { @@ -259,8 +264,10 @@ void MultiValBinWrapper::CopyMultiValBinSubset( ++total; } } + Log::Warning("CopyMultiValBinSubset step 1"); const double k_subfeature_threshold = 0.6; if (sum_used_dense_ratio >= sum_dense_ratio * k_subfeature_threshold) { + Log::Warning("CopyMultiValBinSubset step 2"); // only need to copy subset if (is_use_subrow_ && !is_subrow_copied_) { if (multi_val_bin_subset_ == nullptr) { @@ -279,6 +286,7 @@ void MultiValBinWrapper::CopyMultiValBinSubset( is_subrow_copied_ = true; } } else { + Log::Warning("CopyMultiValBinSubset step 3"); is_use_subcol_ = true; std::vector upper_bound; std::vector lower_bound; @@ -292,9 +300,12 @@ void MultiValBinWrapper::CopyMultiValBinSubset( int num_total_bin = offset; int new_num_total_bin = offset; offsets.push_back(static_cast(new_num_total_bin)); + Log::Warning("CopyMultiValBinSubset step 3.1"); for (int i : feature_groups_contained_) { int f_start = group_feature_start[i]; + Log::Warning("CopyMultiValBinSubset step 3.2"); if (feature_groups[i]->is_multi_val_) { + Log::Warning("CopyMultiValBinSubset step 3.3"); for (int j = 0; j < feature_groups[i]->num_feature_; ++j) { const auto& bin_mapper = feature_groups[i]->bin_mappers_[j]; if (i == 0 && j == 0 && bin_mapper->GetMostFreqBin() > 0) { @@ -320,6 +331,7 @@ void MultiValBinWrapper::CopyMultiValBinSubset( } } } else { + Log::Warning("CopyMultiValBinSubset step 3.4"); bool is_group_used = false; for (int j = 0; j < feature_groups[i]->num_feature_; ++j) { if (is_feature_used[f_start + j]) { @@ -327,9 +339,12 @@ void MultiValBinWrapper::CopyMultiValBinSubset( break; } } + Log::Warning("CopyMultiValBinSubset step 3.5"); int cur_num_bin = feature_groups[i]->bin_offsets_.back() - offset; num_total_bin += cur_num_bin; + Log::Warning("CopyMultiValBinSubset step 3.6"); if (is_group_used) { + Log::Warning("CopyMultiValBinSubset step 3.7"); new_num_total_bin += cur_num_bin; offsets.push_back(static_cast(new_num_total_bin)); lower_bound.push_back(num_total_bin - cur_num_bin); @@ -345,16 +360,21 @@ void MultiValBinWrapper::CopyMultiValBinSubset( } } // avoid out of range + Log::Warning("CopyMultiValBinSubset step 3.8"); lower_bound.push_back(num_total_bin); upper_bound.push_back(num_total_bin); + Log::Warning("CopyMultiValBinSubset step 3.9"); data_size_t num_data = is_use_subrow_ ? bagging_indices_cnt : num_data_; if (multi_val_bin_subset_ == nullptr) { + Log::Warning("CopyMultiValBinSubset step 3.9.1"); multi_val_bin_subset_.reset(multi_val_bin_->CreateLike( num_data, new_num_total_bin, num_used, sum_used_dense_ratio, offsets)); } else { + Log::Warning("CopyMultiValBinSubset step 3.9.2"); multi_val_bin_subset_->ReSize(num_data, new_num_total_bin, num_used, sum_used_dense_ratio, offsets); } + Log::Warning("CopyMultiValBinSubset step 3.10"); if (is_use_subrow_) { multi_val_bin_subset_->CopySubrowAndSubcol( multi_val_bin_.get(), bagging_use_indices, @@ -367,6 +387,7 @@ void MultiValBinWrapper::CopyMultiValBinSubset( multi_val_bin_.get(), used_feature_index, lower_bound, upper_bound, delta); } } + Log::Warning("CopyMultiValBinSubset step 4"); } void TrainingShareStates::CalcBinOffsets(const std::vector>& feature_groups, diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 2e1aefc51772..2d284732580b 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -189,18 +189,23 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians } share_state_->num_threads = num_threads; + Log::Warning("Train step 0"); + if (config_->use_quantized_grad) { gradient_discretizer_->DiscretizeGradients(num_data_, gradients_, hessians_); } + Log::Warning("Train step 1"); // some initial works before training BeforeTrain(); + Log::Warning("Train step 2"); bool track_branch_features = !(config_->interaction_constraints_vector.empty()); auto tree = std::unique_ptr(new Tree(config_->num_leaves, track_branch_features, false)); auto tree_ptr = tree.get(); constraints_->ShareTreePointer(tree_ptr); + Log::Warning("Train step 3"); // root leaf int left_leaf = 0; int cur_depth = 1; @@ -209,8 +214,10 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians int init_splits = ForceSplits(tree_ptr, &left_leaf, &right_leaf, &cur_depth); + Log::Warning("Train step 4"); for (int split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split + Log::Warning("Train step 5, split = %d", split); if (BeforeFindBestSplit(tree_ptr, left_leaf, right_leaf)) { // find best threshold for every feature FindBestSplits(tree_ptr); @@ -225,15 +232,18 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians break; } // split tree with best leaf + Log::Warning("Train step 6, split = %d", split); Split(tree_ptr, best_leaf, &left_leaf, &right_leaf); cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } + Log::Warning("Train step 7"); if (config_->use_quantized_grad && config_->quant_train_renew_leaf) { gradient_discretizer_->RenewIntGradTreeOutput(tree.get(), config_, data_partition_.get(), gradients_, hessians_, [this] (int leaf_index) { return GetGlobalDataCountInLeaf(leaf_index); }); } + Log::Warning("Train step 8"); Log::Debug("Trained a tree with leaves = %d and depth = %d", tree->num_leaves(), cur_depth); return tree.release(); } @@ -282,20 +292,28 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const std::vect void SerialTreeLearner::BeforeTrain() { Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeTrain", global_timer); // reset histogram pool + Log::Warning("BeforeTrain step 0"); + histogram_pool_.ResetMap(); + Log::Warning("BeforeTrain step 1"); col_sampler_.ResetByTree(); + Log::Warning("BeforeTrain step 1.1"); train_data_->InitTrain(col_sampler_.is_feature_used_bytree(), share_state_.get()); + Log::Warning("BeforeTrain step 1.2"); // initialize data partition data_partition_->Init(); + Log::Warning("BeforeTrain step 2"); constraints_->Reset(); + Log::Warning("BeforeTrain step 3"); // reset the splits for leaves for (int i = 0; i < config_->num_leaves; ++i) { best_split_per_leaf_[i].Reset(); } + Log::Warning("BeforeTrain step 4"); // Sumup for root if (data_partition_->leaf_count(0) == num_data_) { // use all data @@ -320,6 +338,7 @@ void SerialTreeLearner::BeforeTrain() { } } + Log::Warning("BeforeTrain step 5"); // Log::Warning("smaller_leaf_splits_->leaf_index() = %d before train", smaller_leaf_splits_->leaf_index()); larger_leaf_splits_->Init(); @@ -328,9 +347,11 @@ void SerialTreeLearner::BeforeTrain() { cegb_->BeforeTrain(); } + Log::Warning("BeforeTrain step 6"); if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) { gradient_discretizer_->SetNumBitsInHistogramBin(0, -1, data_partition_->leaf_count(0), 0); } + Log::Warning("BeforeTrain step 7"); } bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { From 0d1b310f0c3e60bf23d0808bda9c09e7606546f6 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 18 Dec 2024 09:15:03 +0000 Subject: [PATCH 68/68] update doc --- docs/Parameters.rst | 54 ++++++++++++++++++++++++++++++++++++++++++ src/io/config_auto.cpp | 32 +++++++++++++++++-------- 2 files changed, 76 insertions(+), 10 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 0e46eb240c27..9ce8a3b77ab0 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1226,6 +1226,60 @@ Objective Parameters - used only in ``pairwise_lambdarank`` application +- ``pairwise_lambdarank_model_indirect_comparison`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to additionaly perform indirect document comparison in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + +- ``pairwise_lambdarank_model_conditional_rel`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to model conditional document relevance (given documents ranked above) in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + +- ``pairwise_lambdarank_indirect_comparison_above_only`` :raw-html:`🔗︎`, default = ``true``, type = bool + + - whether to limit the indirect document comparison to only auxilliary documents ranked above in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + +- ``pairwise_lambdarank_logarithmic_discounts`` :raw-html:`🔗︎`, default = ``true``, type = bool + + - whether to use logarithmic discounts when converting pairwise scores into pointwise in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + +- ``pairwise_lambdarank_hard_pairwise_preference`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to use hard pairwise preference when converting pairwise scores into pointwise in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + +- ``pairwise_lambdarank_train_pairing_approach`` :raw-html:`🔗︎`, default = ``std::string("different_relevance")``, type = string + + - pairing appraoch for training dataset + + - used only in ``pairwise_lambdarank`` application + + - with ``different_relevance``, only consider pairs with difference relevance score + + - with ``at_least_one_relevant``, only consider pairs with at least one relevant item + + - with ``all``, all pairs will be used + +- ``pairwise_lambdarank_valid_pairing_approach`` :raw-html:`🔗︎`, default = ``std::string("different_relevance")``, type = string + + - pairing appraoch for validation dataset + + - used only in ``pairwise_lambdarank`` application + + - with ``different_relevance``, only consider pairs with difference relevance score + + - with ``at_least_one_relevant``, only consider pairs with at least one relevant item + + - with ``all``, all pairs will be used + Metric Parameters ----------------- diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 7700c00b46d3..85927d4a95a2 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -313,6 +313,8 @@ const std::unordered_set& Config::parameter_set() { "pairwise_lambdarank_indirect_comparison_above_only", "pairwise_lambdarank_logarithmic_discounts", "pairwise_lambdarank_hard_pairwise_preference", + "pairwise_lambdarank_train_pairing_approach", + "pairwise_lambdarank_valid_pairing_approach", "metric", "metric_freq", "is_provide_training_metric", @@ -648,6 +650,10 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet {"label_gain", {}}, {"lambdarank_position_bias_regularization", {}}, {"use_differential_feature_in_pairwise_ranking", {}}, - {"pairwise_lambdarank_model_indirect_comparison", {} }, - {"pairwise_lambdarank_model_conditional_rel", {} }, - {"pairwise_lambdarank_indirect_comparison_above_only", {} }, - {"pairwise_lambdarank_logarithmic_discounts", {} }, - {"pairwise_lambdarank_hard_pairwise_preference", {} }, + {"pairwise_lambdarank_model_indirect_comparison", {}}, + {"pairwise_lambdarank_model_conditional_rel", {}}, + {"pairwise_lambdarank_indirect_comparison_above_only", {}}, + {"pairwise_lambdarank_logarithmic_discounts", {}}, + {"pairwise_lambdarank_hard_pairwise_preference", {}}, + {"pairwise_lambdarank_train_pairing_approach", {}}, + {"pairwise_lambdarank_valid_pairing_approach", {}}, {"metric", {"metrics", "metric_types"}}, {"metric_freq", {"output_freq"}}, {"is_provide_training_metric", {"training_metric", "is_training_metric", "train_metric"}}, @@ -1092,11 +1102,13 @@ const std::unordered_map& Config::ParameterTypes() { {"label_gain", "vector"}, {"lambdarank_position_bias_regularization", "double"}, {"use_differential_feature_in_pairwise_ranking", "bool"}, - {"pairwise_lambdarank_model_indirect_comparison", "bool" }, - {"pairwise_lambdarank_model_conditional_rel", "bool" }, - {"pairwise_lambdarank_indirect_comparison_above_only", "bool" }, - {"pairwise_lambdarank_logarithmic_discounts", "bool" }, - {"pairwise_lambdarank_hard_pairwise_preference", "bool" }, + {"pairwise_lambdarank_model_indirect_comparison", "bool"}, + {"pairwise_lambdarank_model_conditional_rel", "bool"}, + {"pairwise_lambdarank_indirect_comparison_above_only", "bool"}, + {"pairwise_lambdarank_logarithmic_discounts", "bool"}, + {"pairwise_lambdarank_hard_pairwise_preference", "bool"}, + {"pairwise_lambdarank_train_pairing_approach", "string"}, + {"pairwise_lambdarank_valid_pairing_approach", "string"}, {"metric", "vector"}, {"metric_freq", "int"}, {"is_provide_training_metric", "bool"},