diff --git a/CMakeLists.txt b/CMakeLists.txt index 048818ff1c1b..00351e12d1cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -439,6 +439,8 @@ set( src/io/parser.cpp src/io/train_share_states.cpp src/io/tree.cpp + src/io/pairwise_lambdarank_bin.cpp + src/io/pairwise_ranking_feature_group.cpp src/metric/dcg_calculator.cpp src/metric/metric.cpp src/network/linker_topo.cpp diff --git a/docs/Parameters.rst b/docs/Parameters.rst index baaf20c4e5ad..9ce8a3b77ab0 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -168,6 +168,8 @@ Core Parameters - ``rank_xendcg`` is faster than and achieves the similar performance as ``lambdarank`` + - ``pairwise_lambdarank``, pairwise lambdarank algorithm + - label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect) - custom objective function (gradients and hessians not computed directly by LightGBM) @@ -413,6 +415,10 @@ Learning Control Parameters - random seed for bagging +- ``bagging_by_query`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to do bagging sample by query + - ``feature_fraction`` :raw-html:`🔗︎`, default = ``1.0``, type = double, aliases: ``sub_feature``, ``colsample_bytree``, constraints: ``0.0 < feature_fraction <= 1.0`` - LightGBM will randomly select a subset of features on each iteration (tree) if ``feature_fraction`` is smaller than ``1.0``. 
For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree @@ -1214,6 +1220,66 @@ Objective Parameters - *New in version 4.1.0* +- ``use_differential_feature_in_pairwise_ranking`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to use differential features in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + +- ``pairwise_lambdarank_model_indirect_comparison`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to additionally perform indirect document comparison in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + +- ``pairwise_lambdarank_model_conditional_rel`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to model conditional document relevance (given documents ranked above) in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + +- ``pairwise_lambdarank_indirect_comparison_above_only`` :raw-html:`🔗︎`, default = ``true``, type = bool + + - whether to limit the indirect document comparison to only auxiliary documents ranked above in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + +- ``pairwise_lambdarank_logarithmic_discounts`` :raw-html:`🔗︎`, default = ``true``, type = bool + + - whether to use logarithmic discounts when converting pairwise scores into pointwise in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + +- ``pairwise_lambdarank_hard_pairwise_preference`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to use hard pairwise preference when converting pairwise scores into pointwise in pairwise ranking + + - used only in ``pairwise_lambdarank`` application + +- ``pairwise_lambdarank_train_pairing_approach`` :raw-html:`🔗︎`, default = ``std::string("different_relevance")``, type = string + + - pairing approach for training dataset + + - used only in ``pairwise_lambdarank`` application + + - with ``different_relevance``, only consider pairs 
with different relevance scores + + - with ``at_least_one_relevant``, only consider pairs with at least one relevant item + + - with ``all``, all pairs will be used + +- ``pairwise_lambdarank_valid_pairing_approach`` :raw-html:`🔗︎`, default = ``std::string("different_relevance")``, type = string + + - pairing approach for validation dataset + + - used only in ``pairwise_lambdarank`` application + + - with ``different_relevance``, only consider pairs with different relevance scores + + - with ``at_least_one_relevant``, only consider pairs with at least one relevant item + + - with ``all``, all pairs will be used + Metric Parameters ----------------- diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index a33fcfa9c45c..0747417f81cf 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -14,6 +14,7 @@ #include #include #include +#include #include namespace LightGBM { @@ -305,6 +306,10 @@ class Bin { */ virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const = 0; + virtual BinIterator* GetUnpairedIterator(uint32_t /* min_bin */, uint32_t /* max_bin */, uint32_t /* most_freq_bin */) const { + return nullptr; + } + /*! * \brief Save binary data to file * \param file File want to write @@ -466,6 +471,64 @@ class Bin { */ static Bin* CreateSparseBin(data_size_t num_data, int num_bin); + /*! + * \brief Create object for bin data of the first feature in pair, used for pairwise ranking, for an original dense bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \return The bin data object + */ + static Bin* CreateDensePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); + + /*! 
+ * \brief Create object for bin data of the first feature in pair, used for pairwise ranking, for an original sparse bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \return The bin data object + */ + static Bin* CreateSparsePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); + + /*! + * \brief Create object for bin data of the second feature in pair, used for pairwise ranking, for an original dense bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \return The bin data object + */ + static Bin* CreateDensePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); + + /*! + * \brief Create object for bin data of the second feature in pair, used for pairwise ranking, for an original sparse bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \return The bin data object + */ + static Bin* CreateSparsePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); + + /*! 
+ * \brief Create object for bin data of the differential feature in pair, used for pairwise ranking, for an original dense bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \param diff_bin_mappers Bin mappers for differential features in this group + * \param bin_offsets Bin offsets in feature group + * \return The bin data object + */ + static Bin* CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets); + + /*! + * \brief Create object for bin data of the differential feature in pair, used for pairwise ranking, for an original sparse bin + * \param num_data Size of the pairwise dataset + * \param num_bin Number of bin + * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair + * \param diff_bin_mappers Bin mappers for differential features in this group + * \param bin_offsets Bin offsets in feature group + * \return The bin data object + */ + static Bin* CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets); + /*! 
* \brief Deep copy the bin */ @@ -474,6 +537,8 @@ class Bin { virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const = 0; virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0; + + int group_index_ = -1; }; @@ -495,6 +560,8 @@ class MultiValBin { const data_size_t* used_indices, data_size_t num_used_indices) = 0; + virtual void DumpContent() const {} + virtual MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double estimate_element_per_row, @@ -588,12 +655,14 @@ class MultiValBin { virtual bool IsSparse() = 0; static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin, - int num_feature, double sparse_rate, const std::vector& offsets); + int num_feature, double sparse_rate, const std::vector& offsets, const bool use_pairwise_ranking, + const std::pair* paired_ranking_item_global_index_map); static MultiValBin* CreateMultiValDenseBin(data_size_t num_data, int num_bin, - int num_feature, const std::vector& offsets); + int num_feature, const std::vector& offsets, const bool use_pairwise_ranking, + const std::pair* paired_ranking_item_global_index_map); - static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row); + static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row, const bool use_pairwise_ranking, const std::pair* paired_ranking_item_global_index_map); static constexpr double multi_val_bin_sparse_threshold = 0.25f; diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 4f83898a28c9..a83339a9104f 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -36,6 +36,11 @@ enum TaskType { }; const int kDefaultNumLeaves = 31; +/*! 
\brief Types of pairwise ranking mode */ +enum PairwiseRankingMode { + kNone, kFull, kRelevance, kManual +}; + struct Config { public: Config() {} @@ -157,6 +162,7 @@ struct Config { // descl2 = ``lambdarank``, `lambdarank `__ objective. `label_gain <#label_gain>`__ can be used to set the gain (weight) of ``int`` label and all values in ``label`` must be smaller than number of elements in ``label_gain`` // descl2 = ``rank_xendcg``, `XE_NDCG_MART `__ ranking objective function, aliases: ``xendcg``, ``xe_ndcg``, ``xe_ndcg_mart``, ``xendcg_mart`` // descl2 = ``rank_xendcg`` is faster than and achieves the similar performance as ``lambdarank`` + // descl2 = ``pairwise_lambdarank``, pairwise lambdarank algorithm // descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect) // desc = custom objective function (gradients and hessians not computed directly by LightGBM) // descl2 = ``custom`` @@ -358,6 +364,9 @@ struct Config { // desc = random seed for bagging int bagging_seed = 3; + // desc = whether to do bagging sample by query + bool bagging_by_query = false; + // alias = sub_feature, colsample_bytree // check = >0.0 // check = <=1.0 @@ -995,6 +1004,44 @@ struct Config { // desc = *New in version 4.1.0* double lambdarank_position_bias_regularization = 0.0; + // desc = whether to use differential features in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool use_differential_feature_in_pairwise_ranking = false; + + // desc = whether to additionaly perform indirect document comparison in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool pairwise_lambdarank_model_indirect_comparison = false; + + // desc = whether to model conditional document relevance (given documents ranked above) in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool pairwise_lambdarank_model_conditional_rel = false; + + // desc = 
whether to limit the indirect document comparison to only auxiliary documents ranked above in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool pairwise_lambdarank_indirect_comparison_above_only = true; + + // desc = whether to use logarithmic discounts when converting pairwise scores into pointwise in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool pairwise_lambdarank_logarithmic_discounts = true; + + // desc = whether to use hard pairwise preference when converting pairwise scores into pointwise in pairwise ranking + // desc = used only in ``pairwise_lambdarank`` application + bool pairwise_lambdarank_hard_pairwise_preference = false; + + // desc = pairing approach for training dataset + // desc = used only in ``pairwise_lambdarank`` application + // desc = with ``different_relevance``, only consider pairs with different relevance scores + // desc = with ``at_least_one_relevant``, only consider pairs with at least one relevant item + // desc = with ``all``, all pairs will be used + std::string pairwise_lambdarank_train_pairing_approach = std::string("different_relevance"); + + // desc = pairing approach for validation dataset + // desc = used only in ``pairwise_lambdarank`` application + // desc = with ``different_relevance``, only consider pairs with different relevance scores + // desc = with ``at_least_one_relevant``, only consider pairs with at least one relevant item + // desc = with ``all``, all pairs will be used + std::string pairwise_lambdarank_valid_pairing_approach = std::string("different_relevance"); + #ifndef __NVCC__ #pragma endregion diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp index 465ed334156c..a7877361cd09 100644 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -49,6 +49,11 @@ class CUDAObjectiveInterface: public HOST_OBJECTIVE { 
SynchronizeCUDADevice(__FILE__, __LINE__); } + void GetGradients(const double* scores, const data_size_t /*num_sampled_queries*/, const data_size_t* /*sampled_query_indices*/, score_t* gradients, score_t* hessians) const override { + LaunchGetGradientsKernel(scores, gradients, hessians); + SynchronizeCUDADevice(__FILE__, __LINE__); + } + void RenewTreeOutputCUDA(const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override { global_timer.Start("CUDAObjectiveInterface::LaunchRenewTreeOutputCUDAKernel"); diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 220a1f9f009c..418aa560aff2 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -204,6 +204,14 @@ class Metadata { const double* init_scores, const int32_t* queries); + /*! + * \brief Build metadata for ranking with pairwise features from metadata of an existing ranking dataset + * \param metadata Reference to metadata of the existing ranking dataset + * \param pairing_approach The pairing approach of this dataset + * \return The number of paired data + */ + data_size_t BuildPairwiseFeatureRanking(const Metadata& metadata, const std::string& pairing_approach); + /*! * \brief Perform any extra operations after all data has been loaded */ @@ -252,11 +260,40 @@ class Metadata { return position_ids_.size(); } + /*! + * \brief Get the pairwise item index map within query in ranking with pairwise features + * \return Pointer to the pairwise item index map within query + */ + inline const std::pair* paired_ranking_item_index_map() const { + if (!paired_ranking_item_index_map_.empty()) { + return paired_ranking_item_index_map_.data(); + } else { + return nullptr; + } + } + + /*! 
+ * \brief Get the pairwise item global index map in ranking with pairwise features + * \return Pointer to the pairwise item global index map + */ + inline const std::pair* paired_ranking_item_global_index_map() const { + if (!paired_ranking_item_global_index_map_.empty()) { + return paired_ranking_item_global_index_map_.data(); + } else { + return nullptr; + } + } + + inline data_size_t paired_ranking_item_index_map_size() const { + return static_cast(paired_ranking_item_index_map_.size()); + } + /*! * \brief Get data boundaries on queries, if not exists, will return nullptr * we assume data will order by query, * the interval of [query_boundaris[i], query_boundaris[i+1]) * is the data indices for query i. + * When pairwise ranking, this points to the paired query boundaries. * \return Pointer of data boundaries on queries */ inline const data_size_t* query_boundaries() const { @@ -267,6 +304,18 @@ class Metadata { } } + /*! + * \brief Used in pairwise ranking. Pointwise query boundaries. + * \return Pointer of data boundaries on queries + */ + inline const data_size_t* pairwise_query_boundaries() const { + if (!pairwise_query_boundaries_.empty()) { + return pairwise_query_boundaries_.data(); + } else { + return nullptr; + } + } + /*! * \brief Get Number of queries * \return Number of queries @@ -364,7 +413,7 @@ class Metadata { data_size_t num_weights_; /*! \brief Number of positions, used to check correct position file */ data_size_t num_positions_; - /*! \brief Label data */ + /*! \brief Label data. In pairwise ranking, the label_ refer to the labels of the original unpaired dataset. */ std::vector label_; /*! \brief Weights data */ std::vector weights_; @@ -374,6 +423,8 @@ class Metadata { std::vector position_ids_; /*! \brief Query boundaries */ std::vector query_boundaries_; + /*! \brief Original query boundaries, used in pairwise ranking */ + std::vector pairwise_query_boundaries_; /*! \brief Query weights */ std::vector query_weights_; /*! 
\brief Number of querys */ @@ -384,6 +435,12 @@ class Metadata { std::vector init_score_; /*! \brief Queries data */ std::vector queries_; + /*! \brief Mode for pairwise ranking */ + PairwiseRankingMode pairwise_ranking_mode_ = PairwiseRankingMode::kRelevance; + /*! \brief Pairwise data index within query to original data indices for ranking with pairwise features */ + std::vector> paired_ranking_item_index_map_; + /*! \brief Pairwise global data index to original data indices for ranking with pairwise features */ + std::vector> paired_ranking_item_global_index_map_; /*! \brief mutex for threading safe call */ std::mutex mutex_; bool weight_load_from_file_; @@ -659,15 +716,16 @@ class Dataset { void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data); - MultiValBin* GetMultiBinFromSparseFeatures(const std::vector& offsets) const; + MultiValBin* GetMultiBinFromSparseFeatures(const std::vector& offsets, const bool use_pairwise_ranking) const; - MultiValBin* GetMultiBinFromAllFeatures(const std::vector& offsets) const; + MultiValBin* GetMultiBinFromAllFeatures(const std::vector& offsets, const bool use_pairwise_ranking) const; template TrainingShareStates* GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, - bool force_col_wise, bool force_row_wise, const int num_grad_quant_bins) const; + bool force_col_wise, bool force_row_wise, const int num_grad_quant_bins, + const bool use_pairwise_ranking) const; LIGHTGBM_EXPORT void FinishLoad(); @@ -701,6 +759,8 @@ class Dataset { LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset); + LIGHTGBM_EXPORT void CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation, const Config& config); + void InitTrain(const std::vector& is_feature_used, TrainingShareStates* share_state) const; @@ -782,6 +842,13 @@ class Dataset { } } + void PrintGroupFeatureInfo(int group_index) const { + 
for (int sub_feature = 0; sub_feature < group_feature_cnt_[group_index]; ++sub_feature) { + const BinMapper* bin_mapper = feature_groups_[group_index]->bin_mappers_[sub_feature].get(); + Log::Warning("sub_feature = %d, missing_type = %d, most_freq_bin = %d", sub_feature, bin_mapper->missing_type(), bin_mapper->GetMostFreqBin()); + } + } + inline int FeatureNumBin(int i) const { const int group = feature2group_[i]; const int sub_feature = feature2subfeature_[i]; @@ -1003,6 +1070,25 @@ class Dataset { void CreateCUDAColumnData(); + /*! \brief Create differential features for pairwise lambdarank + * \param sample_values sampled values from the file + * \param sample_indices sampled data indices from the file + * \param bin_mappers bin mappers of the original features + * \param filter_cnt filter count for bin finding + * \param num_total_sample_data number of all sampled data + * \param differential_feature_bin_mappers output differential feature bin mapppers + */ + void CreatePairwiseRankingDifferentialFeatures( + const std::vector>& sample_values, + const std::vector>& sample_indices, + const std::vector>& bin_mappers, + const data_size_t num_total_sample_data, + const data_size_t* query_boundaries, + const data_size_t num_queries, + std::vector>* differential_feature_bin_mappers, + std::vector* diff_original_feature_index, + const Config& config) const; + std::string data_filename_; /*! \brief Store used features */ std::vector> feature_groups_; @@ -1058,6 +1144,21 @@ class Dataset { #endif // USE_CUDA std::string parser_config_str_; + + /*! \brief stored sampled features, for creating differential features in pairwise lambdarank */ + std::shared_ptr>> sampled_values_; + /*! \brief stored sampled data indices, for creating differential features in pairwise lambdarank */ + std::shared_ptr>> sampled_indices_; + /*! \brief stored number of totally sampled data, for creating differential features in pairwise lambdarank */ + data_size_t num_total_sampled_data_; + /*! 
\brief stored query boundaries from training dataset, for creating differential features in pairwise lambdarank */ + const data_size_t* train_query_boundaries_; + /*! \brief stored number of queries from training dataset, for creating differential features in pairwise lambdarank */ + data_size_t train_num_queries_; + /*! \brief stored number of differential features used in training dataset, for creating differential features in pairwise lambdarank */ + data_size_t num_used_differential_features_; + /*! \brief stored number of differential feature groups used in training dataset, for creating differential features in pairwise lambdarank */ + data_size_t num_used_differential_groups_; }; } // namespace LightGBM diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index f13a5fff966f..53a501cd149b 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -152,7 +152,7 @@ class FeatureGroup { } /*! \brief Destructor */ - ~FeatureGroup() {} + virtual ~FeatureGroup() {} /*! 
* \brief Load the overall definition of the feature group from binary serialized data @@ -286,7 +286,10 @@ class FeatureGroup { } inline void CopySubrowByCol(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices, int fidx) { + Log::Warning("in CopySubrowByCol"); if (!is_multi_val_) { + Log::Warning("is not multi val"); + Log::Warning("full_feature->bin_data_.get() = %ld", full_feature->bin_data_.get()); bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices); } else { multi_bin_data_[fidx]->CopySubrow(full_feature->multi_bin_data_[fidx].get(), used_indices, num_used_indices); @@ -343,14 +346,14 @@ class FeatureGroup { num_feature_ += other->num_feature_; } - inline BinIterator* SubFeatureIterator(int sub_feature) { + virtual inline BinIterator* SubFeatureIterator(int sub_feature) const { uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin(); if (!is_multi_val_) { uint32_t min_bin = bin_offsets_[sub_feature]; uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1; return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin); } else { - int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1; + int addi = most_freq_bin == 0 ? 0 : 1; uint32_t min_bin = 1; uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi; return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, @@ -373,7 +376,7 @@ class FeatureGroup { } } - inline BinIterator* FeatureGroupIterator() { + virtual inline BinIterator* FeatureGroupIterator() { if (is_multi_val_) { return nullptr; } @@ -581,32 +584,32 @@ class FeatureGroup { } } - private: - void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { + protected: + virtual void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { if (is_multi_val) { multi_bin_data_.clear(); for (int i = 0; i < num_feature_; ++i) { int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 
0 : 1; - if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { - multi_bin_data_.emplace_back(Bin::CreateSparseBin( - num_data, bin_mappers_[i]->num_bin() + addi)); - } else { + // if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { + // multi_bin_data_.emplace_back(Bin::CreateSparseBin( + // num_data, bin_mappers_[i]->num_bin() + addi)); + // } else { multi_bin_data_.emplace_back( Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi)); - } + // } } is_multi_val_ = true; } else { - if (force_sparse || - (!force_dense && num_feature_ == 1 && - bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { - is_sparse_ = true; - bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_)); - } else { + // if (force_sparse || + // (!force_dense && num_feature_ == 1 && + // bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { + // is_sparse_ = true; + // bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_)); + // } else { is_sparse_ = false; bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_)); - } - is_multi_val_ = false; + // } + // is_multi_val_ = false; } } diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index ad188dc39676..88e96e463adb 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -37,6 +37,17 @@ class ObjectiveFunction { virtual void GetGradients(const double* score, score_t* gradients, score_t* hessians) const = 0; + /*! 
+ * \brief calculating first order derivative of loss function, used only for bagging by query in lambdarank + * \param score prediction score in this round + * \param num_sampled_queries number of in-bag queries + * \param sampled_query_indices indices of in-bag queries + * \param gradients Output gradients + * \param hessians Output hessians + */ + virtual void GetGradients(const double* score, const data_size_t /*num_sampled_queries*/, const data_size_t* /*sampled_query_indices*/, + score_t* gradients, score_t* hessians) const { GetGradients(score, gradients, hessians); } + virtual const char* GetName() const = 0; virtual bool IsConstantHessian() const { return false; } @@ -108,8 +119,20 @@ class ObjectiveFunction { virtual bool NeedConvertOutputCUDA () const { return false; } #endif // USE_CUDA + + virtual void SetDataIndices(const data_size_t* used_data_indices) const { used_data_indices_ = used_data_indices; } + + private: + mutable const data_size_t* used_data_indices_ = nullptr; }; +void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointwise, const double* score_pairwise, data_size_t cnt_pointwise, + data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, + const std::multimap& right2left_map, const std::multimap < data_size_t, data_size_t>& left2right_map, + const std::map, data_size_t>& left_right2pair_map, + int truncation_level, double sigma, const CommonC::SigmoidCache& sigmoid_cache, bool model_indirect_comparison, bool model_conditional_rel, + bool indirect_comparison_above_only, bool logarithmic_discounts, bool hard_pairwise_preference); + } // namespace LightGBM #endif // LightGBM_OBJECTIVE_FUNCTION_H_ diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h new file mode 100644 index 000000000000..b08b3c8bbf7d --- /dev/null +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -0,0 +1,154 @@ +/*! 
+ * Copyright (c) 2023 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_ +#define LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_ + +#include +#include +#include +#include + +#include "feature_group.h" + +namespace LightGBM { + +/*! \brief Using to store data and providing some operations on one pairwise feature group for pairwise ranking */ +class PairwiseRankingFeatureGroup: public FeatureGroup { + public: + /*! + * \brief Constructor + * \param num_feature number of features of this group + * \param bin_mappers Bin mapper for features + * \param num_data Total number of data + * \param is_enable_sparse True if enable sparse feature + * \param is_first_or_second_in_pairing Mark whether features in this group belong to the first or second element in the pairing + */ + + PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map); + + /*! + * \brief Constructor from memory when data is present + * \param memory Pointer of memory + * \param num_all_data Number of global data + * \param local_used_indices Local used indices, empty means using all data + * \param group_id Id of group + */ + // PairwiseRankingFeatureGroup(const void* memory, + // data_size_t num_all_data, + // const std::vector& local_used_indices, + // int group_id) { + // // TODO(shiyu1994) + // } + + // /*! + // * \brief Constructor from definition in memory (without data) + // * \param memory Pointer of memory + // * \param local_used_indices Local used indices, empty means using all data + // */ + // PairwiseRankingFeatureGroup(const void* memory, data_size_t num_data, int group_id): FeatureGroup(memory, num_data, group_id) { + // // TODO(shiyu1994) + // } + + /*! \brief Destructor */ + ~PairwiseRankingFeatureGroup() {} + + /*! 
+ * \brief Load the overall definition of the feature group from binary serialized data + * \param memory Pointer of memory + * \param group_id Id of group + */ + const char* LoadDefinitionFromMemory(const void* /*memory*/, int /*group_id*/) { + // TODO(shiyu1994) + return nullptr; + } + + inline BinIterator* SubFeatureIterator(int /*sub_feature*/) { + // TODO(shiyu1994) + return nullptr; + } + + inline void FinishLoad() { + CHECK(!is_multi_val_); + bin_data_->FinishLoad(); + } + + inline BinIterator* FeatureGroupIterator() { + if (is_multi_val_) { + return nullptr; + } + uint32_t min_bin = bin_offsets_[0]; + uint32_t max_bin = bin_offsets_.back() - 1; + uint32_t most_freq_bin = 0; + return bin_data_->GetUnpairedIterator(min_bin, max_bin, most_freq_bin); + } + + /*! + * \brief Push one record, will auto convert to bin and push to bin data + * \param tid Thread id + * \param sub_feature_idx Index of the subfeature + * \param line_idx Index of record + * \param bin feature bin value of record + */ + inline void PushBinData(int tid, int sub_feature_idx, data_size_t line_idx, uint32_t bin) { + if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { + return; + } + if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) { + bin -= 1; + } + if (is_multi_val_) { + multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1); + } else { + bin += bin_offsets_[sub_feature_idx]; + bin_data_->Push(tid, line_idx, bin); + } + } + + protected: + void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override; + + /*! \brief Pairwise data index to original data indices for ranking with pairwise features */ + const std::pair* paired_ranking_item_index_map_; + /*! \brief Number of pairwise data */ + data_size_t num_data_; + /*! \brief Mark whether features in this group belong to the first or second element in the pairing */ + const int is_first_or_second_in_pairing_; +}; + + +/*! 
\brief One differential feature group in pairwise ranking */ +class PairwiseRankingDifferentialFeatureGroup: public PairwiseRankingFeatureGroup { + public: + /*! + * \brief Constructor + * \param num_feature number of features of this group + * \param bin_mappers Bin mapper for features + * \param num_data Total number of data + * \param is_enable_sparse True if enable sparse feature + * \param is_first_or_second_in_pairing Mark whether features in this group belong to the first or second element in the pairing + */ + + PairwiseRankingDifferentialFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map, std::vector>& diff_feature_bin_mappers, std::vector>& ori_feature_bin_mappers); + + virtual inline BinIterator* SubFeatureIterator(int sub_feature) const override; + + virtual inline BinIterator* FeatureGroupIterator() override; + + /*! \brief Destructor */ + ~PairwiseRankingDifferentialFeatureGroup() {} + + private: + void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override; + + std::vector> diff_feature_bin_mappers_; + std::vector> ori_feature_bin_mappers_; + std::vector original_bin_offsets_; +}; + + +} // namespace LightGBM + +#endif // LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_ diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index 4ea5cfc5f436..d2c26877c8ee 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -55,6 +55,10 @@ class SampleStrategy { bool NeedResizeGradients() const { return need_resize_gradients_; } + virtual data_size_t num_sampled_queries() const { return 0; } + + virtual const data_size_t* sampled_query_indices() const { return nullptr; } + protected: const Config* config_; const Dataset* train_data_; diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 6c3ebf5d0096..309eba7979ad 100644 --- 
a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -1255,6 +1255,54 @@ inline static std::string ArrayToString(const std::vector& arr, size_t n) { return str_buf.str(); } +class SigmoidCache { +public: + SigmoidCache(){} + + void Init(double sigmoid) { + sigmoid_ = sigmoid; + // get boundary + min_sigmoid_input_ = min_sigmoid_input_ / sigmoid_ / 2; + max_sigmoid_input_ = -min_sigmoid_input_; + sigmoid_table_.resize(_sigmoid_bins); + // get score to bin factor + sigmoid_table_idx_factor_ = + _sigmoid_bins / (max_sigmoid_input_ - min_sigmoid_input_); + // cache + for (size_t i = 0; i < _sigmoid_bins; ++i) { + const double score = i / sigmoid_table_idx_factor_ + min_sigmoid_input_; + sigmoid_table_[i] = 1.0f / (1.0f + std::exp(score * sigmoid_)); + } + } + + double compute(double score) const { + if (score <= min_sigmoid_input_) { + // too small, use lower bound + return sigmoid_table_[0]; + } + else if (score >= max_sigmoid_input_) { + // too large, use upper bound + return sigmoid_table_[_sigmoid_bins - 1]; + } + else { + return sigmoid_table_[static_cast<size_t>((score - min_sigmoid_input_) * + sigmoid_table_idx_factor_)]; + } + } +private: + /*! \brief Sigmoid param */ + double sigmoid_; + /*! \brief Cache result for sigmoid transform to speed up */ + std::vector<double> sigmoid_table_; + /*! \brief Number of bins in sigmoid table */ + size_t _sigmoid_bins = 1024 * 1024; + /*! \brief Minimal input of sigmoid table */ + double min_sigmoid_input_ = -50; + /*! \brief Maximal input of Sigmoid table */ + double max_sigmoid_input_ = 50; + /*! 
\brief Factor that convert score to bin in Sigmoid table */ + double sigmoid_table_idx_factor_; +}; } // namespace CommonC diff --git a/src/application/application.cpp b/src/application/application.cpp index 42f707f0c801..f7ba45f139c2 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -118,6 +118,14 @@ void Application::LoadData() { train_data_->SaveBinaryFile(nullptr); } // create training metric + const Dataset* ref_train_data = nullptr; + if (config_.objective == std::string("pairwise_lambdarank")) { + ref_train_data = train_data_.release(); + train_data_.reset(new Dataset()); + train_data_->CreatePairWiseRankingData(ref_train_data, false, config_); + } else { + ref_train_data = train_data_.get(); + } if (config_.is_provide_training_metric) { for (auto metric_type : config_.metric) { auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_)); @@ -138,7 +146,12 @@ void Application::LoadData() { auto new_dataset = std::unique_ptr<Dataset>( dataset_loader.LoadFromFileAlignWithOtherDataset( config_.valid[i].c_str(), - train_data_.get())); + ref_train_data)); + if (config_.objective == std::string("pairwise_lambdarank")) { + const Dataset* original_dataset = new_dataset.release(); + new_dataset.reset(new Dataset()); + new_dataset->CreatePairWiseRankingData(original_dataset, true, config_); + } valid_datas_.push_back(std::move(new_dataset)); // need save binary file if (config_.save_binary) { diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 4c2c81553e7c..f50c7f3160c5 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -7,6 +7,7 @@ #define LIGHTGBM_BOOSTING_BAGGING_HPP_ #include +#include namespace LightGBM { @@ -17,8 +18,15 @@ class BaggingSampleStrategy : public SampleStrategy { config_ = config; train_data_ = train_data; num_data_ = train_data->num_data(); + num_queries_ = train_data->metadata().num_queries(); + if (config->objective == std::string("pairwise_lambdarank")) { + 
query_boundaries_ = train_data->metadata().pairwise_query_boundaries(); + } else { + query_boundaries_ = train_data->metadata().query_boundaries(); + } objective_function_ = objective_function; num_tree_per_iteration_ = num_tree_per_iteration; + num_threads_ = OMP_NUM_THREADS(); } ~BaggingSampleStrategy() {} @@ -27,9 +35,10 @@ class BaggingSampleStrategy : public SampleStrategy { Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || - need_re_bagging_) { + need_re_bagging_) { need_re_bagging_ = false; - auto left_cnt = bagging_runner_.Run( + if (!config_->bagging_by_query) { + auto left_cnt = bagging_runner_.Run( num_data_, [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, data_size_t*) { @@ -43,7 +52,60 @@ class BaggingSampleStrategy : public SampleStrategy { return cur_left_count; }, bag_data_indices_.data()); - bag_data_cnt_ = left_cnt; + bag_data_cnt_ = left_cnt; + } else { + num_sampled_queries_ = bagging_runner_.Run( + num_queries_, + [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, + data_size_t*) { + data_size_t cur_left_count = 0; + cur_left_count = BaggingHelper(cur_start, cur_cnt, left); + return cur_left_count; + }, bag_query_indices_.data()); + + sampled_query_boundaries_[0] = 0; + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (data_size_t i = 0; i < num_queries_; ++i) { + OMP_LOOP_EX_BEGIN(); + sampled_query_boundaries_[i + 1] = query_boundaries_[bag_query_indices_[i] + 1] - query_boundaries_[bag_query_indices_[i]]; + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + + const int num_blocks = Threading::For(0, num_queries_ + 1, 128, [this](int thread_index, data_size_t start_index, data_size_t end_index) { + for (data_size_t i = start_index + 1; i < end_index; ++i) { + sampled_query_boundaries_[i] += sampled_query_boundaries_[i - 1]; + } + 
sampled_query_boundaires_thread_buffer_[thread_index] = sampled_query_boundaries_[end_index - 1]; + }); + + for (int thread_index = 1; thread_index < num_blocks; ++thread_index) { + sampled_query_boundaires_thread_buffer_[thread_index] += sampled_query_boundaires_thread_buffer_[thread_index - 1]; + } + + Threading::For(0, num_queries_ + 1, 128, [this](int thread_index, data_size_t start_index, data_size_t end_index) { + if (thread_index > 0) { + for (data_size_t i = start_index; i < end_index; ++i) { + sampled_query_boundaries_[i] += sampled_query_boundaires_thread_buffer_[thread_index - 1]; + } + } + }); + + bag_data_cnt_ = sampled_query_boundaries_[num_sampled_queries_]; + + Threading::For(0, num_queries_, 1, [this](int /*thread_index*/, data_size_t start_index, data_size_t end_index) { + for (data_size_t sampled_query_id = start_index; sampled_query_id < end_index; ++sampled_query_id) { + const data_size_t query_index = bag_query_indices_[sampled_query_id]; + const data_size_t data_index_start = query_boundaries_[query_index]; + const data_size_t data_index_end = query_boundaries_[query_index + 1]; + const data_size_t sampled_query_start = sampled_query_boundaries_[sampled_query_id]; + for (data_size_t i = data_index_start; i < data_index_end; ++i) { + bag_data_indices_[sampled_query_start + i - data_index_start] = i; + } + } + }); + } Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner if (!is_use_subset_) { @@ -60,6 +122,7 @@ class BaggingSampleStrategy : public SampleStrategy { } else { // get subset tmp_subset_->ReSize(bag_data_cnt_); + Log::Warning("bag_data_indices_.size() = %ld, bag_data_cnt_ = %d", bag_data_indices_.size(), bag_data_cnt_); tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); #ifdef USE_CUDA @@ -108,7 +171,14 @@ class BaggingSampleStrategy : public SampleStrategy { cuda_bag_data_indices_.Resize(num_data_); } #endif // USE_CUDA - 
bagging_runner_.ReSize(num_data_); + if (!config_->bagging_by_query) { + bagging_runner_.ReSize(num_data_); + } else { + bagging_runner_.ReSize(num_queries_); + sampled_query_boundaries_.resize(num_queries_ + 1, 0); + sampled_query_boundaires_thread_buffer_.resize(num_threads_, 0); + bag_query_indices_.resize(num_data_); + } bagging_rands_.clear(); for (int i = 0; i < (num_data_ + bagging_rand_block_ - 1) / bagging_rand_block_; ++i) { @@ -118,9 +188,11 @@ class BaggingSampleStrategy : public SampleStrategy { double average_bag_rate = (static_cast(bag_data_cnt_) / num_data_) / config_->bagging_freq; is_use_subset_ = false; - if (config_->device_type != std::string("cuda")) { - const int group_threshold_usesubset = 100; + if (config_->device_type != std::string("cuda") && !config_->bagging_by_query) { + const int group_threshold_usesubset = 200; const double average_bag_rate_threshold = 0.5; + Log::Warning("train_data_->num_feature_groups() = %d", train_data_->num_feature_groups()); + Log::Warning("average_bag_rate = %f", average_bag_rate); if (average_bag_rate <= average_bag_rate_threshold && (train_data_->num_feature_groups() < group_threshold_usesubset)) { if (tmp_subset_ == nullptr || is_change_dataset) { @@ -153,6 +225,14 @@ class BaggingSampleStrategy : public SampleStrategy { return false; } + data_size_t num_sampled_queries() const override { + return num_sampled_queries_; + } + + const data_size_t* sampled_query_indices() const override { + return bag_query_indices_.data(); + } + private: data_size_t BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { if (cnt <= 0) { @@ -202,6 +282,20 @@ class BaggingSampleStrategy : public SampleStrategy { /*! \brief whether need restart bagging in continued training */ bool need_re_bagging_; + /*! \brief number of threads */ + int num_threads_; + /*! \brief query boundaries of the in-bag queries */ + std::vector sampled_query_boundaries_; + /*! 
\brief buffer for calculating sampled_query_boundaries_ */ + std::vector sampled_query_boundaires_thread_buffer_; + /*! \brief in-bag query indices */ + std::vector> bag_query_indices_; + /*! \brief number of queries in the training dataset */ + data_size_t num_queries_; + /*! \brief number of in-bag queries */ + data_size_t num_sampled_queries_; + /*! \brief query boundaries of the whole training dataset */ + const data_size_t* query_boundaries_; }; } // namespace LightGBM diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 937b44fcc8aa..c22ecfc561ba 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -224,8 +224,14 @@ void GBDT::Boosting() { } // objective function will calculate gradients and hessians int64_t num_score = 0; - objective_function_-> - GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_); + if (config_->bagging_by_query) { + data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); + objective_function_-> + GetGradients(GetTrainingScore(&num_score), data_sample_strategy_->num_sampled_queries(), data_sample_strategy_->sampled_query_indices(), gradients_pointer_, hessians_pointer_); + } else { + objective_function_-> + GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_); + } } void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { @@ -337,12 +343,15 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer); + Log::Warning("TrainOneIter step -10"); std::vector init_scores(num_tree_per_iteration_, 0.0); // boosting first if (gradients == nullptr || hessians == nullptr) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); } + data_sample_strategy_->Bagging(iter_, 
tree_learner_.get(), gradients_.data(), hessians_.data()); + objective_function_->SetDataIndices(data_sample_strategy_->bag_data_indices().data()); Boosting(); gradients = gradients_pointer_; hessians = hessians_pointer_; @@ -365,8 +374,11 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } } + Log::Warning("TrainOneIter step -9"); // bagging logic - data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); + if (!config_->bagging_by_query) { + data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); + } const bool is_use_subset = data_sample_strategy_->is_use_subset(); const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); @@ -375,6 +387,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { ResetGradientBuffers(); } + Log::Warning("TrainOneIter step -8"); bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { const size_t offset = static_cast(cur_tree_id) * num_data_; @@ -392,9 +405,12 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { hess = hessians_pointer_ + offset; } bool is_first_tree = models_.size() < static_cast(num_tree_per_iteration_); + Log::Warning("TrainOneIter step -7"); new_tree.reset(tree_learner_->Train(grad, hess, is_first_tree)); } + Log::Warning("TrainOneIter step 0"); + if (new_tree->num_leaves() > 1) { should_continue = true; auto score_ptr = train_score_updater_->score() + offset; @@ -414,18 +430,24 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { if (objective_function_ != nullptr && !config_->boost_from_average && !train_score_updater_->has_init_score()) { init_scores[cur_tree_id] = ObtainAutomaticInitialScore(objective_function_, cur_tree_id); // updates scores + Log::Warning("TrainOneIter 
step 0.1"); train_score_updater_->AddScore(init_scores[cur_tree_id], cur_tree_id); + Log::Warning("TrainOneIter step 0.2"); for (auto& score_updater : valid_score_updater_) { score_updater->AddScore(init_scores[cur_tree_id], cur_tree_id); } + Log::Warning("TrainOneIter step 0.3"); } new_tree->AsConstantTree(init_scores[cur_tree_id]); + Log::Warning("TrainOneIter step 0.4"); } } // add model models_.push_back(std::move(new_tree)); } + Log::Warning("TrainOneIter step 1"); + if (!should_continue) { Log::Warning("Stopped training because there are no more leaves that meet the split requirements"); if (models_.size() > static_cast(num_tree_per_iteration_)) { @@ -436,6 +458,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { return true; } + Log::Warning("TrainOneIter step 2"); ++iter_; return false; } @@ -480,7 +503,9 @@ bool GBDT::EvalAndCheckEarlyStopping() { void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer); // update training score + Log::Warning("before update score 0"); if (!data_sample_strategy_->is_use_subset()) { + Log::Warning("before update score 1"); train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id); const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); @@ -496,16 +521,20 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { } #endif // USE_CUDA } + Log::Warning("before update score 2"); } else { + Log::Warning("before update score 3"); train_score_updater_->AddScore(tree, cur_tree_id); } + Log::Warning("before update score 4"); // update validation score for (auto& score_updater : valid_score_updater_) { score_updater->AddScore(tree, cur_tree_id); } + Log::Warning("before update score 5"); } #ifdef USE_CUDA diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 3d84599e6589..326df0ec5f79 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -15,8 +15,10 @@ #include "dense_bin.hpp" #include 
"multi_val_dense_bin.hpp" +#include "multi_val_pairwise_lambdarank_bin.hpp" #include "multi_val_sparse_bin.hpp" #include "sparse_bin.hpp" +#include "pairwise_lambdarank_bin.hpp" namespace LightGBM { @@ -632,21 +634,94 @@ namespace LightGBM { } } + Bin* Bin::CreateDensePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { + if (num_bin <= 16) { + return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + } else if (num_bin <= 256) { + return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + } else if (num_bin <= 65536) { + return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + } else { + return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + } + } + + Bin* Bin::CreateDensePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { + if (num_bin <= 16) { + return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + } else if (num_bin <= 256) { + return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + } else if (num_bin <= 65536) { + return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + } else { + return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); + } + } + + Bin* Bin::CreateSparsePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { + if (num_bin <= 256) { + return new SparsePairwiseRankingFirstBin(num_pairs, 
paired_ranking_item_index_map, new SparseBin(num_original_data)); + } else if (num_bin <= 65536) { + return new SparsePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + } else { + return new SparsePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + } + } + + Bin* Bin::CreateSparsePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { + if (num_bin <= 256) { + return new SparsePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + } else if (num_bin <= 65536) { + return new SparsePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + } else { + return new SparsePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); + } + } + + Bin* Bin::CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets) { + if (num_bin <= 16) { + return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); + } else if (num_bin <= 256) { + return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); + } else if (num_bin <= 65536) { + return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); + } else { + return new DensePairwiseRankingDiffBin(num_pairs, 
paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); + } + } + + Bin* Bin::CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets) { + if (num_bin <= 256) { + return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); + } else if (num_bin <= 65536) { + return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); + } else { + return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets); + } + } + MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, - double sparse_rate, const std::vector& offsets) { + double sparse_rate, const std::vector& offsets, const bool use_pairwise_ranking, const std::pair* paired_ranking_item_global_index_map) { if (sparse_rate >= multi_val_bin_sparse_threshold) { const double average_element_per_row = (1.0 - sparse_rate) * num_feature; - return CreateMultiValSparseBin(num_data, num_bin, - average_element_per_row); + // if (use_pairwise_ranking) { + Log::Warning("Pairwise ranking with sparse row-wse bins is not supported yet."); + return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets, use_pairwise_ranking, paired_ranking_item_global_index_map); + // } else { + // return CreateMultiValSparseBin(num_data, num_bin, + // average_element_per_row, use_pairwise_ranking, paired_ranking_item_global_index_map); + // } } 
else { - return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets); + return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets, use_pairwise_ranking, paired_ranking_item_global_index_map); } } MultiValBin* MultiValBin::CreateMultiValDenseBin(data_size_t num_data, int num_bin, int num_feature, - const std::vector& offsets) { + const std::vector& offsets, + const bool use_pairwise_ranking, + const std::pair* paired_ranking_item_global_index_map) { // calculate max bin of all features to select the int type in MultiValDenseBin int max_bin = 0; for (int i = 0; i < static_cast(offsets.size()) - 1; ++i) { @@ -656,17 +731,31 @@ namespace LightGBM { } } if (max_bin <= 256) { - return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + if (use_pairwise_ranking) { + return new MultiValDensePairwiseLambdarankBin(num_data, num_bin, num_feature, offsets, paired_ranking_item_global_index_map); + } else { + return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + } } else if (max_bin <= 65536) { - return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + if (use_pairwise_ranking) { + return new MultiValDensePairwiseLambdarankBin(num_data, num_bin, num_feature, offsets, paired_ranking_item_global_index_map); + } else { + return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + } } else { - return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + if (use_pairwise_ranking) { + return new MultiValDensePairwiseLambdarankBin(num_data, num_bin, num_feature, offsets, paired_ranking_item_global_index_map); + } else { + return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); + } } } MultiValBin* MultiValBin::CreateMultiValSparseBin(data_size_t num_data, int num_bin, - double estimate_element_per_row) { + double estimate_element_per_row, + const bool /*use_pairwise_ranking*/, + const std::pair* /*paired_ranking_item_global_index_map*/) { size_t estimate_total_entries = 
static_cast(estimate_element_per_row * 1.1 * num_data); if (estimate_total_entries <= std::numeric_limits::max()) { diff --git a/src/io/config.cpp b/src/io/config.cpp index c63de70fc16b..20d327ca2edb 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -466,6 +466,11 @@ void Config::CheckParamConflict(const std::unordered_map& Config::parameter_set() { "neg_bagging_fraction", "bagging_freq", "bagging_seed", + "bagging_by_query", "feature_fraction", "feature_fraction_bynode", "feature_fraction_seed", @@ -306,6 +307,14 @@ const std::unordered_set& Config::parameter_set() { "lambdarank_norm", "label_gain", "lambdarank_position_bias_regularization", + "use_differential_feature_in_pairwise_ranking", + "pairwise_lambdarank_model_indirect_comparison", + "pairwise_lambdarank_model_conditional_rel", + "pairwise_lambdarank_indirect_comparison_above_only", + "pairwise_lambdarank_logarithmic_discounts", + "pairwise_lambdarank_hard_pairwise_preference", + "pairwise_lambdarank_train_pairing_approach", + "pairwise_lambdarank_valid_pairing_approach", "metric", "metric_freq", "is_provide_training_metric", @@ -377,6 +386,8 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet {"neg_bagging_fraction", {"neg_sub_row", "neg_subsample", "neg_bagging"}}, {"bagging_freq", {"subsample_freq"}}, {"bagging_seed", {"bagging_fraction_seed"}}, + {"bagging_by_query", {}}, {"feature_fraction", {"sub_feature", "colsample_bytree"}}, {"feature_fraction_bynode", {"sub_feature_bynode", "colsample_bynode"}}, {"feature_fraction_seed", {}}, @@ -911,6 +948,14 @@ const std::unordered_map>& Config::paramet {"lambdarank_norm", {}}, {"label_gain", {}}, {"lambdarank_position_bias_regularization", {}}, + {"use_differential_feature_in_pairwise_ranking", {}}, + {"pairwise_lambdarank_model_indirect_comparison", {}}, + {"pairwise_lambdarank_model_conditional_rel", {}}, + {"pairwise_lambdarank_indirect_comparison_above_only", {}}, + {"pairwise_lambdarank_logarithmic_discounts", {}}, + 
{"pairwise_lambdarank_hard_pairwise_preference", {}}, + {"pairwise_lambdarank_train_pairing_approach", {}}, + {"pairwise_lambdarank_valid_pairing_approach", {}}, {"metric", {"metrics", "metric_types"}}, {"metric_freq", {"output_freq"}}, {"is_provide_training_metric", {"training_metric", "is_training_metric", "train_metric"}}, @@ -957,6 +1002,7 @@ const std::unordered_map& Config::ParameterTypes() { {"neg_bagging_fraction", "double"}, {"bagging_freq", "int"}, {"bagging_seed", "int"}, + {"bagging_by_query", "bool"}, {"feature_fraction", "double"}, {"feature_fraction_bynode", "double"}, {"feature_fraction_seed", "int"}, @@ -1055,6 +1101,14 @@ const std::unordered_map& Config::ParameterTypes() { {"lambdarank_norm", "bool"}, {"label_gain", "vector"}, {"lambdarank_position_bias_regularization", "double"}, + {"use_differential_feature_in_pairwise_ranking", "bool"}, + {"pairwise_lambdarank_model_indirect_comparison", "bool"}, + {"pairwise_lambdarank_model_conditional_rel", "bool"}, + {"pairwise_lambdarank_indirect_comparison_above_only", "bool"}, + {"pairwise_lambdarank_logarithmic_discounts", "bool"}, + {"pairwise_lambdarank_hard_pairwise_preference", "bool"}, + {"pairwise_lambdarank_train_pairing_approach", "string"}, + {"pairwise_lambdarank_valid_pairing_approach", "string"}, {"metric", "vector"}, {"metric_freq", "int"}, {"is_provide_training_metric", "bool"}, diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 01687d95c747..358e4c54a8b3 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -17,6 +18,8 @@ #include #include +#include + namespace LightGBM { const int Dataset::kSerializedReferenceVersionLength = 2; @@ -279,6 +282,7 @@ std::vector> FastFeatureBundling( } std::vector> tmp_indices; + std::vector tmp_indices_ptr(num_sample_col, nullptr); std::vector tmp_num_per_col(num_sample_col, 0); for (auto fidx : used_features) { if (fidx >= num_sample_col) { @@ -290,18 +294,19 @@ 
std::vector> FastFeatureBundling( if (!ret.empty()) { tmp_indices.push_back(ret); tmp_num_per_col[fidx] = static_cast(ret.size()); - sample_indices[fidx] = tmp_indices.back().data(); + tmp_indices_ptr[fidx] = tmp_indices.back().data(); } else { tmp_num_per_col[fidx] = num_per_col[fidx]; + tmp_indices_ptr[fidx] = sample_indices[fidx]; } } std::vector group_is_multi_val, group_is_multi_val2; auto features_in_group = - FindGroups(bin_mappers, used_features, sample_indices, + FindGroups(bin_mappers, used_features, tmp_indices_ptr.data(), tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, is_sparse, &group_is_multi_val); auto group2 = - FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, + FindGroups(bin_mappers, feature_order_by_cnt, tmp_indices_ptr.data(), tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, is_sparse, &group_is_multi_val2); @@ -352,12 +357,17 @@ void Dataset::Construct(std::vector>* bin_mappers, auto is_sparse = io_config.is_enable_sparse; if (io_config.device_type == std::string("cuda")) { LGBM_config_::current_device = lgbm_device_cuda; - if ((io_config.device_type == std::string("cuda")) && is_sparse) { + if (is_sparse) { Log::Warning("Using sparse features with CUDA is currently not supported."); is_sparse = false; } + } else if ((io_config.objective == std::string("pairwise_lambdarank")) && is_sparse) { + Log::Warning("Using sparse features with pairwise_lambdarank is currently not supported."); + is_sparse = false; } + is_sparse = false; + std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); @@ -438,6 +448,26 @@ void Dataset::Construct(std::vector>* bin_mappers, } device_type_ = io_config.device_type; gpu_device_id_ = io_config.gpu_device_id; + + if (io_config.objective == 
std::string("pairwise_lambdarank")) { + // store sampled values for constructing differential features + const int num_threads = OMP_NUM_THREADS(); + sampled_values_.reset(new std::vector>()); + sampled_indices_.reset(new std::vector>()); + sampled_values_->resize(static_cast(num_sample_col)); + sampled_indices_->resize(static_cast(num_sample_col)); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int col_idx = 0; col_idx < num_sample_col; ++col_idx) { + const int num_samples_in_col = num_per_col[col_idx]; + sampled_values_->at(col_idx).resize(num_samples_in_col); + sampled_indices_->at(col_idx).resize(num_samples_in_col); + for (int i = 0; i < num_samples_in_col; ++i) { + sampled_values_->at(col_idx)[i] = sample_values[col_idx][i]; + sampled_indices_->at(col_idx)[i] = sample_non_zero_indices[col_idx][i]; + } + } + num_total_sampled_data_ = static_cast(total_sample_cnt); + } } void Dataset::FinishLoad() { @@ -469,7 +499,9 @@ void PushDataToMultiValBin( MultiValBin* ret) { Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin", global_timer); + Log::Warning("num_data = %d", num_data); if (ret->IsSparse()) { + // Log::Fatal("pairwise ranking with sparse multi val bin is not supported."); Threading::For( 0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) { std::vector cur_data; @@ -499,6 +531,7 @@ void PushDataToMultiValBin( 0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) { std::vector cur_data(most_freq_bins.size(), 0); for (size_t j = 0; j < most_freq_bins.size(); ++j) { + //Log::Warning("(*iters)[%d].size() = %d, j = %d, start = %d", tid, (*iters)[tid].size(), j, start); (*iters)[tid][j]->Reset(start); } for (data_size_t i = start; i < end; ++i) { @@ -513,7 +546,7 @@ void PushDataToMultiValBin( } } -MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector& offsets) const { +MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector& offsets, const bool 
use_pairwise_ranking) const { Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures", global_timer); int multi_group_id = -1; @@ -551,13 +584,13 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector& sum_sparse_rate); std::unique_ptr ret; ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(), - num_feature, sum_sparse_rate, offsets)); + num_feature, sum_sparse_rate, offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map())); PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get()); ret->FinishLoad(); return ret.release(); } -MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& offsets) const { +MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& offsets, const bool use_pairwise_ranking) const { Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures", global_timer); int num_threads = OMP_NUM_THREADS(); @@ -600,11 +633,57 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of CHECK(static_cast(most_freq_bins.size()) == ncol); Log::Debug("Dataset::GetMultiBinFromAllFeatures: sparse rate %f", 1.0 - sum_dense_ratio); - ret.reset(MultiValBin::CreateMultiValBin( - num_data_, offsets.back(), static_cast(most_freq_bins.size()), - 1.0 - sum_dense_ratio, offsets)); - PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get()); + if (use_pairwise_ranking) { + + // for (size_t i = 0; i < iters.size(); ++i) { + // for (size_t j = 0; j < iters[i].size(); ++j) { + // Log::Warning("i = %ld, j = %ld, iters[i][j] = %d", i, j, static_cast(iters[i][j] == nullptr)); + // } + // } + + Log::Warning("most_freq_bins.size() = %d, num_groups_ = %d, num_used_differential_features_ = %d, num_used_differential_groups_ = %d, ncol = %d", static_cast(most_freq_bins.size()), num_groups_, num_used_differential_features_, num_used_differential_groups_, ncol); + + const int num_original_features = (static_cast(most_freq_bins.size()) 
- num_used_differential_groups_) / 2; + std::vector original_most_freq_bins; + std::vector original_offsets; + for (int i = 0; i < num_original_features; ++i) { + original_most_freq_bins.push_back(most_freq_bins[i]); + original_offsets.push_back(offsets[i]); + } + original_offsets.push_back(offsets[num_original_features]); + std::ofstream fout("mutli_val_bin_meta_info_pairwise.txt"); + fout << "original_most_freq_bins" << std::endl; + for (size_t i = 0; i < original_most_freq_bins.size(); ++i) { + fout << original_most_freq_bins[i] << std::endl; + } + fout << "original_offsets" << std::endl; + for (size_t i = 0; i < original_offsets.size(); ++i) { + fout << original_offsets[i] << std::endl; + } + fout.close(); + const data_size_t num_original_data = metadata_.query_boundaries()[metadata_.num_queries()]; + ret.reset(MultiValBin::CreateMultiValBin( + num_original_data, offsets.back(), num_original_features, + 1.0 - sum_dense_ratio, original_offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map())); + PushDataToMultiValBin(num_original_data, original_most_freq_bins, original_offsets, &iters, ret.get()); + } else { + ret.reset(MultiValBin::CreateMultiValBin( + num_data_, offsets.back(), static_cast(most_freq_bins.size()), + 1.0 - sum_dense_ratio, offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map())); + PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get()); + std::ofstream fout("mutli_val_bin_meta_info_no_pairwise.txt"); + fout << "original_most_freq_bins" << std::endl; + for (size_t i = 0; i < most_freq_bins.size(); ++i) { + fout << most_freq_bins[i] << std::endl; + } + fout << "original_offsets" << std::endl; + for (size_t i = 0; i < offsets.size(); ++i) { + fout << offsets[i] << std::endl; + } + fout.close(); + } ret->FinishLoad(); + ret->DumpContent(); return ret.release(); } @@ -613,7 +692,8 @@ TrainingShareStates* Dataset::GetShareStates( score_t* gradients, score_t* hessians, const 
std::vector& is_feature_used, bool is_constant_hessian, bool force_col_wise, bool force_row_wise, - const int num_grad_quant_bins) const { + const int num_grad_quant_bins, + const bool use_pairwise_ranking) const { Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod", global_timer); if (force_col_wise && force_row_wise) { @@ -632,7 +712,8 @@ TrainingShareStates* Dataset::GetShareStates( std::vector offsets; share_state->CalcBinOffsets( feature_groups_, &offsets, true); - share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets), + Log::Warning("feature_groups_.size() = %ld, offsets.size() = %ld", feature_groups_.size(), offsets.size()); + share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets, use_pairwise_ranking), num_data_, feature_groups_, false, true, num_grad_quant_bins); share_state->is_col_wise = true; share_state->is_constant_hessian = is_constant_hessian; @@ -642,7 +723,8 @@ TrainingShareStates* Dataset::GetShareStates( std::vector offsets; share_state->CalcBinOffsets( feature_groups_, &offsets, false); - share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_, + Log::Warning("feature_groups_.size() = %ld, offsets.size() = %ld", feature_groups_.size(), offsets.size()); + share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets, use_pairwise_ranking), num_data_, feature_groups_, false, false, num_grad_quant_bins); share_state->is_col_wise = false; share_state->is_constant_hessian = is_constant_hessian; @@ -659,14 +741,14 @@ TrainingShareStates* Dataset::GetShareStates( auto start_time = std::chrono::steady_clock::now(); std::vector col_wise_offsets; col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true); - col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_, + col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets, use_pairwise_ranking), num_data_, feature_groups_, false, true, num_grad_quant_bins); col_wise_init_time = 
std::chrono::steady_clock::now() - start_time; start_time = std::chrono::steady_clock::now(); std::vector row_wise_offsets; row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false); - row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_, + row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets, use_pairwise_ranking), num_data_, feature_groups_, false, false, num_grad_quant_bins); row_wise_init_time = std::chrono::steady_clock::now() - start_time; @@ -727,19 +809,22 @@ template TrainingShareStates* Dataset::GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, bool force_col_wise, bool force_row_wise, - const int num_grad_quant_bins) const; + const int num_grad_quant_bins, + const bool use_pairwise_ranking) const; template TrainingShareStates* Dataset::GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, bool force_col_wise, bool force_row_wise, - const int num_grad_quant_bins) const; + const int num_grad_quant_bins, + const bool use_pairwise_ranking) const; template TrainingShareStates* Dataset::GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, bool force_col_wise, bool force_row_wise, - const int num_grad_quant_bins) const; + const int num_grad_quant_bins, + const bool use_pairwise_ranking) const; void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { feature_groups_.clear(); @@ -819,6 +904,234 @@ void Dataset::CreateValid(const Dataset* dataset) { gpu_device_id_ = dataset->gpu_device_id_; } +void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation, const Config& config) { + const std::string& pairing_approach = is_validation ? 
config.pairwise_lambdarank_valid_pairing_approach : config.pairwise_lambdarank_train_pairing_approach; + num_data_ = metadata_.BuildPairwiseFeatureRanking(dataset->metadata(), pairing_approach); + + feature_groups_.clear(); + num_features_ = dataset->num_features_ * 2; + num_groups_ = dataset->num_groups_ * 2; + max_bin_ = dataset->max_bin_; + min_data_in_bin_ = dataset->min_data_in_bin_; + bin_construct_sample_cnt_ = dataset->bin_construct_sample_cnt_; + use_missing_ = dataset->use_missing_; + zero_as_missing_ = dataset->zero_as_missing_; + feature2group_.clear(); + feature2subfeature_.clear(); + has_raw_ = dataset->has_raw(); + numeric_feature_map_ = dataset->numeric_feature_map_; + num_numeric_features_ = dataset->num_numeric_features_; + for (const int nuermic_feature_index : dataset->numeric_feature_map_) { + if (nuermic_feature_index != -1) { + numeric_feature_map_.push_back(num_numeric_features_); + ++num_numeric_features_; + } else { + numeric_feature_map_.push_back(-1); + } + } + // copy feature bin mapper data + feature_need_push_zeros_.clear(); + group_bin_boundaries_.clear(); + uint64_t num_total_bin = 0; + group_bin_boundaries_.push_back(num_total_bin); + group_feature_start_.resize(num_groups_); + group_feature_cnt_.resize(num_groups_); + + sampled_values_ = dataset->sampled_values_; + sampled_indices_ = dataset->sampled_indices_; + num_total_sampled_data_ = dataset->num_total_sampled_data_; + + // create differential features + std::vector> diff_feature_bin_mappers; + std::vector> original_bin_mappers; + std::vector diff_original_feature_index; + if (config.use_differential_feature_in_pairwise_ranking) { + for (int i = 0; i < dataset->num_total_features_; ++i) { + const int inner_feature_index = dataset->InnerFeatureIndex(i); + if (inner_feature_index >= 0) { + original_bin_mappers.emplace_back(new BinMapper(*dataset->FeatureBinMapper(inner_feature_index))); + } else { + original_bin_mappers.emplace_back(nullptr); + } + } + + if (!is_validation) { + 
train_query_boundaries_ = metadata_.query_boundaries(); + train_num_queries_ = metadata_.num_queries(); + } else { + train_query_boundaries_ = dataset->train_query_boundaries_; + train_num_queries_ = dataset->train_num_queries_; + } + // TODO(shiyu1994): verify the difference in training and validation results even when they share the same dataset + CreatePairwiseRankingDifferentialFeatures(*sampled_values_, *sampled_indices_, original_bin_mappers, num_total_sampled_data_, train_query_boundaries_, train_num_queries_, &diff_feature_bin_mappers, &diff_original_feature_index, config); + } + + used_feature_map_.clear(); + used_feature_map_.reserve(2 * dataset->used_feature_map_.size()); + used_feature_map_.insert(used_feature_map_.begin(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end()); + + for (int i = 0; i < dataset->num_total_features_; ++i) { + if (dataset->used_feature_map_[i] != -1) { + used_feature_map_.push_back(dataset->used_feature_map_[i] + dataset->num_features_); + } else { + used_feature_map_.push_back(-1); + } + } + + std::vector used_diff_features; + if (config.use_differential_feature_in_pairwise_ranking) { + for (int diff_feature_index = 0; diff_feature_index < static_cast(diff_feature_bin_mappers.size()); ++diff_feature_index) { + if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { + num_numeric_features_ += 1; + num_features_ += 1; + used_diff_features.push_back(diff_feature_index); + } + } + numeric_feature_map_.resize(num_features_, -1); + used_feature_map_.resize(2 * dataset->num_total_features_ + static_cast(diff_feature_bin_mappers.size()), -1); + } + + const bool is_use_gpu = config.device_type == std::string("cuda") || config.device_type == std::string("gpu"); + std::vector group_is_multi_val; + std::vector> diff_feature_groups = + FindGroups(diff_feature_bin_mappers, used_diff_features, Common::Vector2Ptr(sampled_indices_.get()).data(), Common::VectorSize(*sampled_indices_).data(), 
static_cast(sampled_indices_->size()), num_total_sampled_data_, num_data_, is_use_gpu, false, &group_is_multi_val); + + if (is_validation) { + std::vector> flatten_feature_groups; + for (const auto& features_in_group : diff_feature_groups) { + for (const int feature_index : features_in_group) { + flatten_feature_groups.push_back(std::vector{feature_index}); + } + } + diff_feature_groups = flatten_feature_groups; + } + + int cur_feature_index = 0; + for (int i = 0; i < num_groups_; ++i) { + int original_group_index = i % dataset->num_groups_; + int original_group_feature_start = dataset->group_feature_start_[original_group_index]; + const int is_first_or_second_in_pairing = i / dataset->num_groups_; // 0 for first, 1 for second + group_feature_start_[i] = cur_feature_index; + for (int feature_index_in_group = 0; feature_index_in_group < dataset->group_feature_cnt_[original_group_index]; ++feature_index_in_group) { + const BinMapper* feature_bin_mapper = dataset->FeatureBinMapper(original_group_feature_start + feature_index_in_group); + if (feature_bin_mapper->GetDefaultBin() != feature_bin_mapper->GetMostFreqBin()) { + feature_need_push_zeros_.push_back(cur_feature_index); + } + feature2group_.push_back(i); + feature2subfeature_.push_back(dataset->feature2subfeature_[original_group_feature_start + feature_index_in_group]); + cur_feature_index += 1; + } + feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), dataset->num_data(), is_first_or_second_in_pairing, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_global_index_map())); + num_total_bin += dataset->FeatureGroupNumBin(original_group_index); + group_bin_boundaries_.push_back(num_total_bin); + group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index]; + } + + Log::Warning("cur_feature_index = %d", cur_feature_index); + + num_used_differential_features_ = 0; + num_used_differential_groups_ = 
static_cast(diff_feature_groups.size()); + if (config.use_differential_feature_in_pairwise_ranking) { + for (size_t i = 0; i < diff_feature_groups.size(); ++i) { + const std::vector& features_in_group = diff_feature_groups[i]; + group_feature_start_.push_back(cur_feature_index); + int num_features_in_group = 0; + std::vector> ori_bin_mappers; + std::vector> ori_bin_mappers_for_diff; + std::vector> diff_bin_mappers; + for (size_t j = 0; j < features_in_group.size(); ++j) { + const int diff_feature_index = features_in_group[j]; + if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) { + if (diff_feature_bin_mappers[diff_feature_index]->GetDefaultBin() != diff_feature_bin_mappers[diff_feature_index]->GetMostFreqBin()) { + feature_need_push_zeros_.push_back(cur_feature_index); + } + feature2group_.push_back(i + num_groups_); + feature2subfeature_.push_back(num_features_in_group); + numeric_feature_map_[cur_feature_index] = cur_feature_index; + used_feature_map_[diff_feature_index + dataset->num_total_features_ * 2] = cur_feature_index; + ++cur_feature_index; + ++num_features_in_group; + ++num_used_differential_features_; + const int ori_feature_index = dataset->InnerFeatureIndex(diff_original_feature_index[diff_feature_index]); + ori_bin_mappers.emplace_back(new BinMapper(*dataset->FeatureBinMapper(ori_feature_index))); + ori_bin_mappers_for_diff.emplace_back(new BinMapper(*dataset->FeatureBinMapper(ori_feature_index))); + diff_bin_mappers.emplace_back(new BinMapper(*diff_feature_bin_mappers[diff_feature_index])); + } + } + + FeatureGroup feature_group(num_features_in_group, 0, &ori_bin_mappers, dataset->num_data(), i + num_groups_); + + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int j = 0; j < num_features_in_group; ++j) { + const int tid = omp_get_thread_num(); + const int diff_feature_index = features_in_group[j]; + const int original_feature_index = 
dataset->InnerFeatureIndex(diff_original_feature_index[diff_feature_index]); + const BinMapper* original_feature_bin_mapper = dataset->FeatureBinMapper(original_feature_index); + BinIterator* original_feature_iterator = dataset->FeatureIterator(original_feature_index); + original_feature_iterator->Reset(0); + for (int k = 0; k < dataset->num_data(); ++k) { + feature_group.PushData(tid, j, k, original_feature_bin_mapper->BinToValue(original_feature_iterator->Get(k))); + } + } + + feature_group.FinishLoad(); + + feature_groups_.emplace_back(new PairwiseRankingDifferentialFeatureGroup(feature_group, dataset->num_data(), 2, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_global_index_map(), diff_bin_mappers, ori_bin_mappers_for_diff)); + + group_feature_cnt_.push_back(cur_feature_index - group_feature_start_.back()); + num_total_bin += feature_groups_.back()->num_total_bin_; + group_bin_boundaries_.push_back(num_total_bin); + } + + num_groups_ += static_cast(diff_feature_groups.size()); + } + + Log::Warning("cur_feature_index = %d", cur_feature_index); + + feature_groups_.shrink_to_fit(); + + feature_names_.clear(); + for (const std::string& feature_name : dataset->feature_names_) { + feature_names_.push_back(feature_name + std::string("_i")); + } + for (const std::string& feature_name : dataset->feature_names_) { + feature_names_.push_back(feature_name + std::string("_j")); + } + if (config.use_differential_feature_in_pairwise_ranking) { + for (const int real_feature_index : diff_original_feature_index) { + feature_names_.push_back(dataset->feature_names_[real_feature_index] + std::string("_k")); + } + } + + real_feature_idx_.clear(); + for (const int idx : dataset->real_feature_idx_) { + real_feature_idx_.push_back(idx); + } + for (const int idx : dataset->real_feature_idx_) { + real_feature_idx_.push_back(idx + dataset->num_total_features_); + } + if (config.use_differential_feature_in_pairwise_ranking) { + for (const auto& 
features_in_diff_group : diff_feature_groups) { + for (const int idx : features_in_diff_group) { + real_feature_idx_.push_back(idx + 2 * dataset->num_total_features_); + } + } + } + + num_total_features_ = dataset->num_total_features_ * 2 + static_cast(diff_feature_bin_mappers.size()); + + forced_bin_bounds_.clear(); + forced_bin_bounds_.reserve(2 * dataset->num_total_features_); + forced_bin_bounds_.insert(forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end()); + forced_bin_bounds_.insert(forced_bin_bounds_.begin() + dataset->forced_bin_bounds_.size(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end()); + forced_bin_bounds_.resize(num_total_features_); + + label_idx_ = dataset->label_idx_; + device_type_ = dataset->device_type_; + gpu_device_id_ = dataset->gpu_device_id_; +} + void Dataset::ReSize(data_size_t num_data) { if (num_data_ != num_data) { num_data_ = num_data; @@ -838,6 +1151,7 @@ void Dataset::CopySubrow(const Dataset* fullset, data_size_t num_used_indices, bool need_meta_data) { CHECK_EQ(num_used_indices, num_data_); + Log::Warning("copy subrow here !!!!"); std::vector group_ids, subfeature_ids; group_ids.reserve(num_features_); subfeature_ids.reserve(num_features_); @@ -853,20 +1167,24 @@ void Dataset::CopySubrow(const Dataset* fullset, subfeature_ids.emplace_back(-1); } } + Log::Warning("copy subrow step 0 !!!!"); int num_copy_tasks = static_cast(group_ids.size()); - - OMP_INIT_EX(); - #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(dynamic) + // OMP_INIT_EX(); + // #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(dynamic) for (int task_id = 0; task_id < num_copy_tasks; ++task_id) { - OMP_LOOP_EX_BEGIN(); + // OMP_LOOP_EX_BEGIN(); + Log::Warning("before copy sub row by col 0"); int group = group_ids[task_id]; int subfeature = subfeature_ids[task_id]; + Log::Warning("before copy sub row by col 1"); 
feature_groups_[group]->CopySubrowByCol(fullset->feature_groups_[group].get(), used_indices, num_used_indices, subfeature); - OMP_LOOP_EX_END(); + Log::Warning("after copy sub row by col"); + // OMP_LOOP_EX_END(); } - OMP_THROW_EX(); + // OMP_THROW_EX(); + Log::Warning("copy subrow step 1 !!!!"); if (need_meta_data) { metadata_.Init(fullset->metadata_, used_indices, num_used_indices); } @@ -886,6 +1204,8 @@ void Dataset::CopySubrow(const Dataset* fullset, device_type_ = fullset->device_type_; gpu_device_id_ = fullset->gpu_device_id_; + Log::Warning("copy subrow step 2 !!!!"); + #ifdef USE_CUDA if (device_type_ == std::string("cuda")) { if (cuda_column_data_ == nullptr) { @@ -1331,6 +1651,7 @@ void Dataset::ConstructHistogramsInner( OMP_LOOP_EX_BEGIN(); int group = used_dense_group[gi]; const int num_bin = feature_groups_[group]->num_total_bin_; + feature_groups_[group]->bin_data_->group_index_ = gi; if (USE_QUANT_GRAD) { if (HIST_BITS == 16) { auto data_ptr = reinterpret_cast(reinterpret_cast(hist_data) + group_bin_boundaries_[group]); @@ -1763,6 +2084,84 @@ const void* Dataset::GetColWiseData( return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator); } +void Dataset::CreatePairwiseRankingDifferentialFeatures( + const std::vector>& sample_values, + const std::vector>& sample_indices, + const std::vector>& bin_mappers, + const data_size_t num_total_sample_data, + const data_size_t* query_boundaries, + const data_size_t num_queries, + std::vector>* differential_feature_bin_mappers, + std::vector* diff_original_feature_index, + const Config& config) const { + const int num_original_features = static_cast(sample_values.size()); + const data_size_t filter_cnt = static_cast( + static_cast(config.min_data_in_leaf * num_total_sample_data) / num_data_); + for (int i = 0; i < num_original_features; ++i) { + if (bin_mappers[i] != nullptr && !bin_mappers[i]->is_trivial() && bin_mappers[i]->bin_type() == 
BinType::NumericalBin) { + diff_original_feature_index->push_back(i); + } + } + const int num_numerical_features = static_cast(diff_original_feature_index->size()); + std::vector> sampled_differential_values(num_numerical_features); + for (int i = 0; i < num_numerical_features; ++i) { + differential_feature_bin_mappers->push_back(nullptr); + } + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int i = 0; i < num_numerical_features; ++i) { + const int feature_index = diff_original_feature_index->at(i); + const data_size_t num_samples_for_feature = static_cast(sample_values[feature_index].size()); + if (config.zero_as_missing) { + int cur_query = 0; + for (int j = 0; j < num_samples_for_feature; ++j) { + const double value = sample_values[feature_index][j]; + data_size_t cur_data_index = sample_indices[feature_index][j]; + while (query_boundaries[cur_query + 1] <= cur_data_index) { + ++cur_query; + } + for (int k = j + 1; sample_indices[feature_index][k] < query_boundaries[cur_query + 1]; ++k) { + const double diff_value = value - sample_values[feature_index][k]; + sampled_differential_values[i].push_back(diff_value); + } + } + } else { + CHECK_GT(sample_indices[feature_index].size(), 0); + int cur_pos_j = 0; + int cur_query = 0; + for (int j = 0; j < sample_indices[feature_index].back() + 1; ++j) { + while (query_boundaries[cur_query + 1] <= j) { + ++cur_query; + } + double value_j = 0.0; + if (j == sample_indices[feature_index][cur_pos_j]) { + value_j = sample_values[feature_index][cur_pos_j]; + ++cur_pos_j; + } + int cur_pos_k = cur_pos_j; + for (int k = j + 1; k < query_boundaries[cur_query + 1] && k < sample_indices[feature_index].back() + 1; ++k) { + double value_k = 0.0; + if (k == sample_indices[feature_index][cur_pos_k]) { + value_k = sample_values[feature_index][cur_pos_k]; + ++cur_pos_k; + } + const double diff_value = value_j - value_k; + 
sampled_differential_values[i].push_back(diff_value); + } + } + } + differential_feature_bin_mappers->operator[](i).reset(new BinMapper()); + std::vector forced_upper_bounds; + differential_feature_bin_mappers->operator[](i)->FindBin( + sampled_differential_values[i].data(), + static_cast(sampled_differential_values[i].size()), + static_cast(num_total_sample_data * (num_total_sample_data + 1) / 2), + config.max_bin, config.min_data_in_bin, filter_cnt, config.feature_pre_filter, + BinType::NumericalBin, config.use_missing, config.zero_as_missing, forced_upper_bounds + ); + } +} + #ifdef USE_CUDA void Dataset::CreateCUDAColumnData() { cuda_column_data_.reset(new CUDAColumnData(num_data_, gpu_device_id_)); diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 9c8a0417b118..c5715e390b1b 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -350,6 +350,13 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, // not need to check validation data // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); + + dataset->sampled_values_ = train_data->sampled_values_; + dataset->sampled_indices_ = train_data->sampled_indices_; + dataset->num_total_sampled_data_ = train_data->num_total_sampled_data_; + dataset->train_query_boundaries_ = train_data->metadata().query_boundaries(); + dataset->train_num_queries_ = train_data->metadata().num_queries(); + return dataset.release(); } @@ -1249,7 +1256,8 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, CheckCategoricalFeatureNumBin(bin_mappers, config_.max_bin, config_.max_bin_by_feature); dataset->Construct(&bin_mappers, dataset->num_total_features_, forced_bin_bounds, Common::Vector2Ptr(&sample_indices).data(), Common::Vector2Ptr(&sample_values).data(), - Common::VectorSize(sample_indices).data(), static_cast(sample_indices.size()), sample_data.size(), config_); + 
Common::VectorSize(sample_indices).data(), static_cast(sample_indices.size()), + sample_data.size(), config_); if (dataset->has_raw()) { dataset->ResizeRaw(static_cast(sample_data.size())); } diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 84df590c3fe7..0146558eeffd 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -566,8 +566,11 @@ class DenseBin : public Bin { void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { + Log::Warning("is dense"); auto other_bin = dynamic_cast*>(full_bin); + Log::Warning("other bin created"); if (IS_4BIT) { + Log::Warning("is 4 bit"); const data_size_t rest = num_used_indices & 1; for (int i = 0; i < num_used_indices - rest; i += 2) { data_size_t idx = used_indices[i]; @@ -586,6 +589,7 @@ class DenseBin : public Bin { } } else { for (int i = 0; i < num_used_indices; ++i) { + CHECK_LT(used_indices[i], data_.size()); data_[i] = other_bin->data_[used_indices[i]]; } } @@ -605,7 +609,7 @@ class DenseBin : public Bin { const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; - private: + protected: data_size_t num_data_; #ifdef USE_CUDA std::vector> data_; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index f46e6d1c9f14..cc2c26803be3 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -853,5 +853,98 @@ size_t Metadata::SizesInByte() const { return size; } +data_size_t Metadata::BuildPairwiseFeatureRanking(const Metadata& metadata, const std::string& pairing_approach) { + num_queries_ = metadata.num_queries(); + label_.clear(); + positions_.clear(); + position_ids_.clear(); + if (pairwise_ranking_mode_ == PairwiseRankingMode::kRelevance) { + const label_t* pointwise_label = metadata.label(); + const label_t* pointwise_weights = metadata.weights(); + paired_ranking_item_index_map_.clear(); + paired_ranking_item_global_index_map_.clear(); + const data_size_t* query_boundaries = 
metadata.query_boundaries(); + + if (query_boundaries == nullptr) { + Log::Fatal("Query boundaries must be provided for ranking."); + } + + // backup pointwise query boundaries + query_boundaries_.clear(); + query_boundaries_.resize(num_queries_ + 1); + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_queries_ >= 1024) + for (data_size_t i = 0; i < num_queries_ + 1; ++i) { + query_boundaries_[i] = query_boundaries[i]; + } + + // copy labels + const data_size_t pointwise_num_data = query_boundaries[num_queries_]; + if (pointwise_label != nullptr) { + label_.resize(pointwise_num_data); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (pointwise_num_data >= 1024) + for (data_size_t i = 0; i < pointwise_num_data; ++i) { + label_[i] = pointwise_label[i]; + } + } + + // copy weights + if (pointwise_weights != nullptr) { + weights_.resize(pointwise_num_data); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (pointwise_num_data >= 1024) + for (data_size_t i = 0; i < pointwise_num_data; ++i) { + weights_[i] = pointwise_weights[i]; + } + } + + // copy position information + if (metadata.num_position_ids() > 0) { + positions_.resize(pointwise_num_data); + const data_size_t* pointwise_positions = metadata.positions(); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (pointwise_num_data >= 1024) + for (data_size_t i = 0; i < pointwise_num_data; ++i) { + positions_[i] = pointwise_positions[i]; + } + + const data_size_t num_position_ids = static_cast(metadata.num_position_ids()); + position_ids_.resize(num_position_ids); + const std::string* pointwise_position_ids = metadata.position_ids(); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_position_ids >= 1024) + for (data_size_t i = 0; i < num_position_ids; ++i) { + position_ids_[i] = pointwise_position_ids[i]; + } + } + + pairwise_query_boundaries_.clear(); + 
pairwise_query_boundaries_.push_back(0); + num_data_ = 0; + for (data_size_t query_index = 0; query_index < metadata.num_queries(); ++query_index) { + const data_size_t query_start = query_boundaries[query_index]; + const data_size_t query_end = query_boundaries[query_index + 1]; + for (data_size_t item_index_i = query_start; item_index_i < query_end; ++item_index_i) { + const label_t label_i = label_[item_index_i]; + for (data_size_t item_index_j = query_start; item_index_j < query_end; ++item_index_j) { + if (item_index_i == item_index_j) { + continue; + } + const label_t label_j = label_[item_index_j]; + if ((pairing_approach == std::string("all")) || + (pairing_approach == std::string("different_relevance") && label_i != label_j) || + (pairing_approach == std::string("at_least_one_relevant") && (label_i > 0 || label_j > 0))) { + paired_ranking_item_index_map_.push_back(std::pair{item_index_i - query_start, item_index_j - query_start}); + paired_ranking_item_global_index_map_.push_back(std::pair{item_index_i, item_index_j}); + ++num_data_; + } + } + } + pairwise_query_boundaries_.push_back(num_data_); + } + } else { + // TODO(shiyu1994) + Log::Fatal("Not implemented."); + } + + return num_data_; +} } // namespace LightGBM diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index f8764c03bec6..cabff9dcae25 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace LightGBM { @@ -26,6 +27,17 @@ class MultiValDenseBin : public MultiValBin { data_.resize(static_cast(num_data_) * num_feature_, static_cast(0)); } + void DumpContent() const override { + std::ofstream fout("multi_val_bin.txt"); + for (data_size_t i = 0; i < num_data_; ++i) { + for (data_size_t j = 0; j < num_feature_; ++j) { + fout << static_cast(data_[i * num_feature_ + j]) << " "; + } + fout << std::endl; + } + fout.close(); + } + ~MultiValDenseBin() { } @@ -248,14 +260,19 @@ class 
MultiValDenseBin : public MultiValBin { void ReSize(data_size_t num_data, int num_bin, int num_feature, double, const std::vector& offsets) override { + Log::Warning("ReSize step 0"); num_data_ = num_data; num_bin_ = num_bin; num_feature_ = num_feature; offsets_ = offsets; + Log::Warning("ReSize step 1"); + Log::Warning("data_.size() = %ld", data_.size()); size_t new_size = static_cast(num_feature_) * num_data_; + Log::Warning("new_size = %ld", new_size); if (data_.size() < new_size) { data_.resize(new_size, 0); } + Log::Warning("ReSize step 2"); } template @@ -336,7 +353,7 @@ class MultiValDenseBin : public MultiValBin { uint8_t* data_ptr_bit_type) const override; #endif // USE_CUDA - private: + protected: data_size_t num_data_; int num_bin_; int num_feature_; diff --git a/src/io/multi_val_pairwise_lambdarank_bin.hpp b/src/io/multi_val_pairwise_lambdarank_bin.hpp new file mode 100644 index 000000000000..ced631100b94 --- /dev/null +++ b/src/io/multi_val_pairwise_lambdarank_bin.hpp @@ -0,0 +1,102 @@ +/*! + * Copyright (c) 2024 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ +#ifndef LIGHTGBM_IO_MULTI_VAL_PAIRWISE_LAMBDARANK_BIN_HPP_ +#define LIGHTGBM_IO_MULTI_VAL_PAIRWISE_LAMBDARANK_BIN_HPP_ + +#include "multi_val_dense_bin.hpp" + +namespace LightGBM { + +template class MULTI_VAL_BIN_TYPE> +class MultiValPairwiseLambdarankBin : public MULTI_VAL_BIN_TYPE { + public: + MultiValPairwiseLambdarankBin(data_size_t num_data, int num_bin, int num_feature, const std::vector& offsets): MULTI_VAL_BIN_TYPE(num_data, num_bin, num_feature, offsets) { + this->num_bin_ = num_bin; + Log::Warning("num_bin = %d", num_bin); + } + protected: + const std::pair* paired_ranking_item_global_index_map_; +}; + + +template +class MultiValDensePairwiseLambdarankBin: public MultiValPairwiseLambdarankBin { + public: + MultiValDensePairwiseLambdarankBin(data_size_t num_data, int num_bin, int num_feature, + const std::vector& offsets, const std::pair* paired_ranking_item_global_index_map): MultiValPairwiseLambdarankBin(num_data, num_bin, num_feature, offsets) { + this->paired_ranking_item_global_index_map_ = paired_ranking_item_global_index_map; + } + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* hessians, hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, + gradients, hessians, out); + } + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner( + nullptr, start, end, gradients, hessians, out); + } + + void ConstructHistogramOrdered(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, + gradients, hessians, out); + } + + template + void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, const score_t* 
hessians, hist_t* out) const { + data_size_t i = start; + hist_t* grad = out; + hist_t* hess = out + 1; + for (; i < end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const data_size_t first_idx = this->paired_ranking_item_global_index_map_[idx].first; + const data_size_t second_idx = this->paired_ranking_item_global_index_map_[idx].second; + const auto first_j_start = this->RowPtr(first_idx); + const BIN_TYPE* first_data_ptr = this->data_.data() + first_j_start; + const score_t gradient = ORDERED ? gradients[i] : gradients[idx]; + const score_t hessian = ORDERED ? hessians[i] : hessians[idx]; + for (int j = 0; j < this->num_feature_; ++j) { + const uint32_t bin = static_cast(first_data_ptr[j]); + // if (bin != 0) { + // Log::Warning("first bin = %d, num_feature_ = %d", bin, this->num_feature_); + // } + // if (j == 0) { + // Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", j, bin, gradient, hessian); + // } + + const auto ti = (bin + this->offsets_[j]) << 1; + grad[ti] += gradient; + hess[ti] += hessian; + } + + const auto second_j_start = this->RowPtr(second_idx); + const BIN_TYPE* second_data_ptr = this->data_.data() + second_j_start; + const auto base_offset = this->offsets_.back(); + for (int j = 0; j < this->num_feature_; ++j) { + const uint32_t bin = static_cast(second_data_ptr[j]); + // if (bin != 0) { + // Log::Warning("second bin = %d, num_feature_ = %d", bin, this->num_feature_); + // } + const auto ti = (bin + this->offsets_[j] + base_offset) << 1; + grad[ti] += gradient; + hess[ti] += hessian; + } + } + } +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_IO_MULTI_VAL_PAIRWISE_LAMBDARANK_BIN_HPP_ diff --git a/src/io/pairwise_lambdarank_bin.cpp b/src/io/pairwise_lambdarank_bin.cpp new file mode 100644 index 000000000000..6f5cfd8cbad9 --- /dev/null +++ b/src/io/pairwise_lambdarank_bin.cpp @@ -0,0 +1,549 @@ +/*! + * Copyright (c) 2024 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
See LICENSE file in the project root for + * license information. + */ + +#include "pairwise_lambdarank_bin.hpp" + +#include + +namespace LightGBM { + +template +uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const { + const data_size_t first_data_index = this->paired_ranking_item_index_map_[paired_data_index].first; + const data_size_t second_data_index = this->paired_ranking_item_index_map_[paired_data_index].second; + const uint32_t first_bin = static_cast(this->unpaired_bin_->data(first_data_index)); + const uint32_t second_bin = static_cast(this->unpaired_bin_->data(second_data_index)); + int first_feature_index = static_cast(std::upper_bound(bin_offsets_->begin(), bin_offsets_->end(), first_bin) - bin_offsets_->begin()) - 1; + int second_feature_index = static_cast(std::upper_bound(bin_offsets_->begin(), bin_offsets_->end(), second_bin) - bin_offsets_->begin()) - 1; + + // TODO(shiyu1994): better original value, handle nan as missing + const double first_value = first_feature_index >= 0 ? ori_bin_mappers_->at(first_feature_index)->BinToValue(first_bin) : 0.0; + const double second_value = second_feature_index >= 0 ? 
ori_bin_mappers_->at(second_feature_index)->BinToValue(second_bin) : 0.0; + const double diff_value = first_value - second_value; + CHECK(first_feature_index >= 0 || first_bin == 0); + if (first_feature_index >= 0 && first_feature_index == second_feature_index) { + const uint32_t min_bin = diff_bin_offsets_->at(first_feature_index); + const uint32_t max_bin = diff_bin_offsets_->at(first_feature_index + 1) - 1; + const uint32_t most_freq_bin = diff_bin_mappers_->at(first_feature_index)->GetMostFreqBin(); + const uint32_t diff_bin = diff_bin_mappers_->at(first_feature_index)->ValueToBin(diff_value); + return diff_bin + min_bin - static_cast(most_freq_bin == 0); + } else { + return 0; + } +} + +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; +template uint32_t DensePairwiseRankingDiffBin::GetBinAt(const data_size_t paired_data_index) const; + +template class ITERATOR_TYPE> +void PairwiseRankingBin::InitStreaming(uint32_t num_thread, int32_t omp_max_threads) { + unpaired_bin_->InitStreaming(num_thread, omp_max_threads); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::Push(int tid, data_size_t idx, uint32_t value) { + unpaired_bin_->Push(tid, idx, value); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) { + Log::Warning("copy subrow in pairwie ranking bin"); + unpaired_bin_->CopySubrow(full_bin, used_indices, num_used_indices); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::SaveBinaryToFile(BinaryWriter* writer) const { + unpaired_bin_->SaveBinaryToFile(writer); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::LoadFromMemory(const void* memory, const std::vector& 
local_used_indices) { + unpaired_bin_->LoadFromMemory(memory, local_used_indices); +} + +template class ITERATOR_TYPE> +size_t PairwiseRankingBin::SizesInByte() const { + return unpaired_bin_->SizesInByte(); +} + +template class ITERATOR_TYPE> +data_size_t PairwiseRankingBin::num_data() const { + return unpaired_bin_->num_data(); +} + +template class ITERATOR_TYPE> +void PairwiseRankingBin::ReSize(data_size_t num_data) { + return unpaired_bin_->ReSize(num_data); +} + +template class ITERATOR_TYPE> +template +void DensePairwiseRankingBin::ConstructHistogramInner( + const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const { + data_size_t i = start; + hist_t* grad = out; + hist_t* hess = out + 1; + hist_cnt_t* cnt = reinterpret_cast(hess); + if (USE_PREFETCH) { + const data_size_t pf_offset = 64 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + for (; i < pf_end; ++i) { + const auto paired_idx = USE_INDICES ? data_indices[i] : i; + const auto ti = GetBinAt(paired_idx) << 1; + // if (this->group_index_ == 0) { + // Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", this->group_index_, ti / 2, ordered_gradients[i], ordered_hessians[i]); + // } + if (USE_HESSIAN) { + grad[ti] += ordered_gradients[i]; + hess[ti] += ordered_hessians[i]; + } else { + grad[ti] += ordered_gradients[i]; + ++cnt[ti]; + } + } + } + for (; i < end; ++i) { + const auto paired_idx = USE_INDICES ? 
data_indices[i] : i; + const auto ti = GetBinAt(paired_idx) << 1; + // if (this->group_index_ == 0) { + // Log::Warning("group index = %d bin = %d gradient = %f hessian = %f", this->group_index_, ti / 2, ordered_gradients[i], ordered_hessians[i]); + // } + if (USE_HESSIAN) { + grad[ti] += ordered_gradients[i]; + hess[ti] += ordered_hessians[i]; + } else { + grad[ti] += ordered_gradients[i]; + ++cnt[ti]; + } + } +} + +template class ITERATOR_TYPE> +template +void DensePairwiseRankingBin::ConstructHistogramIntInner( + const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const { + data_size_t i = start; + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + const int16_t* gradients_ptr = reinterpret_cast(ordered_gradients); + if (USE_PREFETCH) { + const data_size_t pf_offset = 64 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + for (; i < pf_end; ++i) { + const auto paired_idx = USE_INDICES ? data_indices[i] : i; + const auto ti = GetBinAt(paired_idx) << 1; + const int16_t gradient_16 = gradients_ptr[i]; + if (USE_HESSIAN) { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[ti] += gradient_packed; + } else { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (1); + out_ptr[ti] += gradient_packed; + } + } + } + for (; i < end; ++i) { + const auto paired_idx = USE_INDICES ? data_indices[i] : i; + const auto ti = GetBinAt(paired_idx) << 1; + const int16_t gradient_16 = gradients_ptr[i]; + if (USE_HESSIAN) { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[ti] += gradient_packed; + } else { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? 
gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (1); + out_ptr[ti] += gradient_packed; + } + } +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const { + ConstructHistogramInner( + data_indices, start, end, ordered_gradients, ordered_hessians, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const { + ConstructHistogramInner( + nullptr, start, end, ordered_gradients, ordered_hessians, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogram( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const { + ConstructHistogramInner(data_indices, start, end, + ordered_gradients, nullptr, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogram( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const { + ConstructHistogramInner( + nullptr, start, end, ordered_gradients, nullptr, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, 
out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt8( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt16( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + 
const score_t* /*ordered_hessians*/, + hist_t* out) const { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +void DensePairwiseRankingBin::ConstructHistogramInt32( + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); +} + +template class ITERATOR_TYPE> +template +data_size_t DensePairwiseRankingBin::SplitInner(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const { + auto th = static_cast(threshold + min_bin); + auto t_zero_bin = static_cast(min_bin + default_bin); + if (most_freq_bin == 0) { + --th; + --t_zero_bin; + } + const auto minb = static_cast(min_bin); + const auto maxb = static_cast(max_bin); + data_size_t lte_count = 0; + data_size_t gt_count = 0; + data_size_t* default_indices = gt_indices; + data_size_t* default_count = >_count; + data_size_t* missing_default_indices = gt_indices; + data_size_t* missing_default_count = >_count; + if (most_freq_bin <= threshold) { + default_indices = lte_indices; + default_count = <e_count; + } + if (MISS_IS_ZERO || MISS_IS_NA) { + if (default_left) { 
+ missing_default_indices = lte_indices; + missing_default_count = <e_count; + } + } + if (min_bin < max_bin) { + for (data_size_t i = 0; i < cnt; ++i) { + const data_size_t paired_idx = data_indices[i]; + const auto bin = GetBinAt(paired_idx); + if ((MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISS_IS_NA && !MFB_IS_NA && bin == maxb)) { + missing_default_indices[(*missing_default_count)++] = paired_idx; + } else if ((USE_MIN_BIN && (bin < minb || bin > maxb)) || + (!USE_MIN_BIN && bin == 0)) { + if ((MISS_IS_NA && MFB_IS_NA) || (MISS_IS_ZERO && MFB_IS_ZERO)) { + missing_default_indices[(*missing_default_count)++] = paired_idx; + } else { + default_indices[(*default_count)++] = paired_idx; + } + } else if (bin > th) { + gt_indices[gt_count++] = paired_idx; + } else { + lte_indices[lte_count++] = paired_idx; + } + } + } else { + data_size_t* max_bin_indices = gt_indices; + data_size_t* max_bin_count = >_count; + if (maxb <= th) { + max_bin_indices = lte_indices; + max_bin_count = <e_count; + } + for (data_size_t i = 0; i < cnt; ++i) { + const data_size_t paired_idx = data_indices[i]; + const auto bin = GetBinAt(paired_idx); + if (MISS_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { + missing_default_indices[(*missing_default_count)++] = paired_idx; + } else if (bin != maxb) { + if ((MISS_IS_NA && MFB_IS_NA) || (MISS_IS_ZERO && MFB_IS_ZERO)) { + missing_default_indices[(*missing_default_count)++] = paired_idx; + } else { + default_indices[(*default_count)++] = paired_idx; + } + } else { + if (MISS_IS_NA && !MFB_IS_NA) { + missing_default_indices[(*missing_default_count)++] = paired_idx; + } else { + max_bin_indices[(*max_bin_count)++] = paired_idx; + } + } + } + } + return lte_count; +} + +template class ITERATOR_TYPE> +data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + 
data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const { + #define ARGUMENTS \ + min_bin, max_bin, default_bin, most_freq_bin, default_left, threshold, \ + data_indices, cnt, lte_indices, gt_indices + if (missing_type == MissingType::None) { + return SplitInner(ARGUMENTS); + } else if (missing_type == MissingType::Zero) { + if (default_bin == most_freq_bin) { + return SplitInner(ARGUMENTS); + } else { + return SplitInner(ARGUMENTS); + } + } else { + if (max_bin == most_freq_bin + min_bin && most_freq_bin > 0) { + return SplitInner(ARGUMENTS); + } else { + return SplitInner(ARGUMENTS); + } + } +#undef ARGUMENTS +} + +template class ITERATOR_TYPE> +data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const { +#define ARGUMENTS \ + 1, max_bin, default_bin, most_freq_bin, default_left, threshold, \ + data_indices, cnt, lte_indices, gt_indices + if (missing_type == MissingType::None) { + return SplitInner(ARGUMENTS); + } else if (missing_type == MissingType::Zero) { + if (default_bin == most_freq_bin) { + return SplitInner(ARGUMENTS); + } else { + return SplitInner(ARGUMENTS); + } + } else { + if (max_bin == most_freq_bin + 1 && most_freq_bin > 0) { + return SplitInner(ARGUMENTS); + } else { + return SplitInner(ARGUMENTS); + } + } +#undef ARGUMENTS +} + + +#define REGISTER_BIN_TYPE(BIN_TYPE, ITERATOR_TYPE) \ + template void PairwiseRankingBin::InitStreaming(uint32_t num_thread, int32_t omp_max_threads); \ + template void PairwiseRankingBin::Push(int tid, data_size_t idx, uint32_t value); \ + template void PairwiseRankingBin::CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices); \ + template void PairwiseRankingBin::SaveBinaryToFile(BinaryWriter* writer) const; \ + template void 
PairwiseRankingBin::LoadFromMemory(const void* memory, const std::vector& local_used_indices); \ + template size_t PairwiseRankingBin::SizesInByte() const; \ + template data_size_t PairwiseRankingBin::num_data() const; \ + template void PairwiseRankingBin::ReSize(data_size_t num_data); + +#define COMMA , + +#define REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME(DATA_TYPE, USE_4BIT, ITERATOR_TYPE, FUNC_NAME) \ + template void DensePairwiseRankingBin::FUNC_NAME( \ + const data_size_t* data_indices, data_size_t start, data_size_t end, \ + const score_t* ordered_gradients, const score_t* ordered_hessians, \ + hist_t* out) const; \ + template void DensePairwiseRankingBin::FUNC_NAME( \ + data_size_t start, data_size_t end, \ + const score_t* ordered_gradients, \ + const score_t* ordered_hessians, \ + hist_t* out) const; \ + template void DensePairwiseRankingBin::FUNC_NAME( \ + const data_size_t* data_indices, data_size_t start, \ + data_size_t end, const score_t* ordered_gradients, \ + hist_t* out) const; \ + template void DensePairwiseRankingBin::FUNC_NAME( \ + data_size_t start, data_size_t end, \ + const score_t* ordered_gradients, \ + hist_t* out) const; + + +#define REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE(DATA_TYPE, USE_4BIT, ITERATOR_TYPE) \ + REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME(DATA_TYPE, USE_4BIT, ITERATOR_TYPE, ConstructHistogram) \ + REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME(DATA_TYPE, USE_4BIT, ITERATOR_TYPE, ConstructHistogramInt8) \ + REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME(DATA_TYPE, USE_4BIT, ITERATOR_TYPE, ConstructHistogramInt16) \ + REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME(DATA_TYPE, USE_4BIT, ITERATOR_TYPE, ConstructHistogramInt32) \ + template data_size_t DensePairwiseRankingBin::Split(uint32_t min_bin, uint32_t max_bin, \ + uint32_t default_bin, uint32_t most_freq_bin, \ + MissingType 
missing_type, bool default_left, \ + uint32_t threshold, const data_size_t* data_indices, \ + data_size_t cnt, \ + data_size_t* lte_indices, \ + data_size_t* gt_indices) const; \ + template data_size_t DensePairwiseRankingBin::Split(uint32_t max_bin, uint32_t default_bin, \ + uint32_t most_freq_bin, MissingType missing_type, \ + bool default_left, uint32_t threshold, \ + const data_size_t* data_indices, data_size_t cnt, \ + data_size_t* lte_indices, \ + data_size_t* gt_indices) const; + + +#define REGISTER_DENSE_TREE_LEARNING_FUNC_FOR_ITERATOR_TYPE(ITERATOR_TYPE) \ + REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE(uint8_t, true, ITERATOR_TYPE) \ + REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE(uint8_t, false, ITERATOR_TYPE) \ + REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE(uint16_t, false, ITERATOR_TYPE) \ + REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE(uint32_t, false, ITERATOR_TYPE) + + +#define REGISTER_ITERATOR(ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(DenseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(DenseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(DenseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(DenseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(SparseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(SparseBin, ITERATOR_TYPE) \ + REGISTER_BIN_TYPE(SparseBin, ITERATOR_TYPE) \ + REGISTER_DENSE_TREE_LEARNING_FUNC_FOR_ITERATOR_TYPE(ITERATOR_TYPE) + + +REGISTER_ITERATOR(PairwiseRankingFirstIterator) +REGISTER_ITERATOR(PairwiseRankingSecondIterator) +REGISTER_ITERATOR(PairwiseRankingDiffIterator) + + +#undef COMMA +#undef REGISTER_TYPE +#undef REGISTER_BIN_TYPE +#undef REGISTER_DENSE_TREE_LEARNING_FUNC_FOR_ITERATOR_TYPE +#undef REGISTER_DENSE_TREE_LEARNING_FUNC_WITH_ITERATOR_TYPE +#undef REGISTER_DENSE_HISTOGRAM_CONSTRUCTION_WITH_ITERATOR_TYPE_AND_FUNC_NAME + + +} // namespace LightGBM diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp new file mode 100644 index 000000000000..0e481c78f37f --- /dev/null +++ 
b/src/io/pairwise_lambdarank_bin.hpp @@ -0,0 +1,633 @@ +/*! + * Copyright (c) 2016 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_IO_PAIRWISE_LAMBDARANK_BIN_HPP_ +#define LIGHTGBM_IO_PAIRWISE_LAMBDARANK_BIN_HPP_ + +#include +#include + +#include + +#include "dense_bin.hpp" +#include "sparse_bin.hpp" + +namespace LightGBM { + +template +class PairwiseRankingFirstBin; + +template +class PairwiseRankingSecondBin; + +template +class PairwiseRankingDiffBin; + +template +class PairwiseRankingFirstIterator: public BinIterator { + public: + friend PairwiseRankingFirstBin; + + PairwiseRankingFirstIterator(const BIN_TYPE* unpaired_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { + unpaired_bin_ = unpaired_bin; + unpaired_bin_iterator_.reset(unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin)); + unpaired_bin_iterator_->Reset(0); + paired_ranking_item_index_map_ = paired_ranking_item_index_map; + prev_index_ = -1; + prev_val_ = 0; + } + + ~PairwiseRankingFirstIterator() {} + + uint32_t Get(data_size_t idx) { + const data_size_t data_index = paired_ranking_item_index_map_[idx].first; + if (data_index != prev_index_) { + CHECK_GE(data_index, prev_index_); + prev_val_ = unpaired_bin_iterator_->Get(data_index); + } + prev_index_ = data_index; + return prev_val_; + } + + uint32_t RawGet(data_size_t idx) { + const data_size_t data_index = paired_ranking_item_index_map_[idx].first; + if (data_index != prev_index_) { + CHECK_GE(data_index, prev_index_); + prev_val_ = unpaired_bin_iterator_->RawGet(data_index); + } + prev_index_ = data_index; + return prev_val_; + } + + void Reset(data_size_t idx) { + const data_size_t first_idx = paired_ranking_item_index_map_[idx].first; + unpaired_bin_iterator_->Reset(first_idx); + prev_index_ = -1; + prev_val_ = 0; + } + + private: + 
const BIN_TYPE* unpaired_bin_; + std::unique_ptr unpaired_bin_iterator_; + const std::pair* paired_ranking_item_index_map_; + data_size_t prev_index_; + uint32_t prev_val_; +}; + +template +class PairwiseRankingSecondIterator: public BinIterator { + public: + friend PairwiseRankingSecondBin; + + PairwiseRankingSecondIterator(const BIN_TYPE* unpaired_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { + unpaired_bin_ = unpaired_bin; + unpaired_bin_iterator_.reset(unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin)); + unpaired_bin_iterator_->Reset(0); + paired_ranking_item_index_map_ = paired_ranking_item_index_map; + prev_index_ = 0; + } + + ~PairwiseRankingSecondIterator() {} + + uint32_t Get(data_size_t idx) { + const data_size_t data_index = paired_ranking_item_index_map_[idx].second; + if (data_index < prev_index_) { + unpaired_bin_iterator_->Reset(0); + } + prev_index_ = data_index; + return unpaired_bin_iterator_->Get(data_index); + } + + uint32_t RawGet(data_size_t idx) { + const data_size_t data_index = paired_ranking_item_index_map_[idx].second; + if (data_index < prev_index_) { + unpaired_bin_iterator_->Reset(0); + } + prev_index_ = data_index; + return unpaired_bin_iterator_->RawGet(data_index); + } + + void Reset(data_size_t idx) { + const data_size_t second_idx = paired_ranking_item_index_map_[idx].second; + unpaired_bin_iterator_->Reset(second_idx); + prev_index_ = 0; + } + + private: + const BIN_TYPE* unpaired_bin_; + std::unique_ptr unpaired_bin_iterator_; + const std::pair* paired_ranking_item_index_map_; + data_size_t prev_index_; +}; + + +template +class PairwiseRankingDiffIterator: public BinIterator { + public: + friend PairwiseRankingDiffBin; + + PairwiseRankingDiffIterator(const BIN_TYPE* unpaired_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin, const BinMapper* 
original_feature_bin_mapper, const BinMapper* diff_feature_bin_mapper): min_bin_(min_bin), max_bin_(max_bin), offset_(diff_feature_bin_mapper->GetMostFreqBin() == 0) { + unpaired_bin_ = unpaired_bin; + first_unpaired_bin_iterator_.reset(unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin)); + first_unpaired_bin_iterator_->Reset(0); + second_unpaired_bin_iterator_.reset(unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin)); + second_unpaired_bin_iterator_->Reset(0); + paired_ranking_item_index_map_ = paired_ranking_item_index_map; + first_prev_index_ = 0; + second_prev_index_ = 0; + original_feature_bin_mapper_ = original_feature_bin_mapper; + diff_feature_bin_mapper_ = diff_feature_bin_mapper; + } + + ~PairwiseRankingDiffIterator() {} + + uint32_t Get(data_size_t idx) { + const data_size_t first_data_index = paired_ranking_item_index_map_[idx].first; + const data_size_t second_data_index = paired_ranking_item_index_map_[idx].second; + if (second_data_index < second_prev_index_) { + second_unpaired_bin_iterator_->Reset(0); + } + first_prev_index_ = first_data_index; + second_prev_index_ = second_data_index; + const uint32_t first_bin = first_unpaired_bin_iterator_->Get(first_data_index); + const uint32_t second_bin = second_unpaired_bin_iterator_->Get(second_data_index); + // TODO(shiyu1994): better original value + const double first_value = original_feature_bin_mapper_->BinToValue(first_bin); + const double second_value = original_feature_bin_mapper_->BinToValue(second_bin); + const double diff_value = first_value - second_value; + const uint32_t diff_bin = diff_feature_bin_mapper_->ValueToBin(diff_value); + return diff_bin; + } + + uint32_t RawGet(data_size_t idx) { + const uint32_t bin = Get(idx); + return bin + min_bin_ - offset_; + } + + void Reset(data_size_t idx) { + const data_size_t first_idx = paired_ranking_item_index_map_[idx].first; + const data_size_t second_idx = paired_ranking_item_index_map_[idx].second; + 
first_unpaired_bin_iterator_->Reset(first_idx); + second_unpaired_bin_iterator_->Reset(second_idx); + first_prev_index_ = -1; + second_prev_index_ = 0; + } + + private: + const BIN_TYPE* unpaired_bin_; + std::unique_ptr first_unpaired_bin_iterator_; + std::unique_ptr second_unpaired_bin_iterator_; + const std::pair* paired_ranking_item_index_map_; + const BinMapper* original_feature_bin_mapper_; + const BinMapper* diff_feature_bin_mapper_; + data_size_t first_prev_index_; + data_size_t second_prev_index_; + const uint32_t min_bin_; + const uint32_t max_bin_; + const uint32_t offset_; +}; + + +template class ITERATOR_TYPE> +class PairwiseRankingBin: public BIN_TYPE { + public: + PairwiseRankingBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, BIN_TYPE* unpaired_bin): BIN_TYPE(0), paired_ranking_item_index_map_(paired_ranking_item_index_map), unpaired_bin_(unpaired_bin) { + num_data_ = num_data; + } + + virtual ~PairwiseRankingBin() {} + + void InitStreaming(uint32_t num_thread, int32_t omp_max_threads) override; + + void Push(int tid, data_size_t idx, uint32_t value) override; + + void FinishLoad() override { + unpaired_bin_->FinishLoad(); + } + + void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override; + + void SaveBinaryToFile(BinaryWriter* writer) const override; + + void LoadFromMemory(const void* memory, + const std::vector& local_used_indices) override; + + size_t SizesInByte() const override; + + data_size_t num_data() const override; + + void* get_data() override { + return unpaired_bin_->get_data(); + } + + BinIterator* GetUnpairedIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return unpaired_bin_->GetIterator(min_bin, max_bin, most_freq_bin); + } + + void ReSize(data_size_t num_data) override; + + data_size_t Split(uint32_t /*min_bin*/, uint32_t /*max_bin*/, + uint32_t /*default_bin*/, uint32_t /*most_freq_bin*/, + MissingType 
/*missing_type*/, bool /*default_left*/, + uint32_t /*threshold*/, const data_size_t* /*data_indices*/, + data_size_t /*cnt*/, + data_size_t* /*lte_indices*/, + data_size_t* /*gt_indices*/) const override { + Log::Fatal("Not implemented."); + return 0; + } + + data_size_t SplitCategorical( + uint32_t /*min_bin*/, uint32_t /*max_bin*/, uint32_t /*most_freq_bin*/, + const uint32_t* /*threshold*/, int /*num_threshold*/, + const data_size_t* /*data_indices*/, data_size_t /*cnt*/, + data_size_t* /*lte_indices*/, data_size_t* /*gt_indices*/) const override { + Log::Fatal("Not implemented."); + return 0; + } + + data_size_t Split(uint32_t /*max_bin*/, uint32_t /*default_bin*/, + uint32_t /*most_freq_bin*/, MissingType /*missing_type*/, + bool /*default_left*/, uint32_t /*threshold*/, + const data_size_t* /*data_indices*/, data_size_t /*cnt*/, + data_size_t* /*lte_indices*/, + data_size_t* /*gt_indices*/) const override { + Log::Fatal("Not implemented."); + return 0; + } + + data_size_t SplitCategorical( + uint32_t /*max_bin*/, uint32_t /*most_freq_bin*/, const uint32_t* /*threshold*/, + int /*num_threshold*/, const data_size_t* /*data_indices*/, data_size_t /*cnt*/, + data_size_t* /*lte_indices*/, data_size_t* /*gt_indices*/) const override { + Log::Fatal("Not implemented."); + return 0; + } + + void ConstructHistogram( + const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogram(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt8( + const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* 
/*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt8(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt16( + const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt16(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt32( + const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt32(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, const score_t* /*ordered_hessians*/, + hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogram(const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogram(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt8(const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt8(data_size_t /*start*/, data_size_t /*end*/, + const 
score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt16(const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt16(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt32(const data_size_t* /*data_indices*/, data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + void ConstructHistogramInt32(data_size_t /*start*/, data_size_t /*end*/, + const score_t* /*ordered_gradients*/, hist_t* /*out*/) const override { + Log::Fatal("Not implemented."); + } + + const void* GetColWiseData(uint8_t* /*bit_type*/, bool* /*is_sparse*/, std::vector* /*bin_iterator*/, const int /*num_threads*/) const override { + Log::Fatal("Not implemented."); + return nullptr; + } + + const void* GetColWiseData(uint8_t* /*bit_type*/, bool* /*is_sparse*/, BinIterator** /*bin_iterator*/) const override { + Log::Fatal("Not implemented."); + return nullptr; + } + + protected: + + virtual data_size_t get_unpaired_index(const data_size_t paired_index) const = 0; + + const std::pair* paired_ranking_item_index_map_; + const std::unique_ptr unpaired_bin_; + data_size_t num_data_; +}; + +template class ITERATOR_TYPE> +class DensePairwiseRankingBin: public PairwiseRankingBin, ITERATOR_TYPE> { + public: + DensePairwiseRankingBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, DenseBin* unpaired_bin): PairwiseRankingBin, ITERATOR_TYPE>(num_data, paired_ranking_item_index_map, unpaired_bin) {} + + void ConstructHistogram( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* 
ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override; + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt16(const 
data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const override; + + data_size_t Split(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + MissingType missing_type, bool default_left, + uint32_t threshold, const data_size_t* data_indices, + data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const override; + + data_size_t Split(uint32_t max_bin, uint32_t default_bin, + uint32_t most_freq_bin, MissingType missing_type, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const override; + + protected: + template + void ConstructHistogramInner(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const; + + template + void ConstructHistogramIntInner(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const; + + template + data_size_t SplitInner(uint32_t min_bin, uint32_t max_bin, + uint32_t default_bin, uint32_t most_freq_bin, + bool default_left, uint32_t threshold, + const data_size_t* data_indices, data_size_t cnt, + data_size_t* lte_indices, + data_size_t* gt_indices) const; + + virtual inline uint32_t GetBinAt(const data_size_t paired_data_index) const { + const data_size_t idx = this->get_unpaired_index(paired_data_index); + return 
this->unpaired_bin_->data(idx); + } +}; + +template class ITERATOR_TYPE> +class SparsePairwiseRankingBin: public PairwiseRankingBin, ITERATOR_TYPE> { + public: + SparsePairwiseRankingBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin): PairwiseRankingBin, ITERATOR_TYPE>(num_data, paired_ranking_item_index_map, unpaired_bin) {} +}; + +template +class DensePairwiseRankingFirstBin: public DensePairwiseRankingBin { + public: + DensePairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, DenseBin* unpaired_bin): DensePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return new PairwiseRankingFirstIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); + } + + private: + data_size_t get_unpaired_index(const data_size_t paired_index) const { + return this->paired_ranking_item_index_map_[paired_index].first; + } +}; + +template +class DensePairwiseRankingSecondBin: public DensePairwiseRankingBin { + public: + DensePairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, DenseBin* unpaired_bin): DensePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return new PairwiseRankingSecondIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); + } + + private: + data_size_t get_unpaired_index(const data_size_t paired_index) const { + return this->paired_ranking_item_index_map_[paired_index].second; + } +}; + +template +class DensePairwiseRankingDiffBin: public DensePairwiseRankingBin { + public: + DensePairwiseRankingDiffBin(data_size_t num_data, const std::pair* 
paired_ranking_item_index_map, DenseBin* unpaired_bin, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets): DensePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) { + diff_bin_mappers_ = diff_bin_mappers; + ori_bin_mappers_ = ori_bin_mappers; + bin_offsets_ = bin_offsets; + diff_bin_offsets_ = diff_bin_offsets; + } + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + int sub_feature_index = -1; + for (int i = 0; i < static_cast(bin_offsets_->size()); ++i) { + if (bin_offsets_->at(i) == min_bin) { + sub_feature_index = i; + break; + } + } + CHECK_GE(sub_feature_index, 0); + return new PairwiseRankingDiffIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin, ori_bin_mappers_->at(sub_feature_index).get(), diff_bin_mappers_->at(sub_feature_index).get()); + } + + private: + data_size_t get_unpaired_index(const data_size_t /*paired_index*/) const { + Log::Fatal("get_unpaired_index of DensePairwiseRankingDiffBin should not be called."); + } + + inline uint32_t GetBinAt(const data_size_t paired_data_index) const override; + + const std::vector* bin_offsets_; + const std::vector* diff_bin_offsets_; + const std::vector>* diff_bin_mappers_; + const std::vector>* ori_bin_mappers_; +}; + +template +class SparsePairwiseRankingFirstBin: public SparsePairwiseRankingBin { + public: + SparsePairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin): SparsePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return new PairwiseRankingFirstIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); + } + + private: + data_size_t 
get_unpaired_index(const data_size_t paired_index) const { + return this->paired_ranking_item_index_map_[paired_index].first; + } +}; + +template +class SparsePairwiseRankingSecondBin: public SparsePairwiseRankingBin { + public: + SparsePairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin): SparsePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + return new PairwiseRankingSecondIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); + } + + private: + data_size_t get_unpaired_index(const data_size_t paired_index) const { + return this->paired_ranking_item_index_map_[paired_index].second; + } +}; + +template +class SparsePairwiseRankingDiffBin: public SparsePairwiseRankingBin { + public: + SparsePairwiseRankingDiffBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, SparseBin* unpaired_bin, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets): SparsePairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) { + bin_offsets_ = bin_offsets; + diff_bin_offsets_ = diff_bin_offsets; + diff_bin_mappers_ = diff_bin_mappers; + ori_bin_mappers_ = ori_bin_mappers; + } + + BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { + int sub_feature_index = -1; + for (int i = 0; i < static_cast(bin_offsets_->size()); ++i) { + if (bin_offsets_->at(i) == min_bin) { + CHECK_GT(i, 0); + sub_feature_index = i; + break; + } + } + CHECK_GE(sub_feature_index, 0); + return new PairwiseRankingDiffIterator>(this->unpaired_bin_.get(), this->paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin, ori_bin_mappers_->at(sub_feature_index).get(), 
diff_bin_mappers_->at(sub_feature_index).get()); + } + + private: + data_size_t get_unpaired_index(const data_size_t /*paired_index*/) const { + Log::Fatal("get_unpaired_index of SparsePairwiseRankingDiffBin should not be called."); + } + + const std::vector* bin_offsets_; + const std::vector* diff_bin_offsets_; + const std::vector>* diff_bin_mappers_; + const std::vector>* ori_bin_mappers_; +}; + +template +class MultiValPairwiseBin : public MULTI_VAL_BIN_TYPE { + public: + +}; + + +} // namespace LightGBM + +#endif // LIGHTGBM_IO_PAIRWISE_LAMBDARANK_BIN_HPP_ diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp new file mode 100644 index 000000000000..8a02e31a8d56 --- /dev/null +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -0,0 +1,161 @@ +/*! + * Copyright (c) 2016 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#include +#include + +#include "pairwise_lambdarank_bin.hpp" + +namespace LightGBM { + +PairwiseRankingFeatureGroup::PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map): + FeatureGroup(other, num_original_data), paired_ranking_item_index_map_(paired_ranking_item_index_map), num_data_(num_pairs), is_first_or_second_in_pairing_(is_first_or_second_in_pairing) { + + CreateBinData(num_original_data, is_multi_val_, !is_sparse_, is_sparse_); + + Threading::For(0, num_original_data, 512, [this, &other] (int block_index, data_size_t block_start, data_size_t block_end) { + for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { + std::unique_ptr bin_iterator(other.SubFeatureIterator(feature_index)); + bin_iterator->Reset(block_start); + for (data_size_t index = block_start; index < block_end; ++index) { + PushBinData(block_index, feature_index, index, 
bin_iterator->Get(index)); + } + } + }); + + FinishLoad(); +} + +void PairwiseRankingFeatureGroup::CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { + CHECK(!is_multi_val); // do not support multi-value bin for now + if (is_multi_val) { + multi_bin_data_.clear(); + for (int i = 0; i < num_feature_; ++i) { + int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1; + // if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { + // if (is_first_or_second_in_pairing_ == 0) { + // multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingFirstBin( + // num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + // } else { + // multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingSecondBin( + // num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + // } + // } else { + if (is_first_or_second_in_pairing_ == 0) { + multi_bin_data_.emplace_back( + Bin::CreateDensePairwiseRankingFirstBin(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + } else { + multi_bin_data_.emplace_back( + Bin::CreateDensePairwiseRankingSecondBin(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); + } + // } + } + is_multi_val_ = true; + } else { + // if (force_sparse || + // (!force_dense && num_feature_ == 1 && + // bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { + // is_sparse_ = true; + // if (is_first_or_second_in_pairing_ == 0) { + // bin_data_.reset(Bin::CreateSparsePairwiseRankingFirstBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); + // } else { + // bin_data_.reset(Bin::CreateSparsePairwiseRankingSecondBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); + // } + // } else { + is_sparse_ = false; + if (is_first_or_second_in_pairing_ == 0) { + bin_data_.reset(Bin::CreateDensePairwiseRankingFirstBin(num_data, num_total_bin_, num_data_, 
paired_ranking_item_index_map_)); + } else { + bin_data_.reset(Bin::CreateDensePairwiseRankingSecondBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); + } + // } + is_multi_val_ = false; + } +} + +PairwiseRankingDifferentialFeatureGroup::PairwiseRankingDifferentialFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map, std::vector>& diff_feature_bin_mappers, std::vector>& ori_feature_bin_mappers): PairwiseRankingFeatureGroup(other, num_original_data, is_first_or_second_in_pairing, num_pairs, paired_ranking_item_index_map) { + for (auto& bin_mapper_ref : diff_feature_bin_mappers) { + diff_feature_bin_mappers_.emplace_back(bin_mapper_ref.release()); + } + for (auto& bin_mapper_ref : ori_feature_bin_mappers) { + ori_feature_bin_mappers_.emplace_back(bin_mapper_ref.release()); + } + + CreateBinData(num_original_data, is_multi_val_, !is_sparse_, is_sparse_); + + Threading::For(0, num_original_data, 512, [this, &other] (int block_index, data_size_t block_start, data_size_t block_end) { + for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { + std::unique_ptr bin_iterator(other.SubFeatureIterator(feature_index)); + bin_iterator->Reset(block_start); + for (data_size_t index = block_start; index < block_end; ++index) { + PushBinData(block_index, feature_index, index, bin_iterator->Get(index)); + } + } + }); + + FinishLoad(); + + // calculate diff bin offsets + const int offset = 1; + original_bin_offsets_ = bin_offsets_; + bin_offsets_.clear(); + num_total_bin_ = offset; + bin_offsets_.emplace_back(num_total_bin_); + for (int i = 0; i < num_feature_; ++i) { + auto num_bin = diff_feature_bin_mappers_[i]->num_bin(); + if (diff_feature_bin_mappers_[i]->GetMostFreqBin() == 0) { + num_bin -= offset; + } + num_total_bin_ += num_bin; + bin_offsets_.emplace_back(num_total_bin_); + } + + bin_mappers_.clear(); + for (const auto& 
bin_mapper : diff_feature_bin_mappers_) { + bin_mappers_.emplace_back(new BinMapper(*bin_mapper.get())); + } +} + +void PairwiseRankingDifferentialFeatureGroup::CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { + CHECK(!is_multi_val); // do not support multi-value bin for now + if (force_sparse || + (!force_dense && num_feature_ == 1 && + bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { + is_sparse_ = true; + bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_, &ori_feature_bin_mappers_, &original_bin_offsets_, &bin_offsets_)); + } else { + is_sparse_ = false; + bin_data_.reset(Bin::CreateDensePairwiseRankingDiffBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_, &diff_feature_bin_mappers_, &ori_feature_bin_mappers_, &original_bin_offsets_, &bin_offsets_)); + } + is_multi_val_ = false; +} + +inline BinIterator* PairwiseRankingDifferentialFeatureGroup::SubFeatureIterator(int sub_feature) const { + uint32_t most_freq_bin = ori_feature_bin_mappers_[sub_feature]->GetMostFreqBin(); + if (!is_multi_val_) { + uint32_t min_bin = original_bin_offsets_[sub_feature]; + uint32_t max_bin = original_bin_offsets_[sub_feature + 1] - 1; + return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin); + } else { + int addi = most_freq_bin == 0 ? 
0 : 1; + uint32_t min_bin = 1; + uint32_t max_bin = ori_feature_bin_mappers_[sub_feature]->num_bin() - 1 + addi; + return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, + most_freq_bin); + } +} + +inline BinIterator* PairwiseRankingDifferentialFeatureGroup::FeatureGroupIterator() { + if (is_multi_val_) { + return nullptr; + } + uint32_t min_bin = original_bin_offsets_[0]; + uint32_t max_bin = original_bin_offsets_.back() - 1; + uint32_t most_freq_bin = 0; + return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin); +} + +} // namespace LightGBM diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index 3ec26aba4d95..50dd3a3c980d 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -77,6 +77,7 @@ class SparseBin : public Bin { explicit SparseBin(data_size_t num_data) : num_data_(num_data) { int num_threads = OMP_NUM_THREADS(); push_buffers_.resize(num_threads); + Log::Warning("sparse bin is created !!!"); } ~SparseBin() {} @@ -744,9 +745,11 @@ class SparseBin : public Bin { void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { + Log::Warning("is sparse"); auto other_bin = dynamic_cast*>(full_bin); deltas_.clear(); vals_.clear(); + Log::Warning("is sparse"); data_size_t start = 0; if (num_used_indices > 0) { start = used_indices[0]; diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index ec7581e504c4..d5573533b65b 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -31,11 +31,14 @@ void MultiValBinWrapper::InitTrain(const std::vector& group_feature_start, if (multi_val_bin_ == nullptr) { return; } + Log::Warning("MultiValBinWrapper::InitTrain step 0"); CopyMultiValBinSubset(group_feature_start, feature_groups, is_feature_used, bagging_use_indices, bagging_indices_cnt); + Log::Warning("MultiValBinWrapper::InitTrain step 1"); const auto cur_multi_val_bin = (is_use_subcol_ || is_use_subrow_) ? 
multi_val_bin_subset_.get() : multi_val_bin_.get(); + Log::Warning("MultiValBinWrapper::InitTrain step 2"); if (cur_multi_val_bin != nullptr) { num_bin_ = cur_multi_val_bin->num_bin(); num_bin_aligned_ = (num_bin_ + kAlignedSize - 1) / kAlignedSize * kAlignedSize; @@ -44,6 +47,7 @@ void MultiValBinWrapper::InitTrain(const std::vector& group_feature_start, (num_element_per_row + kZeroThreshold)) + 1, 1024); min_block_size_ = std::max(min_block_size_, 32); } + Log::Warning("MultiValBinWrapper::InitTrain step 3"); } template @@ -227,6 +231,7 @@ void MultiValBinWrapper::CopyMultiValBinSubset( int num_used = 0; int total = 0; std::vector used_feature_index; + Log::Warning("CopyMultiValBinSubset step 0"); for (int i : feature_groups_contained_) { int f_start = group_feature_start[i]; if (feature_groups[i]->is_multi_val_) { @@ -259,8 +264,10 @@ void MultiValBinWrapper::CopyMultiValBinSubset( ++total; } } + Log::Warning("CopyMultiValBinSubset step 1"); const double k_subfeature_threshold = 0.6; if (sum_used_dense_ratio >= sum_dense_ratio * k_subfeature_threshold) { + Log::Warning("CopyMultiValBinSubset step 2"); // only need to copy subset if (is_use_subrow_ && !is_subrow_copied_) { if (multi_val_bin_subset_ == nullptr) { @@ -279,6 +286,7 @@ void MultiValBinWrapper::CopyMultiValBinSubset( is_subrow_copied_ = true; } } else { + Log::Warning("CopyMultiValBinSubset step 3"); is_use_subcol_ = true; std::vector upper_bound; std::vector lower_bound; @@ -292,9 +300,12 @@ void MultiValBinWrapper::CopyMultiValBinSubset( int num_total_bin = offset; int new_num_total_bin = offset; offsets.push_back(static_cast(new_num_total_bin)); + Log::Warning("CopyMultiValBinSubset step 3.1"); for (int i : feature_groups_contained_) { int f_start = group_feature_start[i]; + Log::Warning("CopyMultiValBinSubset step 3.2"); if (feature_groups[i]->is_multi_val_) { + Log::Warning("CopyMultiValBinSubset step 3.3"); for (int j = 0; j < feature_groups[i]->num_feature_; ++j) { const auto& bin_mapper = 
feature_groups[i]->bin_mappers_[j]; if (i == 0 && j == 0 && bin_mapper->GetMostFreqBin() > 0) { @@ -320,6 +331,7 @@ void MultiValBinWrapper::CopyMultiValBinSubset( } } } else { + Log::Warning("CopyMultiValBinSubset step 3.4"); bool is_group_used = false; for (int j = 0; j < feature_groups[i]->num_feature_; ++j) { if (is_feature_used[f_start + j]) { @@ -327,9 +339,12 @@ void MultiValBinWrapper::CopyMultiValBinSubset( break; } } + Log::Warning("CopyMultiValBinSubset step 3.5"); int cur_num_bin = feature_groups[i]->bin_offsets_.back() - offset; num_total_bin += cur_num_bin; + Log::Warning("CopyMultiValBinSubset step 3.6"); if (is_group_used) { + Log::Warning("CopyMultiValBinSubset step 3.7"); new_num_total_bin += cur_num_bin; offsets.push_back(static_cast(new_num_total_bin)); lower_bound.push_back(num_total_bin - cur_num_bin); @@ -345,16 +360,21 @@ void MultiValBinWrapper::CopyMultiValBinSubset( } } // avoid out of range + Log::Warning("CopyMultiValBinSubset step 3.8"); lower_bound.push_back(num_total_bin); upper_bound.push_back(num_total_bin); + Log::Warning("CopyMultiValBinSubset step 3.9"); data_size_t num_data = is_use_subrow_ ? 
bagging_indices_cnt : num_data_; if (multi_val_bin_subset_ == nullptr) { + Log::Warning("CopyMultiValBinSubset step 3.9.1"); multi_val_bin_subset_.reset(multi_val_bin_->CreateLike( num_data, new_num_total_bin, num_used, sum_used_dense_ratio, offsets)); } else { + Log::Warning("CopyMultiValBinSubset step 3.9.2"); multi_val_bin_subset_->ReSize(num_data, new_num_total_bin, num_used, sum_used_dense_ratio, offsets); } + Log::Warning("CopyMultiValBinSubset step 3.10"); if (is_use_subrow_) { multi_val_bin_subset_->CopySubrowAndSubcol( multi_val_bin_.get(), bagging_use_indices, @@ -367,6 +387,7 @@ void MultiValBinWrapper::CopyMultiValBinSubset( multi_val_bin_.get(), used_feature_index, lower_bound, upper_bound, delta); } } + Log::Warning("CopyMultiValBinSubset step 4"); } void TrainingShareStates::CalcBinOffsets(const std::vector>& feature_groups, @@ -374,6 +395,7 @@ void TrainingShareStates::CalcBinOffsets(const std::vectorclear(); feature_hist_offsets_.clear(); if (in_is_col_wise) { + // Log::Fatal("not supported 0"); uint32_t cur_num_bin = 0; uint32_t hist_cur_num_bin = 0; for (int group = 0; group < static_cast(feature_groups.size()); ++group) { @@ -438,9 +460,10 @@ void TrainingShareStates::CalcBinOffsets(const std::vector= - MultiValBin::multi_val_bin_sparse_threshold ? 1 : 0; + const bool is_sparse_row_wise = false; //(1.0f - sum_dense_ratio) >= + // MultiValBin::multi_val_bin_sparse_threshold ? 
1 : 0; if (is_sparse_row_wise) { + // Log::Fatal("not supported 1"); int cur_num_bin = 1; uint32_t hist_cur_num_bin = 1; for (int group = 0; group < static_cast(feature_groups.size()); ++group) { @@ -474,6 +497,7 @@ void TrainingShareStates::CalcBinOffsets(const std::vector(feature_groups.size()); ++group) { const std::unique_ptr& feature_group = feature_groups[group]; if (feature_group->is_multi_val_) { + Log::Fatal("not supported 2"); for (int i = 0; i < feature_group->num_feature_; ++i) { const std::unique_ptr& bin_mapper = feature_group->bin_mappers_[i]; if (group == 0 && i == 0 && bin_mapper->GetMostFreqBin() > 0) { diff --git a/src/main.cpp b/src/main.cpp index ecd8dd77ed02..b7f4ff0bdf9c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -41,4 +41,4 @@ int main(int argc, char** argv) { exit(-1); } -} +} \ No newline at end of file diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index e2adb8c082d4..f7e51dd33526 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -9,10 +9,12 @@ #include #include #include +#include #include #include #include +#include namespace LightGBM { @@ -26,6 +28,14 @@ class NDCGMetric:public Metric { DCGCalculator::DefaultLabelGain(&label_gain); // initialize DCG calculator DCGCalculator::Init(label_gain); + pairwise_scores_ = config.objective == std::string("pairwise_lambdarank"); + sigmoid_ = config.sigmoid; + truncation_level_ = config.lambdarank_truncation_level; + model_indirect_comparison_ = config.pairwise_lambdarank_model_indirect_comparison; + model_conditional_rel_ = config.pairwise_lambdarank_model_conditional_rel; + indirect_comparison_above_only_ = config.pairwise_lambdarank_indirect_comparison_above_only; + logarithmic_discounts_ = config.pairwise_lambdarank_logarithmic_discounts; + hard_pairwise_preference_ = config.pairwise_lambdarank_hard_pairwise_preference; } ~NDCGMetric() { @@ -34,7 +44,7 @@ class NDCGMetric:public Metric { for (auto k : eval_at_) { 
name_.emplace_back(std::string("ndcg@") + std::to_string(k)); } - num_data_ = num_data; + num_data_ = metadata.query_boundaries()[metadata.num_queries()]; // get label label_ = metadata.label(); num_queries_ = metadata.num_queries(); @@ -73,6 +83,36 @@ class NDCGMetric:public Metric { } } } + if (pairwise_scores_) { + paired_index_map_ = metadata.paired_ranking_item_index_map(); + scores_pointwise_.resize(num_data_, 0.0); + num_data_pairwise_ = metadata.pairwise_query_boundaries()[metadata.num_queries()]; + query_boundaries_pairwise_ = metadata.pairwise_query_boundaries(); + + right2left_map_byquery_.resize(num_queries_); + left2right_map_byquery_.resize(num_queries_); + left_right2pair_map_byquery_.resize(num_queries_); + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) + for (data_size_t q = 0; q < num_queries_; ++q) { + const data_size_t start_pairwise = query_boundaries_pairwise_[q]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[q + 1] - query_boundaries_pairwise_[q]; + std::multimap right2left_map_; + std::multimap < data_size_t, data_size_t> left2right_map_; + std::map, data_size_t> left_right2pair_map_; + for (data_size_t i = 0; i < cnt_pairwise; ++i) { + //data_size_t current_pair = selected_pairs[i]; + int index_left = paired_index_map_[i + start_pairwise].first; + int index_right = paired_index_map_[i + start_pairwise].second; + right2left_map_.insert(std::make_pair(index_right, index_left)); + left2right_map_.insert(std::make_pair(index_left, index_right)); + left_right2pair_map_.insert(std::make_pair(std::make_pair(index_left, index_right), i)); + } + right2left_map_byquery_[q] = right2left_map_; + left2right_map_byquery_[q] = left2right_map_; + left_right2pair_map_byquery_[q] = left_right2pair_map_; + } + } + sigmoid_cache_.Init(sigmoid_); } const std::vector& GetName() const override { @@ -101,9 +141,21 @@ class NDCGMetric:public Metric { result_buffer_[tid][j] += 1.0f; } } else { + if (pairwise_scores_) { + 
const data_size_t start_pointwise = query_boundaries_[i]; + const data_size_t cnt_pointwise = query_boundaries_[i + 1] - query_boundaries_[i]; + const data_size_t start_pairwise = query_boundaries_pairwise_[i]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[i + 1] - query_boundaries_pairwise_[i]; + std::vector all_pairs(cnt_pairwise); + std::iota(all_pairs.begin(), all_pairs.end(), 0); + UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), + paired_index_map_ + start_pairwise, right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], truncation_level_, + sigmoid_, sigmoid_cache_, model_indirect_comparison_, model_conditional_rel_, indirect_comparison_above_only_, logarithmic_discounts_, hard_pairwise_preference_); + } + // calculate DCG DCGCalculator::CalDCG(eval_at_, label_ + query_boundaries_[i], - score + query_boundaries_[i], + (pairwise_scores_? 
scores_pointwise_.data(): score) + query_boundaries_[i], query_boundaries_[i + 1] - query_boundaries_[i], &tmp_dcg); // calculate NDCG for (size_t j = 0; j < eval_at_.size(); ++j) { @@ -121,9 +173,20 @@ class NDCGMetric:public Metric { result_buffer_[tid][j] += 1.0f; } } else { + if (pairwise_scores_) { + const data_size_t start_pointwise = query_boundaries_[i]; + const data_size_t cnt_pointwise = query_boundaries_[i + 1] - query_boundaries_[i]; + const data_size_t start_pairwise = query_boundaries_pairwise_[i]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[i + 1] - query_boundaries_pairwise_[i]; + std::vector all_pairs(cnt_pairwise); + std::iota(all_pairs.begin(), all_pairs.end(), 0); + UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), + paired_index_map_ + start_pairwise, right2left_map_byquery_[i], left2right_map_byquery_[i], left_right2pair_map_byquery_[i], truncation_level_, + sigmoid_, sigmoid_cache_, model_indirect_comparison_, model_conditional_rel_, indirect_comparison_above_only_, logarithmic_discounts_, hard_pairwise_preference_); + } // calculate DCG DCGCalculator::CalDCG(eval_at_, label_ + query_boundaries_[i], - score + query_boundaries_[i], + (pairwise_scores_ ? scores_pointwise_.data() : score) + query_boundaries_[i], query_boundaries_[i + 1] - query_boundaries_[i], &tmp_dcg); // calculate NDCG for (size_t j = 0; j < eval_at_.size(); ++j) { @@ -162,6 +225,24 @@ class NDCGMetric:public Metric { std::vector eval_at_; /*! \brief Cache the inverse max dcg for all queries */ std::vector> inverse_max_dcgs_; + bool pairwise_scores_; + double sigmoid_; + CommonC::SigmoidCache sigmoid_cache_; + /*! 
\brief Truncation position for max DCG */ + int truncation_level_; + mutable std::vector scores_pointwise_; + const std::pair* paired_index_map_; + std::vector> right2left_map_byquery_; + std::vector> left2right_map_byquery_; + std::vector, data_size_t>> left_right2pair_map_byquery_; + /*! \brief Number of data */ + data_size_t num_data_pairwise_; + const data_size_t* query_boundaries_pairwise_; + bool model_indirect_comparison_; + bool model_conditional_rel_; + bool indirect_comparison_above_only_; + bool logarithmic_discounts_; + bool hard_pairwise_preference_; }; } // namespace LightGBM diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index a203017cf36e..508a59fe0364 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -44,6 +44,9 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new CUDAMulticlassSoftmax(config); } else if (type == std::string("multiclassova")) { return new CUDAMulticlassOVA(config); + } else if (type == std::string("pairwise_lambdarank")) { + Log::Warning("Objective pairwise_lambdarank is not implemented in cuda version. Fall back to boosting on CPU."); + return new PairwiseLambdarankNDCG(config); } else if (type == std::string("cross_entropy")) { Log::Warning("Objective cross_entropy is not implemented in cuda version. 
Fall back to boosting on CPU."); return new CrossEntropy(config); @@ -81,6 +84,8 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new BinaryLogloss(config); } else if (type == std::string("lambdarank")) { return new LambdarankNDCG(config); + } else if (type == std::string("pairwise_lambdarank")) { + return new PairwiseLambdarankNDCG(config); } else if (type == std::string("rank_xendcg")) { return new RankXENDCG(config); } else if (type == std::string("multiclass")) { @@ -126,6 +131,8 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new BinaryLogloss(strs); } else if (type == std::string("lambdarank")) { return new LambdarankNDCG(strs); + } else if (type == std::string("pairwise_lambdarank")) { + return new PairwiseLambdarankNDCG(strs); } else if (type == std::string("rank_xendcg")) { return new RankXENDCG(strs); } else if (type == std::string("multiclass")) { diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index ae3b74651759..878b4e8234e3 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -6,6 +6,12 @@ #ifndef LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ +//#define model_indirect_comparison_ false +//#define model_conditional_rel_ true +//#define indirect_comparison_above_only_ true +//#define logarithmic_discounts_ true +//#define hard_pairwise_preference_ false + #include #include @@ -15,10 +21,109 @@ #include #include #include +#include #include +#include namespace LightGBM { + void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointwise, const double* score_pairwise, data_size_t cnt_pointwise, + data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map, + const std::multimap& right2left_map, const std::multimap < data_size_t, data_size_t>& left2right_map, + const std::map, data_size_t>& 
left_right2pair_map, + int truncation_level, double sigma, const CommonC::SigmoidCache& sigmoid_cache, bool model_indirect_comparison, bool model_conditional_rel, + bool indirect_comparison_above_only, bool logarithmic_discounts, bool hard_pairwise_preference) { + // get sorted indices for scores + global_timer.Start("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 0"); + std::vector sorted_idx(cnt_pointwise); + for (data_size_t i = 0; i < cnt_pointwise; ++i) { + sorted_idx[i] = i; + } + std::stable_sort( + sorted_idx.begin(), sorted_idx.end(), + [score_pointwise](data_size_t a, data_size_t b) { return score_pointwise[a] > score_pointwise[b]; }); + // get ranks when sorted by scores + std::vector ranks(cnt_pointwise); + for (int i = 0; i < cnt_pointwise; i++) { + ranks[sorted_idx.at(i)] = i; + } + global_timer.Stop("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 0"); + global_timer.Start("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 1"); + std::vector gradients(cnt_pointwise); + std::vector hessians(cnt_pointwise); + for (data_size_t i = 0; i < selected_pairs_cnt; i++) { + data_size_t current_pair = selected_pairs[i]; + int indexLeft = paired_index_map[current_pair].first; + int indexRight = paired_index_map[current_pair].second; + if (ranks[indexLeft] >= truncation_level && ranks[indexRight] >= truncation_level) { continue; } + + double delta_score = score_pairwise[current_pair]; + int comparisons = 1; + data_size_t current_pair_inverse = -1; + if (left_right2pair_map.count(std::make_pair(indexRight, indexLeft)) > 0) { + current_pair_inverse = left_right2pair_map.at(std::make_pair(indexRight, indexLeft)); + delta_score -= score_pairwise[current_pair_inverse]; + comparisons++; + } + if (model_indirect_comparison) { + auto indexHead_range = right2left_map.equal_range(indexLeft); + for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { + data_size_t indexHead = 
indexHead_it->second; + if (left_right2pair_map.count(std::make_pair(indexHead, indexRight)) > 0 && + (!(indirect_comparison_above_only || model_conditional_rel) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { + data_size_t indexHeadLeft = left_right2pair_map.at(std::make_pair(indexHead, indexLeft)); + data_size_t indexHeadRight = left_right2pair_map.at(std::make_pair(indexHead, indexRight)); + delta_score += score_pairwise[indexHeadRight] - score_pairwise[indexHeadLeft]; + comparisons++; + } + } + auto indexTail_range = left2right_map.equal_range(indexLeft); + for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { + data_size_t indexTail = indexTail_it->second; + if (left_right2pair_map.count(std::make_pair(indexRight, indexTail)) > 0 && + (!indirect_comparison_above_only || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && + (!model_conditional_rel || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { + data_size_t indexLeftTail = left_right2pair_map.at(std::make_pair(indexLeft, indexTail)); + data_size_t indexRightTail = left_right2pair_map.at(std::make_pair(indexRight, indexTail)); + delta_score += score_pairwise[indexLeftTail] - score_pairwise[indexRightTail]; + comparisons++; + } + } + } + double delta_score_pointwise = score_pointwise[indexLeft] - score_pointwise[indexRight]; + if (delta_score_pointwise == kMinScore || -delta_score_pointwise == kMinScore || delta_score == kMinScore || -delta_score == kMinScore) { continue; } + delta_score /= comparisons; + // get discount of this pair + double paired_discount = logarithmic_discounts ? 
fabs(DCGCalculator::GetDiscount(ranks[indexRight]) - DCGCalculator::GetDiscount(ranks[indexLeft])) : 1.0; + //double p_lr_pairwise = 1.0f / (1.0f + std::exp(-delta_score * sigma)); + double p_lr_pairwise = sigmoid_cache.compute(-delta_score); + double p_rl_pairwise = 1.0 - p_lr_pairwise; + //double p_lr_pointwise = 1.0f / (1.0f + std::exp(-delta_score_pointwise * sigma)); + double p_lr_pointwise = sigmoid_cache.compute(-delta_score_pointwise); + double p_rl_pointwise = 1.0 - p_lr_pointwise; + + if (hard_pairwise_preference) { + paired_discount *= std::abs(0.5 - p_lr_pairwise); + p_lr_pairwise = p_lr_pairwise >= 0.5 ? 1.0 : 0.0; + p_rl_pairwise = 1.0 - p_lr_pairwise; + } + + gradients[indexLeft] += sigma * paired_discount * (p_rl_pointwise - p_rl_pairwise); + hessians[indexLeft] += sigma * sigma * paired_discount * p_rl_pointwise * p_lr_pointwise; + gradients[indexRight] -= sigma * paired_discount * (p_rl_pointwise - p_rl_pairwise); + hessians[indexRight] += sigma * sigma * paired_discount * p_rl_pointwise * p_lr_pointwise; + } + global_timer.Stop("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 1"); + global_timer.Start("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 2"); + for (data_size_t i = 0; i < cnt_pointwise; i++) { + double delta = 0.3 * gradients[i] / (std::abs(hessians[i]) + 0.001); + delta = std::min(delta, 0.3); + delta = std::max(delta, -0.3); + score_pointwise[i] += delta; + } + global_timer.Stop("pairwise_lambdarank::UpdatePointwiseScoresForOneQuery part 2"); + } + /*! 
* \brief Objective function for Ranking */ @@ -56,19 +161,21 @@ class RankingObjective : public ObjectiveFunction { pos_biases_.resize(num_position_ids_, 0.0); } - void GetGradients(const double* score, score_t* gradients, - score_t* hessians) const override { + void GetGradients(const double* score, const data_size_t num_sampled_queries, const data_size_t* sampled_query_indices, + score_t* gradients, score_t* hessians) const override { + const data_size_t num_queries = (sampled_query_indices == nullptr ? num_queries_ : num_sampled_queries); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) - for (data_size_t i = 0; i < num_queries_; ++i) { - const data_size_t start = query_boundaries_[i]; - const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i]; + for (data_size_t i = 0; i < num_queries; ++i) { + const data_size_t query_index = (sampled_query_indices == nullptr ? i : sampled_query_indices[i]); + const data_size_t start = query_boundaries_[query_index]; + const data_size_t cnt = query_boundaries_[query_index + 1] - query_boundaries_[query_index]; std::vector score_adjusted; if (num_position_ids_ > 0) { for (data_size_t j = 0; j < cnt; ++j) { score_adjusted.push_back(score[start + j] + pos_biases_[positions_[start + j]]); } } - GetGradientsForOneQuery(i, cnt, label_ + start, num_position_ids_ > 0 ? score_adjusted.data() : score + start, + GetGradientsForOneQuery(query_index, cnt, label_ + start, num_position_ids_ > 0 ? 
score_adjusted.data() : score + start, gradients + start, hessians + start); if (weights_ != nullptr) { for (data_size_t j = 0; j < cnt; ++j) { @@ -84,6 +191,10 @@ class RankingObjective : public ObjectiveFunction { } } + void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { + GetGradients(score, num_queries_, nullptr, gradients, hessians); + } + virtual void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt, const label_t* label, const double* score, score_t* lambdas, @@ -169,6 +280,7 @@ class LambdarankNDCG : public RankingObjective { } // construct Sigmoid table to speed up Sigmoid transform ConstructSigmoidTable(); + sigmoid_cache_.Init(sigmoid_); } inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt, @@ -363,6 +475,7 @@ class LambdarankNDCG : public RankingObjective { double max_sigmoid_input_ = 50; /*! \brief Factor that covert score to bin in Sigmoid table */ double sigmoid_table_idx_factor_; + CommonC::SigmoidCache sigmoid_cache_; }; /*! 
@@ -449,5 +562,321 @@ class RankXENDCG : public RankingObjective { mutable std::vector rands_; }; + +class PairwiseLambdarankNDCG: public LambdarankNDCG { + public: + explicit PairwiseLambdarankNDCG(const Config& config): LambdarankNDCG(config) { + model_indirect_comparison_ = config.pairwise_lambdarank_model_indirect_comparison; + model_conditional_rel_ = config.pairwise_lambdarank_model_conditional_rel; + indirect_comparison_above_only_ = config.pairwise_lambdarank_indirect_comparison_above_only; + logarithmic_discounts_ = config.pairwise_lambdarank_logarithmic_discounts; + hard_pairwise_preference_ = config.pairwise_lambdarank_hard_pairwise_preference; + } + + explicit PairwiseLambdarankNDCG(const std::vector& strs): LambdarankNDCG(strs) {} + + ~PairwiseLambdarankNDCG() {} + + void Init(const Metadata& metadata, data_size_t num_data_pairwise) override { + data_size_t num_data_pointwise = metadata.query_boundaries()[metadata.num_queries()]; + Log::Info("!!! num_data_pointwise %d", num_data_pointwise); + LambdarankNDCG::Init(metadata, num_data_pointwise); + num_data_pairwise_ = num_data_pairwise; + query_boundaries_pairwise_ = metadata.pairwise_query_boundaries(); + if (query_boundaries_pairwise_ == nullptr) { + Log::Fatal("Ranking tasks require query information"); + } + paired_index_map_ = metadata.paired_ranking_item_index_map(); + scores_pointwise_.resize(num_data_pointwise, 0.0); + + right2left_map_byquery_.resize(num_queries_); + left2right_map_byquery_.resize(num_queries_); + left_right2pair_map_byquery_.resize(num_queries_); + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) + for (data_size_t q = 0; q < num_queries_; ++q) { + const data_size_t start_pairwise = query_boundaries_pairwise_[q]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[q + 1] - query_boundaries_pairwise_[q]; + std::multimap right2left_map_; + std::multimap < data_size_t, data_size_t> left2right_map_; + std::map, data_size_t> left_right2pair_map_; + 
for (data_size_t i = 0; i < cnt_pairwise; ++i) { + //data_size_t current_pair = selected_pairs[i]; + int index_left = paired_index_map_[i + start_pairwise].first; + int index_right = paired_index_map_[i + start_pairwise].second; + right2left_map_.insert(std::make_pair(index_right, index_left)); + left2right_map_.insert(std::make_pair(index_left, index_right)); + left_right2pair_map_.insert(std::make_pair(std::make_pair(index_left, index_right), i)); + } + right2left_map_byquery_[q] = right2left_map_; + left2right_map_byquery_[q] = left2right_map_; + left_right2pair_map_byquery_[q] = left_right2pair_map_; + } + } + + void GetGradients(const double* score_pairwise, const data_size_t num_sampled_queries, const data_size_t* sampled_query_indices, + score_t* gradients_pairwise, score_t* hessians_pairwise) const override { + const data_size_t num_queries = (sampled_query_indices == nullptr ? num_queries_ : num_sampled_queries); + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) + for (data_size_t i = 0; i < num_queries; ++i) { + global_timer.Start("pairwise_lambdarank::GetGradients part 0"); + const data_size_t query_index = (sampled_query_indices == nullptr ? 
i : sampled_query_indices[i]); + const data_size_t start_pointwise = query_boundaries_[query_index]; + const data_size_t cnt_pointwise = query_boundaries_[query_index + 1] - query_boundaries_[query_index]; + const data_size_t start_pairwise = query_boundaries_pairwise_[query_index]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[query_index + 1] - query_boundaries_pairwise_[query_index]; + std::vector score_adjusted_pairwise; + if (num_position_ids_ > 0) { + for (data_size_t j = 0; j < cnt_pairwise; ++j) { + score_adjusted_pairwise.push_back(score_pairwise[start_pairwise + j] + pos_biases_[positions_[start_pointwise + paired_index_map_[start_pairwise + j].first]] - + pos_biases_[positions_[start_pointwise + paired_index_map_[start_pairwise + j].second]]); + } + } + global_timer.Stop("pairwise_lambdarank::GetGradients part 0"); + global_timer.Start("pairwise_lambdarank::GetGradients part 1"); + GetGradientsForOneQuery(query_index, cnt_pointwise, cnt_pairwise, label_ + start_pointwise, scores_pointwise_.data() + start_pointwise, num_position_ids_ > 0 ? 
score_adjusted_pairwise.data() : score_pairwise + start_pairwise, + right2left_map_byquery_[query_index], left2right_map_byquery_[query_index], left_right2pair_map_byquery_[query_index], + gradients_pairwise + start_pairwise, hessians_pairwise + start_pairwise); + std::vector all_pairs(cnt_pairwise); + std::iota(all_pairs.begin(), all_pairs.end(), 0); + global_timer.Stop("pairwise_lambdarank::GetGradients part 1"); + global_timer.Start("pairwise_lambdarank::GetGradients part 2"); + UpdatePointwiseScoresForOneQuery(i, scores_pointwise_.data() + start_pointwise, score_pairwise + start_pairwise, cnt_pointwise, cnt_pairwise, all_pairs.data(), + paired_index_map_ + start_pairwise, right2left_map_byquery_[query_index], left2right_map_byquery_[query_index], left_right2pair_map_byquery_[query_index], truncation_level_, sigmoid_, sigmoid_cache_, + model_indirect_comparison_, model_conditional_rel_, indirect_comparison_above_only_, logarithmic_discounts_, hard_pairwise_preference_); + global_timer.Stop("pairwise_lambdarank::GetGradients part 2"); + } + if (num_position_ids_ > 0) { + std::vector gradients_pointwise(num_data_); + std::vector hessians_pointwise(num_data_); + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) + for (data_size_t i = 0; i < num_queries_; ++i) { + const data_size_t query_index = (sampled_query_indices == nullptr ? 
i : sampled_query_indices[i]); + const data_size_t cnt_pointwise = query_boundaries_[query_index + 1] - query_boundaries_[query_index]; + const data_size_t cnt_pairwise = query_boundaries_pairwise_[query_index + 1] - query_boundaries_pairwise_[query_index]; + TransformGradientsPairwiseIntoPointwiseForOneQuery(query_index, cnt_pointwise, cnt_pairwise, gradients_pairwise, hessians_pairwise, gradients_pointwise.data(), hessians_pointwise.data()); + } + UpdatePositionBiasFactors(gradients_pointwise.data(), hessians_pointwise.data()); + } + } + + inline void TransformGradientsPairwiseIntoPointwiseForOneQuery(data_size_t query_id, data_size_t cnt_pointwise, data_size_t cnt, + const score_t* gradients, const score_t* hessians, score_t* gradients_pointwise, score_t* hessians_pointwise) const { + // initialize with zero + for (data_size_t i = 0; i < cnt_pointwise; ++i) { + gradients_pointwise[i] = 0.0f; + hessians_pointwise[i] = 0.0f; + } + const data_size_t start = query_boundaries_[query_id]; + for (data_size_t i = 0; i < cnt; i++) { + int indexLeft = paired_index_map_[i + start].first; + int indexRight = paired_index_map_[i + start].second; + gradients_pointwise[indexLeft] += gradients[i]; + gradients_pointwise[indexRight] -= gradients[i]; + hessians_pointwise[indexLeft] += hessians[i]; + hessians_pointwise[indexRight] += hessians[i]; + } + } + + + inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt_pointwise, data_size_t cnt_pairwise, + const label_t* label, const double* score_pointwise, const double* score_pairwise, + const std::multimap& right2left_map, const std::multimap < data_size_t, data_size_t>& left2right_map, + const std::map, data_size_t>& left_right2pair_map, + score_t* lambdas_pairwise, + score_t* hessians_pairwise) const { + + const data_size_t start_pointwise = query_boundaries_[query_id]; + const data_size_t start_pairwise = query_boundaries_pairwise_[query_id]; + + // get max DCG on current query + const double inverse_max_dcg = 
inverse_max_dcgs_[query_id]; + // initialize with zero + for (data_size_t i = 0; i < cnt_pairwise; ++i) { + lambdas_pairwise[i] = 0.0f; + hessians_pairwise[i] = 0.0f; + } + // get sorted indices for scores + std::vector sorted_idx(cnt_pointwise); + for (data_size_t i = 0; i < cnt_pointwise; ++i) { + sorted_idx[i] = i; + } + std::stable_sort( + sorted_idx.begin(), sorted_idx.end(), + [score_pointwise](data_size_t a, data_size_t b) { return score_pointwise[a] > score_pointwise[b]; }); + // get ranks when sorted by scores + std::vector ranks(cnt_pointwise); + for (int i = 0; i < cnt_pointwise; i++) { + ranks[sorted_idx.at(i)] = i; + } + // get best and worst score + const double best_score = score_pointwise[sorted_idx[0]]; + data_size_t worst_idx = cnt_pointwise - 1; + if (worst_idx > 0 && score_pointwise[sorted_idx[worst_idx]] == kMinScore) { + worst_idx -= 1; + } + const double worst_score = score_pointwise[sorted_idx[worst_idx]]; + double sum_lambdas = 0.0; + // start accmulate lambdas by pairs + for (data_size_t i = 0; i < cnt_pairwise; i++) { + int indexLeft = paired_index_map_[i + start_pairwise].first; + int indexRight = paired_index_map_[i + start_pairwise].second; + + if (label[indexLeft] <= label[indexRight] || (ranks[indexLeft] >= truncation_level_ && ranks[indexRight] >= truncation_level_)) { + continue; + } + + const data_size_t high = indexLeft; + const data_size_t low = indexRight; + const data_size_t high_rank = ranks[high]; + const data_size_t low_rank = ranks[low]; + const int high_label = static_cast(label[high]); + const double high_label_gain = label_gain_[high_label]; + const double high_discount = DCGCalculator::GetDiscount(high_rank); + const int low_label = static_cast(label[low]); + const double low_label_gain = label_gain_[low_label]; + const double low_discount = DCGCalculator::GetDiscount(low_rank); + double delta_score = score_pairwise[i]; + int comparisons = 1; + + data_size_t i_inverse = -1; + if 
(left_right2pair_map.count(std::make_pair(indexRight, indexLeft)) > 0) { + i_inverse = left_right2pair_map.at(std::make_pair(indexRight, indexLeft)); + delta_score -= score_pairwise[i_inverse]; + comparisons++; + } + if (model_indirect_comparison_) { + auto indexHead_range = right2left_map.equal_range(indexLeft); + for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { + data_size_t indexHead = indexHead_it->second; + if (left_right2pair_map.count(std::make_pair(indexHead, indexRight)) > 0 && + (!(indirect_comparison_above_only_ || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { + data_size_t indexHeadLeft = left_right2pair_map.at(std::make_pair(indexHead, indexLeft)); + data_size_t indexHeadRight = left_right2pair_map.at(std::make_pair(indexHead, indexRight)); + delta_score += score_pairwise[indexHeadRight] - score_pairwise[indexHeadLeft]; + comparisons++; + } + } + auto indexTail_range = left2right_map.equal_range(indexLeft); + for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { + data_size_t indexTail = indexTail_it->second; + if (left_right2pair_map.count(std::make_pair(indexRight, indexTail)) > 0 && + (!indirect_comparison_above_only_ || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && + (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { + data_size_t indexLeftTail = left_right2pair_map.at(std::make_pair(indexLeft, indexTail)); + data_size_t indexRightTail = left_right2pair_map.at(std::make_pair(indexRight, indexTail)); + delta_score += score_pairwise[indexLeftTail] - score_pairwise[indexRightTail]; + comparisons++; + } + } + } + + if (delta_score == kMinScore || -delta_score == kMinScore) { continue; } + delta_score /= comparisons; + + // get dcg gap + const double dcg_gap = high_label_gain - 
low_label_gain; + // get discount of this pair + const double paired_discount = fabs(high_discount - low_discount); + // get delta NDCG + double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; + // regularize the delta_pair_NDCG by score distance + if (norm_ && best_score != worst_score) { + delta_pair_NDCG /= (0.01f + fabs(delta_score)); + } + // calculate lambda for this pair + double p_lambda = GetSigmoid(delta_score); + double p_hessian = p_lambda * (1.0f - p_lambda); + // update + p_lambda *= -sigmoid_ * delta_pair_NDCG; + p_hessian *= sigmoid_ * sigmoid_ * delta_pair_NDCG; + if (weights_ != nullptr) { + p_lambda *= weights_[start_pointwise + high] * weights_[start_pointwise + low]; + p_hessian *= weights_[start_pointwise + high] * weights_[start_pointwise + low]; + } + lambdas_pairwise[i] += static_cast(p_lambda / comparisons); + hessians_pairwise[i] += static_cast(p_hessian / comparisons); + if (i_inverse >= 0) { + lambdas_pairwise[i_inverse] -= static_cast(p_lambda / comparisons); + hessians_pairwise[i_inverse] += static_cast(p_hessian / comparisons); + } + if (model_indirect_comparison_) { + auto indexHead_range = right2left_map.equal_range(indexLeft); + for (auto indexHead_it = indexHead_range.first; indexHead_it != indexHead_range.second; indexHead_it++) { + data_size_t indexHead = indexHead_it->second; + if (left_right2pair_map.count(std::make_pair(indexHead, indexRight)) > 0 && + (!(indirect_comparison_above_only_ || model_conditional_rel_) || (ranks[indexHead] < ranks[indexLeft] && ranks[indexHead] < ranks[indexRight]))) { + data_size_t indexHeadLeft = left_right2pair_map.at(std::make_pair(indexHead, indexLeft)); + data_size_t indexHeadRight = left_right2pair_map.at(std::make_pair(indexHead, indexRight)); + lambdas_pairwise[indexHeadRight] += static_cast(p_lambda / comparisons); + hessians_pairwise[indexHeadRight] += static_cast(p_hessian / comparisons); + lambdas_pairwise[indexHeadLeft] -= static_cast(p_lambda / comparisons); + 
hessians_pairwise[indexHeadLeft] += static_cast(p_hessian / comparisons); + } + } + auto indexTail_range = left2right_map.equal_range(indexLeft); + for (auto indexTail_it = indexTail_range.first; indexTail_it != indexTail_range.second; indexTail_it++) { + data_size_t indexTail = indexTail_it->second; + if (left_right2pair_map.count(std::make_pair(indexRight, indexTail)) > 0 && + (!indirect_comparison_above_only_ || (ranks[indexTail] < ranks[indexLeft] && ranks[indexTail] < ranks[indexRight])) && + (!model_conditional_rel_ || (ranks[indexTail] > ranks[indexLeft] && ranks[indexTail] > ranks[indexRight]))) { + data_size_t indexLeftTail = left_right2pair_map.at(std::make_pair(indexLeft, indexTail)); + data_size_t indexRightTail = left_right2pair_map.at(std::make_pair(indexRight, indexTail)); + lambdas_pairwise[indexLeftTail] += static_cast(p_lambda / comparisons); + hessians_pairwise[indexLeftTail] += static_cast(p_hessian / comparisons); + lambdas_pairwise[indexRightTail] -= static_cast(p_lambda / comparisons); + hessians_pairwise[indexRightTail] += static_cast(p_hessian / comparisons); + } + } + } + // lambda is negative, so use minus to accumulate + sum_lambdas -= 2 * p_lambda; + } + + if (norm_ && sum_lambdas > 0) { + double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; + for (data_size_t i = 0; i < cnt_pairwise; ++i) { + lambdas_pairwise[i] = static_cast(lambdas_pairwise[i] * norm_factor); + hessians_pairwise[i] = static_cast(hessians_pairwise[i] * norm_factor); + } + } + } + + inline double GetSigmoid(double score) const { + if (score <= min_sigmoid_input_) { + // too small, use lower bound + return sigmoid_table_[0]; + } + else if (score >= max_sigmoid_input_) { + // too large, use upper bound + return sigmoid_table_[_sigmoid_bins - 1]; + } + else { + return sigmoid_table_[static_cast((score - min_sigmoid_input_) * + sigmoid_table_idx_factor_)]; + } + } + + const char* GetName() const override { return "pairwise_lambdarank"; } + + protected: + /*! 
\brief Query boundaries for pairwise data instances */ + const data_size_t* query_boundaries_pairwise_; + /*! \brief Number of pairwise data */ + data_size_t num_data_pairwise_; + mutable std::vector scores_pointwise_; + bool model_indirect_comparison_; + bool model_conditional_rel_; + bool indirect_comparison_above_only_; + bool logarithmic_discounts_; + bool hard_pairwise_preference_; + + private: + const std::pair* paired_index_map_; + std::vector> right2left_map_byquery_; + std::vector> left2right_map_byquery_; + std::vector, data_size_t>> left_right2pair_map_byquery_; +}; + + } // namespace LightGBM #endif // LightGBM_OBJECTIVE_RANK_OBJECTIVE_HPP_ diff --git a/src/treelearner/col_sampler.hpp b/src/treelearner/col_sampler.hpp index c70b07e50efa..7ac646ed6bb4 100644 --- a/src/treelearner/col_sampler.hpp +++ b/src/treelearner/col_sampler.hpp @@ -89,6 +89,7 @@ class ColSampler { } std::vector GetByNode(const Tree* tree, int leaf) { + // Log::Warning("GetByNode step 0"); // get interaction constraints for current branch std::unordered_set allowed_features; if (!interaction_constraints_.empty()) { @@ -110,6 +111,7 @@ class ColSampler { } } + // Log::Warning("GetByNode step 1"); std::vector ret(train_data_->num_features(), 0); if (fraction_bynode_ >= 1.0f) { if (interaction_constraints_.empty()) { @@ -124,6 +126,7 @@ class ColSampler { return ret; } } + // Log::Warning("GetByNode step 2"); if (need_reset_bytree_) { auto used_feature_cnt = GetCnt(used_feature_indices_.size(), fraction_bynode_); std::vector* allowed_used_feature_indices; @@ -175,6 +178,7 @@ class ColSampler { ret[inner_feature_index] = 1; } } + // Log::Warning("GetByNode step 3"); return ret; } diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 70dd0fb5436f..8eb458c83680 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -20,6 +20,8 @@ #include "monotone_constraints.hpp" #include "split_info.hpp" +#include + 
namespace LightGBM { class FeatureMetainfo { @@ -1501,6 +1503,7 @@ class HistogramPool { } OMP_THROW_EX(); } + offsets_ = offsets; } void ResetConfig(const Dataset* train_data, const Config* config) { @@ -1522,6 +1525,18 @@ class HistogramPool { } } + void DumpContent() const { + std::ofstream fout("historam_wise.txt"); + int cur_offsets_ptr = 0; + for (int i = 0; i < data_[0].size() / 2; ++i) { + if (i == offsets_[cur_offsets_ptr]) { + fout << "offset " << cur_offsets_ptr << " " << offsets_[cur_offsets_ptr] << " " << feature_metas_[cur_offsets_ptr].num_bin << " " << static_cast(feature_metas_[cur_offsets_ptr].offset) << std::endl; + ++cur_offsets_ptr; + } + fout << i << " " << data_[0][2 * i] << " " << data_[0][2 * i + 1] << std::endl; + } + } + /*! * \brief Get data for the specific index * \param idx which index want to get @@ -1591,6 +1606,7 @@ class HistogramPool { std::vector inverse_mapper_; std::vector last_used_time_; int cur_time_ = 0; + std::vector offsets_; }; } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index f3a88bd18679..2d284732580b 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -68,7 +68,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian GetShareStates(train_data_, is_constant_hessian, true); histogram_pool_.DynamicChangeSize(train_data_, - share_state_->num_hist_total_bin(), + share_state_->num_hist_total_bin() * 2, share_state_->feature_hist_offsets(), config_, max_cache_size, config_->num_leaves); Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_); @@ -86,12 +86,12 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, share_state_.reset(dataset->GetShareStates( reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr, col_sampler_.is_feature_used_bytree(), is_constant_hessian, - 
config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins)); + config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_lambdarank"))); } else { share_state_.reset(dataset->GetShareStates( ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(), is_constant_hessian, - config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins)); + config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_lambdarank"))); } } else { CHECK_NOTNULL(share_state_); @@ -100,12 +100,12 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, share_state_.reset(dataset->GetShareStates( reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr, col_sampler_.is_feature_used_bytree(), is_constant_hessian, - share_state_->is_col_wise, !share_state_->is_col_wise, config_->num_grad_quant_bins)); + share_state_->is_col_wise, !share_state_->is_col_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_lambdarank"))); } else { share_state_.reset(dataset->GetShareStates( ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(), is_constant_hessian, share_state_->is_col_wise, - !share_state_->is_col_wise, config_->num_grad_quant_bins)); + !share_state_->is_col_wise, config_->num_grad_quant_bins, config_->objective == std::string("pairwise_lambdarank"))); } } CHECK_NOTNULL(share_state_); @@ -126,7 +126,7 @@ void SerialTreeLearner::ResetTrainingDataInner(const Dataset* train_data, data_partition_->ResetNumData(num_data_); if (reset_multi_val_bin) { col_sampler_.SetTrainingData(train_data_); - GetShareStates(train_data_, is_constant_hessian, false); + GetShareStates(train_data_, is_constant_hessian, config_->objective == std::string("pairwise_lambdarank")); } // initialize ordered gradients and hessians 
@@ -189,18 +189,23 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians } share_state_->num_threads = num_threads; + Log::Warning("Train step 0"); + if (config_->use_quantized_grad) { gradient_discretizer_->DiscretizeGradients(num_data_, gradients_, hessians_); } + Log::Warning("Train step 1"); // some initial works before training BeforeTrain(); + Log::Warning("Train step 2"); bool track_branch_features = !(config_->interaction_constraints_vector.empty()); auto tree = std::unique_ptr(new Tree(config_->num_leaves, track_branch_features, false)); auto tree_ptr = tree.get(); constraints_->ShareTreePointer(tree_ptr); + Log::Warning("Train step 3"); // root leaf int left_leaf = 0; int cur_depth = 1; @@ -209,8 +214,10 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians int init_splits = ForceSplits(tree_ptr, &left_leaf, &right_leaf, &cur_depth); + Log::Warning("Train step 4"); for (int split = init_splits; split < config_->num_leaves - 1; ++split) { // some initial works before finding best split + Log::Warning("Train step 5, split = %d", split); if (BeforeFindBestSplit(tree_ptr, left_leaf, right_leaf)) { // find best threshold for every feature FindBestSplits(tree_ptr); @@ -225,15 +232,18 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians break; } // split tree with best leaf + Log::Warning("Train step 6, split = %d", split); Split(tree_ptr, best_leaf, &left_leaf, &right_leaf); cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } + Log::Warning("Train step 7"); if (config_->use_quantized_grad && config_->quant_train_renew_leaf) { gradient_discretizer_->RenewIntGradTreeOutput(tree.get(), config_, data_partition_.get(), gradients_, hessians_, [this] (int leaf_index) { return GetGlobalDataCountInLeaf(leaf_index); }); } + Log::Warning("Train step 8"); Log::Debug("Trained a tree with leaves = %d and depth = %d", tree->num_leaves(), cur_depth); return tree.release(); } 
@@ -282,20 +292,28 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const std::vect void SerialTreeLearner::BeforeTrain() { Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeTrain", global_timer); // reset histogram pool + Log::Warning("BeforeTrain step 0"); + histogram_pool_.ResetMap(); + Log::Warning("BeforeTrain step 1"); col_sampler_.ResetByTree(); + Log::Warning("BeforeTrain step 1.1"); train_data_->InitTrain(col_sampler_.is_feature_used_bytree(), share_state_.get()); + Log::Warning("BeforeTrain step 1.2"); // initialize data partition data_partition_->Init(); + Log::Warning("BeforeTrain step 2"); constraints_->Reset(); + Log::Warning("BeforeTrain step 3"); // reset the splits for leaves for (int i = 0; i < config_->num_leaves; ++i) { best_split_per_leaf_[i].Reset(); } + Log::Warning("BeforeTrain step 4"); // Sumup for root if (data_partition_->leaf_count(0) == num_data_) { // use all data @@ -320,15 +338,20 @@ void SerialTreeLearner::BeforeTrain() { } } + Log::Warning("BeforeTrain step 5"); + // Log::Warning("smaller_leaf_splits_->leaf_index() = %d before train", smaller_leaf_splits_->leaf_index()); + larger_leaf_splits_->Init(); if (cegb_ != nullptr) { cegb_->BeforeTrain(); } + Log::Warning("BeforeTrain step 6"); if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) { gradient_discretizer_->SetNumBitsInHistogramBin(0, -1, data_partition_->leaf_count(0), 0); } + Log::Warning("BeforeTrain step 7"); } bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { @@ -391,8 +414,12 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set* fo } bool use_subtract = parent_leaf_histogram_array_ != nullptr; + // Log::Warning("before ConstructHistograms"); ConstructHistograms(is_feature_used, use_subtract); + // Log::Warning("after ConstructHistograms"); + // Log::Warning("before FindBestSplitsFromHistograms"); FindBestSplitsFromHistograms(is_feature_used, 
use_subtract, tree); + // Log::Warning("after FindBestSplitsFromHistograms"); } void SerialTreeLearner::ConstructHistograms( @@ -466,14 +493,19 @@ void SerialTreeLearner::ConstructHistograms( void SerialTreeLearner::FindBestSplitsFromHistograms( const std::vector& is_feature_used, bool use_subtract, const Tree* tree) { + // Log::Warning("FindBestSplitsFromHistograms step 0"); Common::FunctionTimer fun_timer( "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer); + // Log::Warning("FindBestSplitsFromHistograms step 0.1"); std::vector smaller_best(share_state_->num_threads); std::vector larger_best(share_state_->num_threads); + // Log::Warning("smaller_leaf_splits_->leaf_index() = %d", smaller_leaf_splits_->leaf_index()); std::vector smaller_node_used_features = col_sampler_.GetByNode(tree, smaller_leaf_splits_->leaf_index()); std::vector larger_node_used_features; + // Log::Warning("FindBestSplitsFromHistograms step 0.2"); double smaller_leaf_parent_output = GetParentOutput(tree, smaller_leaf_splits_.get()); double larger_leaf_parent_output = 0; + // Log::Warning("FindBestSplitsFromHistograms step 0.3"); if (larger_leaf_splits_ != nullptr && larger_leaf_splits_->leaf_index() >= 0) { larger_leaf_parent_output = GetParentOutput(tree, larger_leaf_splits_.get()); } @@ -481,6 +513,8 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( larger_node_used_features = col_sampler_.GetByNode(tree, larger_leaf_splits_->leaf_index()); } + // Log::Warning("FindBestSplitsFromHistograms step 1"); + if (use_subtract && config_->use_quantized_grad) { const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index()); const uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode(parent_index); @@ -500,15 +534,18 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } } + // Log::Warning("FindBestSplitsFromHistograms step 2"); + OMP_INIT_EX(); // find splits -#pragma omp parallel for schedule(static) 
num_threads(share_state_->num_threads) +// #pragma omp parallel for schedule(static) num_threads(share_state_->num_threads) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); if (!is_feature_used[feature_index]) { continue; } const int tid = omp_get_thread_num(); + // Log::Warning("FindBestSplitsFromHistograms step 2.1"); if (config_->use_quantized_grad) { const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf(smaller_leaf_splits_->leaf_index()); const int64_t int_sum_gradient_and_hessian = smaller_leaf_splits_->int_sum_gradients_and_hessians(); @@ -529,6 +566,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } int real_fidx = train_data_->RealFeatureIndex(feature_index); + // Log::Warning("FindBestSplitsFromHistograms step 2.2"); ComputeBestSplitForFeature(smaller_leaf_histogram_array_, feature_index, real_fidx, smaller_node_used_features[feature_index], @@ -542,6 +580,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( continue; } + // Log::Warning("FindBestSplitsFromHistograms step 2.3"); if (use_subtract) { if (config_->use_quantized_grad) { const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index()); @@ -589,6 +628,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } } + // Log::Warning("FindBestSplitsFromHistograms step 2.4"); ComputeBestSplitForFeature(larger_leaf_histogram_array_, feature_index, real_fidx, larger_node_used_features[feature_index], @@ -599,6 +639,10 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( OMP_LOOP_EX_END(); } OMP_THROW_EX(); + + + // Log::Warning("FindBestSplitsFromHistograms step 3"); + auto smaller_best_idx = ArrayArgs::ArgMax(smaller_best); int leaf = smaller_leaf_splits_->leaf_index(); best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; @@ -756,6 +800,9 @@ std::set SerialTreeLearner::FindAllForceFeatures(Json force_split_leaf_sett void SerialTreeLearner::SplitInner(Tree* tree, int 
best_leaf, int* left_leaf, int* right_leaf, bool update_cnt) { Common::FunctionTimer fun_timer("SerialTreeLearner::SplitInner", global_timer); + + // histogram_pool_.DumpContent(); + SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); @@ -843,7 +890,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, // init the leaves that used on next iteration if (!config_->use_quantized_grad) { if (best_split_info.left_count < best_split_info.right_count) { - CHECK_GT(best_split_info.left_count, 0); + // CHECK_GT(best_split_info.left_count, 0); smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, @@ -853,7 +900,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, best_split_info.right_sum_hessian, best_split_info.right_output); } else { - CHECK_GT(best_split_info.right_count, 0); + // CHECK_GT(best_split_info.right_count, 0); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, @@ -865,7 +912,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, } } else { if (best_split_info.left_count < best_split_info.right_count) { - CHECK_GT(best_split_info.left_count, 0); + // CHECK_GT(best_split_info.left_count, 0); smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, @@ -877,7 +924,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, best_split_info.right_sum_gradient_and_hessian, best_split_info.right_output); } else { - CHECK_GT(best_split_info.right_count, 0); + // CHECK_GT(best_split_info.right_count, 0); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, 
best_split_info.right_sum_hessian, @@ -896,9 +943,9 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, data_partition_->leaf_count(*right_leaf)); } - #ifdef DEBUG - CheckSplit(best_split_info, *left_leaf, *right_leaf); - #endif + // #ifdef DEBUG + // CheckSplit(best_split_info, *left_leaf, *right_leaf); + // #endif auto leaves_need_update = constraints_->Update( is_numerical_split, *left_leaf, *right_leaf, @@ -1057,7 +1104,7 @@ std::vector node_used_features = col_sampler_.GetByNode(tree, leaf); *split = bests[best_idx]; } -#ifdef DEBUG +// #ifdef DEBUG void SerialTreeLearner::CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index) { data_size_t num_data_in_left = 0; data_size_t num_data_in_right = 0; @@ -1097,8 +1144,38 @@ void SerialTreeLearner::CheckSplit(const SplitInfo& best_split_info, const int l CHECK_EQ(sum_right_gradient, static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); CHECK_EQ(sum_right_hessian, static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); Log::Warning("============================ end leaf split info ============================"); + Log::Warning("============================ pass split check ============================"); + } else { + double sum_left_gradient = 0; + double sum_left_hessian = 0; + double sum_right_gradient = 0; + double sum_right_hessian = 0; + + for (data_size_t i = 0; i < num_data_in_left; ++i) { + const data_size_t index = data_indices_in_left[i]; + sum_left_gradient += gradients_[index]; + sum_left_hessian += hessians_[index]; + } + for (data_size_t i = 0; i < num_data_in_right; ++i) { + const data_size_t index = data_indices_in_right[i]; + sum_right_gradient += gradients_[index]; + sum_right_hessian += hessians_[index]; + } + Log::Warning("num_data_in_left = %d, best_split_info.left_count = %d", num_data_in_left, best_split_info.left_count); + Log::Warning("num_data_in_right = %d, 
best_split_info.right_count = %d", num_data_in_right, best_split_info.right_count); + Log::Warning("sum_left_gradient = %f, best_split_info.left_sum_gradient = %f", sum_left_gradient, best_split_info.left_sum_gradient); + Log::Warning("sum_left_hessian = %f, best_split_info.sum_left_hessian = %f", sum_left_hessian, best_split_info.left_sum_hessian); + Log::Warning("sum_right_gradient = %f, best_split_info.sum_right_gradient = %f", sum_right_gradient, best_split_info.right_sum_gradient); + Log::Warning("sum_right_hessian = %f, best_split_info.sum_right_hessian = %f", sum_right_hessian, best_split_info.right_sum_hessian); + CHECK_EQ(num_data_in_left, best_split_info.left_count); + CHECK_EQ(num_data_in_right, best_split_info.right_count); + CHECK_LE(std::fabs(sum_left_gradient - best_split_info.left_sum_gradient), 1e-3); + CHECK_LE(std::fabs(sum_left_hessian - best_split_info.left_sum_hessian), 1e-3); + CHECK_LE(std::fabs(sum_right_gradient - best_split_info.right_sum_gradient), 1e-3); + CHECK_LE(std::fabs(sum_right_hessian - best_split_info.right_sum_hessian), 1e-3); + Log::Warning("============================ pass split check ============================"); } } -#endif +// #endif } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 43ff6a4b1e13..e1ec2100ddea 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -171,9 +171,9 @@ class SerialTreeLearner: public TreeLearner { std::set FindAllForceFeatures(Json force_split_leaf_setting); - #ifdef DEBUG + // #ifdef DEBUG void CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index); - #endif + // #endif /*! 
* \brief Get the number of data in a leaf diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 74f6939c8371..ea4c16f3828f 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -4509,3 +4509,27 @@ def test_quantized_training(): quant_bst = lgb.train(bst_params, ds, num_boost_round=10) quant_rmse = np.sqrt(np.mean((quant_bst.predict(X) - y) ** 2)) assert quant_rmse < rmse + 6.0 + + +def test_bagging_by_query_in_lambdarank(): + rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank" + X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train")) + q_train = np.loadtxt(str(rank_example_dir / "rank.train.query")) + X_test, y_test = load_svmlight_file(str(rank_example_dir / "rank.test")) + q_test = np.loadtxt(str(rank_example_dir / "rank.test.query")) + params = {"objective": "lambdarank", "verbose": -1, "metric": "ndcg", "ndcg_eval_at": "5"} + lgb_train = lgb.Dataset(X_train, y_train, group=q_train, params=params) + lgb_test = lgb.Dataset(X_test, y_test, group=q_test, params=params) + gbm = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=[lgb_test]) + ndcg_score = gbm.best_score["valid_0"]["ndcg@5"] + + params.update({"bagging_by_query": True, "bagging_fraction": 0.1, "bagging_freq": 1}) + gbm_bagging_by_query = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=[lgb_test]) + ndcg_score_bagging_by_query = gbm_bagging_by_query.best_score["valid_0"]["ndcg@5"] + + params.update({"bagging_by_query": False, "bagging_fraction": 0.1, "bagging_freq": 1}) + gbm_no_bagging_by_query = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=[lgb_test]) + ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"] + print(ndcg_score_bagging_by_query, ndcg_score, ndcg_score_no_bagging_by_query) + assert ndcg_score_bagging_by_query >= ndcg_score - 0.1 + assert ndcg_score_no_bagging_by_query 
>= ndcg_score - 0.1 diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 009c744964d1..55c99b520b47 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -248,6 +248,7 @@ + @@ -283,6 +284,7 @@ + @@ -328,6 +330,8 @@ + + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 1bb899738213..5f8fbcfc819d 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -69,6 +69,9 @@ src\io + + src\io + src\metric @@ -141,6 +144,9 @@ include\LightGBM + + include\LightGBM + include\LightGBM\utils @@ -338,6 +344,12 @@ src\io + + src\io + + + src\io + src\io