diff --git a/CMakeLists.txt b/CMakeLists.txt
index 048818ff1c1b..00351e12d1cf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -439,6 +439,8 @@ set(
src/io/parser.cpp
src/io/train_share_states.cpp
src/io/tree.cpp
+ src/io/pairwise_lambdarank_bin.cpp
+ src/io/pairwise_ranking_feature_group.cpp
src/metric/dcg_calculator.cpp
src/metric/metric.cpp
src/network/linker_topo.cpp
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index baaf20c4e5ad..9ce8a3b77ab0 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -168,6 +168,8 @@ Core Parameters
- ``rank_xendcg`` is faster than and achieves the similar performance as ``lambdarank``
+ - ``pairwise_lambdarank``, pairwise lambdarank algorithm
+
- label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect)
- custom objective function (gradients and hessians not computed directly by LightGBM)
@@ -413,6 +415,10 @@ Learning Control Parameters
- random seed for bagging
+- ``bagging_by_query`` :raw-html:`🔗︎`, default = ``false``, type = bool
+
+ - whether to do bagging sample by query
+
- ``feature_fraction`` :raw-html:`🔗︎`, default = ``1.0``, type = double, aliases: ``sub_feature``, ``colsample_bytree``, constraints: ``0.0 < feature_fraction <= 1.0``
- LightGBM will randomly select a subset of features on each iteration (tree) if ``feature_fraction`` is smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree
@@ -1214,6 +1220,66 @@ Objective Parameters
- *New in version 4.1.0*
+- ``use_differential_feature_in_pairwise_ranking`` :raw-html:`🔗︎`, default = ``false``, type = bool
+
+ - whether to use differential features in pairwise ranking
+
+ - used only in ``pairwise_lambdarank`` application
+
+- ``pairwise_lambdarank_model_indirect_comparison`` :raw-html:`🔗︎`, default = ``false``, type = bool
+
+ - whether to additionally perform indirect document comparison in pairwise ranking
+
+ - used only in ``pairwise_lambdarank`` application
+
+- ``pairwise_lambdarank_model_conditional_rel`` :raw-html:`🔗︎`, default = ``false``, type = bool
+
+ - whether to model conditional document relevance (given documents ranked above) in pairwise ranking
+
+ - used only in ``pairwise_lambdarank`` application
+
+- ``pairwise_lambdarank_indirect_comparison_above_only`` :raw-html:`🔗︎`, default = ``true``, type = bool
+
+ - whether to limit the indirect document comparison to only auxiliary documents ranked above in pairwise ranking
+
+ - used only in ``pairwise_lambdarank`` application
+
+- ``pairwise_lambdarank_logarithmic_discounts`` :raw-html:`🔗︎`, default = ``true``, type = bool
+
+ - whether to use logarithmic discounts when converting pairwise scores into pointwise in pairwise ranking
+
+ - used only in ``pairwise_lambdarank`` application
+
+- ``pairwise_lambdarank_hard_pairwise_preference`` :raw-html:`🔗︎`, default = ``false``, type = bool
+
+ - whether to use hard pairwise preference when converting pairwise scores into pointwise in pairwise ranking
+
+ - used only in ``pairwise_lambdarank`` application
+
+- ``pairwise_lambdarank_train_pairing_approach`` :raw-html:`🔗︎`, default = ``different_relevance``, type = string
+
+ - pairing approach for training dataset
+
+ - used only in ``pairwise_lambdarank`` application
+
+ - with ``different_relevance``, only consider pairs with different relevance scores
+
+ - with ``at_least_one_relevant``, only consider pairs with at least one relevant item
+
+ - with ``all``, all pairs will be used
+
+- ``pairwise_lambdarank_valid_pairing_approach`` :raw-html:`🔗︎`, default = ``different_relevance``, type = string
+
+ - pairing approach for validation dataset
+
+ - used only in ``pairwise_lambdarank`` application
+
+ - with ``different_relevance``, only consider pairs with different relevance scores
+
+ - with ``at_least_one_relevant``, only consider pairs with at least one relevant item
+
+ - with ``all``, all pairs will be used
+
Metric Parameters
-----------------
diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index a33fcfa9c45c..0747417f81cf 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -14,6 +14,7 @@
#include
#include
#include
+#include
#include
namespace LightGBM {
@@ -305,6 +306,10 @@ class Bin {
*/
virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const = 0;
+ virtual BinIterator* GetUnpairedIterator(uint32_t /* min_bin */, uint32_t /* max_bin */, uint32_t /* most_freq_bin */) const {
+ return nullptr;
+ }
+
/*!
* \brief Save binary data to file
* \param file File want to write
@@ -466,6 +471,64 @@ class Bin {
*/
static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
+ /*!
+ * \brief Create object for bin data of the first feature in pair, used for pairwise ranking, for an original dense bin
+ * \param num_data Size of the pairwise dataset
+ * \param num_bin Number of bin
+ * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair
+ * \return The bin data object
+ */
+ static Bin* CreateDensePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map);
+
+ /*!
+ * \brief Create object for bin data of the first feature in pair, used for pairwise ranking, for an original sparse bin
+ * \param num_data Size of the pairwise dataset
+ * \param num_bin Number of bin
+ * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair
+ * \return The bin data object
+ */
+ static Bin* CreateSparsePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map);
+
+ /*!
+ * \brief Create object for bin data of the second feature in pair, used for pairwise ranking, for an original dense bin
+ * \param num_data Size of the pairwise dataset
+ * \param num_bin Number of bin
+ * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair
+ * \return The bin data object
+ */
+ static Bin* CreateDensePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map);
+
+ /*!
+ * \brief Create object for bin data of the second feature in pair, used for pairwise ranking, for an original sparse bin
+ * \param num_data Size of the pairwise dataset
+ * \param num_bin Number of bin
+ * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair
+ * \return The bin data object
+ */
+ static Bin* CreateSparsePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map);
+
+ /*!
+ * \brief Create object for bin data of the differential feature in pair, used for pairwise ranking, for an original dense bin
+ * \param num_data Size of the pairwise dataset
+ * \param num_bin Number of bin
+ * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair
+ * \param diff_bin_mappers Bin mappers for differential features in this group
+ * \param bin_offsets Bin offsets in feature group
+ * \return The bin data object
+ */
+ static Bin* CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets);
+
+ /*!
+ * \brief Create object for bin data of the differential feature in pair, used for pairwise ranking, for an original sparse bin
+ * \param num_data Size of the pairwise dataset
+ * \param num_bin Number of bin
+ * \param paired_ranking_item_index_map Map from data index to the original index for items in the pair
+ * \param diff_bin_mappers Bin mappers for differential features in this group
+ * \param bin_offsets Bin offsets in feature group
+ * \return The bin data object
+ */
+ static Bin* CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets);
+
/*!
* \brief Deep copy the bin
*/
@@ -474,6 +537,8 @@ class Bin {
virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const = 0;
virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0;
+
+ int group_index_ = -1;
};
@@ -495,6 +560,8 @@ class MultiValBin {
const data_size_t* used_indices,
data_size_t num_used_indices) = 0;
+ virtual void DumpContent() const {}
+
virtual MultiValBin* CreateLike(data_size_t num_data, int num_bin,
int num_feature,
double estimate_element_per_row,
@@ -588,12 +655,14 @@ class MultiValBin {
virtual bool IsSparse() = 0;
static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin,
- int num_feature, double sparse_rate, const std::vector& offsets);
+ int num_feature, double sparse_rate, const std::vector& offsets, const bool use_pairwise_ranking,
+ const std::pair* paired_ranking_item_global_index_map);
static MultiValBin* CreateMultiValDenseBin(data_size_t num_data, int num_bin,
- int num_feature, const std::vector& offsets);
+ int num_feature, const std::vector& offsets, const bool use_pairwise_ranking,
+ const std::pair* paired_ranking_item_global_index_map);
- static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row);
+ static MultiValBin* CreateMultiValSparseBin(data_size_t num_data, int num_bin, double estimate_element_per_row, const bool use_pairwise_ranking, const std::pair* paired_ranking_item_global_index_map);
static constexpr double multi_val_bin_sparse_threshold = 0.25f;
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 4f83898a28c9..a83339a9104f 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -36,6 +36,11 @@ enum TaskType {
};
const int kDefaultNumLeaves = 31;
+/*! \brief Types of pairwise ranking mode */
+enum PairwiseRankingMode {
+ kNone, kFull, kRelevance, kManual
+};
+
struct Config {
public:
Config() {}
@@ -157,6 +162,7 @@ struct Config {
// descl2 = ``lambdarank``, `lambdarank `__ objective. `label_gain <#label_gain>`__ can be used to set the gain (weight) of ``int`` label and all values in ``label`` must be smaller than number of elements in ``label_gain``
// descl2 = ``rank_xendcg``, `XE_NDCG_MART `__ ranking objective function, aliases: ``xendcg``, ``xe_ndcg``, ``xe_ndcg_mart``, ``xendcg_mart``
// descl2 = ``rank_xendcg`` is faster than and achieves the similar performance as ``lambdarank``
+ // descl2 = ``pairwise_lambdarank``, pairwise lambdarank algorithm
// descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect)
// desc = custom objective function (gradients and hessians not computed directly by LightGBM)
// descl2 = ``custom``
@@ -358,6 +364,9 @@ struct Config {
// desc = random seed for bagging
int bagging_seed = 3;
+ // desc = whether to do bagging sample by query
+ bool bagging_by_query = false;
+
// alias = sub_feature, colsample_bytree
// check = >0.0
// check = <=1.0
@@ -995,6 +1004,44 @@ struct Config {
// desc = *New in version 4.1.0*
double lambdarank_position_bias_regularization = 0.0;
+ // desc = whether to use differential features in pairwise ranking
+ // desc = used only in ``pairwise_lambdarank`` application
+ bool use_differential_feature_in_pairwise_ranking = false;
+
+ // desc = whether to additionally perform indirect document comparison in pairwise ranking
+ // desc = used only in ``pairwise_lambdarank`` application
+ bool pairwise_lambdarank_model_indirect_comparison = false;
+
+ // desc = whether to model conditional document relevance (given documents ranked above) in pairwise ranking
+ // desc = used only in ``pairwise_lambdarank`` application
+ bool pairwise_lambdarank_model_conditional_rel = false;
+
+ // desc = whether to limit the indirect document comparison to only auxiliary documents ranked above in pairwise ranking
+ // desc = used only in ``pairwise_lambdarank`` application
+ bool pairwise_lambdarank_indirect_comparison_above_only = true;
+
+ // desc = whether to use logarithmic discounts when converting pairwise scores into pointwise in pairwise ranking
+ // desc = used only in ``pairwise_lambdarank`` application
+ bool pairwise_lambdarank_logarithmic_discounts = true;
+
+ // desc = whether to use hard pairwise preference when converting pairwise scores into pointwise in pairwise ranking
+ // desc = used only in ``pairwise_lambdarank`` application
+ bool pairwise_lambdarank_hard_pairwise_preference = false;
+
+ // desc = pairing approach for training dataset
+ // desc = used only in ``pairwise_lambdarank`` application
+ // desc = with ``different_relevance``, only consider pairs with different relevance scores
+ // desc = with ``at_least_one_relevant``, only consider pairs with at least one relevant item
+ // desc = with ``all``, all pairs will be used
+ std::string pairwise_lambdarank_train_pairing_approach = std::string("different_relevance");
+
+ // desc = pairing approach for validation dataset
+ // desc = used only in ``pairwise_lambdarank`` application
+ // desc = with ``different_relevance``, only consider pairs with different relevance scores
+ // desc = with ``at_least_one_relevant``, only consider pairs with at least one relevant item
+ // desc = with ``all``, all pairs will be used
+ std::string pairwise_lambdarank_valid_pairing_approach = std::string("different_relevance");
+
#ifndef __NVCC__
#pragma endregion
diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp
index 465ed334156c..a7877361cd09 100644
--- a/include/LightGBM/cuda/cuda_objective_function.hpp
+++ b/include/LightGBM/cuda/cuda_objective_function.hpp
@@ -49,6 +49,11 @@ class CUDAObjectiveInterface: public HOST_OBJECTIVE {
SynchronizeCUDADevice(__FILE__, __LINE__);
}
+ void GetGradients(const double* scores, const data_size_t /*num_sampled_queries*/, const data_size_t* /*sampled_query_indices*/, score_t* gradients, score_t* hessians) const override {
+ LaunchGetGradientsKernel(scores, gradients, hessians);
+ SynchronizeCUDADevice(__FILE__, __LINE__);
+ }
+
void RenewTreeOutputCUDA(const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf,
const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override {
global_timer.Start("CUDAObjectiveInterface::LaunchRenewTreeOutputCUDAKernel");
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index 220a1f9f009c..418aa560aff2 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -204,6 +204,14 @@ class Metadata {
const double* init_scores,
const int32_t* queries);
+ /*!
+ * \brief Build metadata for ranking with pairwise features from metadata of an existing ranking dataset
+ * \param metadata Reference to metadata of the existing ranking dataset
+ * \param pairing_approach The pairing approach of this dataset
+ * \return The number of paired data
+ */
+ data_size_t BuildPairwiseFeatureRanking(const Metadata& metadata, const std::string& pairing_approach);
+
/*!
* \brief Perform any extra operations after all data has been loaded
*/
@@ -252,11 +260,40 @@ class Metadata {
return position_ids_.size();
}
+ /*!
+ * \brief Get the pairwise item index map within query in ranking with pairwise features
+ * \return Pointer to the pairwise item index map within query
+ */
+ inline const std::pair* paired_ranking_item_index_map() const {
+ if (!paired_ranking_item_index_map_.empty()) {
+ return paired_ranking_item_index_map_.data();
+ } else {
+ return nullptr;
+ }
+ }
+
+ /*!
+ * \brief Get the pairwise item global index map in ranking with pairwise features
+ * \return Pointer to the pairwise item global index map
+ */
+ inline const std::pair* paired_ranking_item_global_index_map() const {
+ if (!paired_ranking_item_global_index_map_.empty()) {
+ return paired_ranking_item_global_index_map_.data();
+ } else {
+ return nullptr;
+ }
+ }
+
+ inline data_size_t paired_ranking_item_index_map_size() const {
+ return static_cast(paired_ranking_item_index_map_.size());
+ }
+
/*!
* \brief Get data boundaries on queries, if not exists, will return nullptr
* we assume data will order by query,
* the interval of [query_boundaris[i], query_boundaris[i+1])
* is the data indices for query i.
+ * When pairwise ranking, this points to the paired query boundaries.
* \return Pointer of data boundaries on queries
*/
inline const data_size_t* query_boundaries() const {
@@ -267,6 +304,18 @@ class Metadata {
}
}
+ /*!
+ * \brief Used in pairwise ranking. Pointwise query boundaries.
+ * \return Pointer of data boundaries on queries
+ */
+ inline const data_size_t* pairwise_query_boundaries() const {
+ if (!pairwise_query_boundaries_.empty()) {
+ return pairwise_query_boundaries_.data();
+ } else {
+ return nullptr;
+ }
+ }
+
/*!
* \brief Get Number of queries
* \return Number of queries
@@ -364,7 +413,7 @@ class Metadata {
data_size_t num_weights_;
/*! \brief Number of positions, used to check correct position file */
data_size_t num_positions_;
- /*! \brief Label data */
+ /*! \brief Label data. In pairwise ranking, the label_ refer to the labels of the original unpaired dataset. */
std::vector label_;
/*! \brief Weights data */
std::vector weights_;
@@ -374,6 +423,8 @@ class Metadata {
std::vector position_ids_;
/*! \brief Query boundaries */
std::vector query_boundaries_;
+ /*! \brief Original query boundaries, used in pairwise ranking */
+ std::vector pairwise_query_boundaries_;
/*! \brief Query weights */
std::vector query_weights_;
/*! \brief Number of querys */
@@ -384,6 +435,12 @@ class Metadata {
std::vector init_score_;
/*! \brief Queries data */
std::vector queries_;
+ /*! \brief Mode for pairwise ranking */
+ PairwiseRankingMode pairwise_ranking_mode_ = PairwiseRankingMode::kRelevance;
+ /*! \brief Pairwise data index within query to original data indices for ranking with pairwise features */
+ std::vector> paired_ranking_item_index_map_;
+ /*! \brief Pairwise global data index to original data indices for ranking with pairwise features */
+ std::vector> paired_ranking_item_global_index_map_;
/*! \brief mutex for threading safe call */
std::mutex mutex_;
bool weight_load_from_file_;
@@ -659,15 +716,16 @@ class Dataset {
void CopySubrow(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
- MultiValBin* GetMultiBinFromSparseFeatures(const std::vector& offsets) const;
+ MultiValBin* GetMultiBinFromSparseFeatures(const std::vector& offsets, const bool use_pairwise_ranking) const;
- MultiValBin* GetMultiBinFromAllFeatures(const std::vector& offsets) const;
+ MultiValBin* GetMultiBinFromAllFeatures(const std::vector& offsets, const bool use_pairwise_ranking) const;
template
TrainingShareStates* GetShareStates(
score_t* gradients, score_t* hessians,
const std::vector& is_feature_used, bool is_constant_hessian,
- bool force_col_wise, bool force_row_wise, const int num_grad_quant_bins) const;
+ bool force_col_wise, bool force_row_wise, const int num_grad_quant_bins,
+ const bool use_pairwise_ranking) const;
LIGHTGBM_EXPORT void FinishLoad();
@@ -701,6 +759,8 @@ class Dataset {
LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);
+ LIGHTGBM_EXPORT void CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation, const Config& config);
+
void InitTrain(const std::vector& is_feature_used,
TrainingShareStates* share_state) const;
@@ -782,6 +842,13 @@ class Dataset {
}
}
+ void PrintGroupFeatureInfo(int group_index) const {
+ for (int sub_feature = 0; sub_feature < group_feature_cnt_[group_index]; ++sub_feature) {
+ const BinMapper* bin_mapper = feature_groups_[group_index]->bin_mappers_[sub_feature].get();
+ Log::Warning("sub_feature = %d, missing_type = %d, most_freq_bin = %d", sub_feature, bin_mapper->missing_type(), bin_mapper->GetMostFreqBin());
+ }
+ }
+
inline int FeatureNumBin(int i) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
@@ -1003,6 +1070,25 @@ class Dataset {
void CreateCUDAColumnData();
+ /*! \brief Create differential features for pairwise lambdarank
+ * \param sample_values sampled values from the file
+ * \param sample_indices sampled data indices from the file
+ * \param bin_mappers bin mappers of the original features
+ * \param filter_cnt filter count for bin finding
+ * \param num_total_sample_data number of all sampled data
+ * \param differential_feature_bin_mappers output differential feature bin mappers
+ */
+ void CreatePairwiseRankingDifferentialFeatures(
+ const std::vector>& sample_values,
+ const std::vector>& sample_indices,
+ const std::vector>& bin_mappers,
+ const data_size_t num_total_sample_data,
+ const data_size_t* query_boundaries,
+ const data_size_t num_queries,
+ std::vector>* differential_feature_bin_mappers,
+ std::vector* diff_original_feature_index,
+ const Config& config) const;
+
std::string data_filename_;
/*! \brief Store used features */
std::vector> feature_groups_;
@@ -1058,6 +1144,21 @@ class Dataset {
#endif // USE_CUDA
std::string parser_config_str_;
+
+ /*! \brief stored sampled features, for creating differential features in pairwise lambdarank */
+ std::shared_ptr>> sampled_values_;
+ /*! \brief stored sampled data indices, for creating differential features in pairwise lambdarank */
+ std::shared_ptr>> sampled_indices_;
+ /*! \brief stored number of totally sampled data, for creating differential features in pairwise lambdarank */
+ data_size_t num_total_sampled_data_;
+ /*! \brief stored query boundaries from training dataset, for creating differential features in pairwise lambdarank */
+ const data_size_t* train_query_boundaries_;
+ /*! \brief stored number of queries from training dataset, for creating differential features in pairwise lambdarank */
+ data_size_t train_num_queries_;
+ /*! \brief stored number of differential features used in training dataset, for creating differential features in pairwise lambdarank */
+ data_size_t num_used_differential_features_;
+ /*! \brief stored number of differential feature groups used in training dataset, for creating differential features in pairwise lambdarank */
+ data_size_t num_used_differential_groups_;
};
} // namespace LightGBM
diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h
index f13a5fff966f..53a501cd149b 100644
--- a/include/LightGBM/feature_group.h
+++ b/include/LightGBM/feature_group.h
@@ -152,7 +152,7 @@ class FeatureGroup {
}
/*! \brief Destructor */
- ~FeatureGroup() {}
+ virtual ~FeatureGroup() {}
/*!
* \brief Load the overall definition of the feature group from binary serialized data
@@ -286,7 +286,10 @@ class FeatureGroup {
}
inline void CopySubrowByCol(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices, int fidx) {
+ Log::Warning("in CopySubrowByCol");
if (!is_multi_val_) {
+ Log::Warning("is not multi val");
+ Log::Warning("full_feature->bin_data_.get() = %ld", full_feature->bin_data_.get());
bin_data_->CopySubrow(full_feature->bin_data_.get(), used_indices, num_used_indices);
} else {
multi_bin_data_[fidx]->CopySubrow(full_feature->multi_bin_data_[fidx].get(), used_indices, num_used_indices);
@@ -343,14 +346,14 @@ class FeatureGroup {
num_feature_ += other->num_feature_;
}
- inline BinIterator* SubFeatureIterator(int sub_feature) {
+ virtual inline BinIterator* SubFeatureIterator(int sub_feature) const {
uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
if (!is_multi_val_) {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
} else {
- int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
+ int addi = most_freq_bin == 0 ? 0 : 1;
uint32_t min_bin = 1;
uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin,
@@ -373,7 +376,7 @@ class FeatureGroup {
}
}
- inline BinIterator* FeatureGroupIterator() {
+ virtual inline BinIterator* FeatureGroupIterator() {
if (is_multi_val_) {
return nullptr;
}
@@ -581,32 +584,32 @@ class FeatureGroup {
}
}
- private:
- void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) {
+ protected:
+ virtual void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) {
if (is_multi_val) {
multi_bin_data_.clear();
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
- if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
- multi_bin_data_.emplace_back(Bin::CreateSparseBin(
- num_data, bin_mappers_[i]->num_bin() + addi));
- } else {
+ // if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
+ // multi_bin_data_.emplace_back(Bin::CreateSparseBin(
+ // num_data, bin_mappers_[i]->num_bin() + addi));
+ // } else {
multi_bin_data_.emplace_back(
Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
- }
+ // }
}
is_multi_val_ = true;
} else {
- if (force_sparse ||
- (!force_dense && num_feature_ == 1 &&
- bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
- is_sparse_ = true;
- bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
- } else {
+ // if (force_sparse ||
+ // (!force_dense && num_feature_ == 1 &&
+ // bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
+ // is_sparse_ = true;
+ // bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
+ // } else {
is_sparse_ = false;
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
- }
- is_multi_val_ = false;
+ // }
+ // is_multi_val_ = false;
}
}
diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h
index ad188dc39676..88e96e463adb 100644
--- a/include/LightGBM/objective_function.h
+++ b/include/LightGBM/objective_function.h
@@ -37,6 +37,17 @@ class ObjectiveFunction {
virtual void GetGradients(const double* score,
score_t* gradients, score_t* hessians) const = 0;
+ /*!
+ * \brief calculating first order derivative of loss function, used only for bagging by query in lambdarank
+ * \param score prediction score in this round
+ * \param num_sampled_queries number of in-bag queries
+ * \param sampled_query_indices indices of in-bag queries
+ * \param gradients Output gradients
+ * \param hessians Output hessians
+ */
+ virtual void GetGradients(const double* score, const data_size_t /*num_sampled_queries*/, const data_size_t* /*sampled_query_indices*/,
+ score_t* gradients, score_t* hessians) const { GetGradients(score, gradients, hessians); }
+
virtual const char* GetName() const = 0;
virtual bool IsConstantHessian() const { return false; }
@@ -108,8 +119,20 @@ class ObjectiveFunction {
virtual bool NeedConvertOutputCUDA () const { return false; }
#endif // USE_CUDA
+
+ virtual void SetDataIndices(const data_size_t* used_data_indices) const { used_data_indices_ = used_data_indices; }
+
+ private:
+ mutable const data_size_t* used_data_indices_ = nullptr;
};
+void UpdatePointwiseScoresForOneQuery(data_size_t query_id, double* score_pointwise, const double* score_pairwise, data_size_t cnt_pointwise,
+ data_size_t selected_pairs_cnt, const data_size_t* selected_pairs, const std::pair* paired_index_map,
+ const std::multimap& right2left_map, const std::multimap < data_size_t, data_size_t>& left2right_map,
+ const std::map, data_size_t>& left_right2pair_map,
+ int truncation_level, double sigma, const CommonC::SigmoidCache& sigmoid_cache, bool model_indirect_comparison, bool model_conditional_rel,
+ bool indirect_comparison_above_only, bool logarithmic_discounts, bool hard_pairwise_preference);
+
} // namespace LightGBM
#endif // LightGBM_OBJECTIVE_FUNCTION_H_
diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h
new file mode 100644
index 000000000000..b08b3c8bbf7d
--- /dev/null
+++ b/include/LightGBM/pairwise_ranking_feature_group.h
@@ -0,0 +1,154 @@
+/*!
+ * Copyright (c) 2023 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for
+ * license information.
+ */
+#ifndef LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_
+#define LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_
+
+#include
+#include
+#include
+#include
+
+#include "feature_group.h"
+
+namespace LightGBM {
+
+/*! \brief Using to store data and providing some operations on one pairwise feature group for pairwise ranking */
+class PairwiseRankingFeatureGroup: public FeatureGroup {
+ public:
+ /*!
+ * \brief Constructor
+ * \param num_feature number of features of this group
+ * \param bin_mappers Bin mapper for features
+ * \param num_data Total number of data
+ * \param is_enable_sparse True if enable sparse feature
+ * \param is_first_or_second_in_pairing Mark whether features in this group belong to the first or second element in the pairing
+ */
+
+ PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map);
+
+ /*!
+ * \brief Constructor from memory when data is present
+ * \param memory Pointer of memory
+ * \param num_all_data Number of global data
+ * \param local_used_indices Local used indices, empty means using all data
+ * \param group_id Id of group
+ */
+ // PairwiseRankingFeatureGroup(const void* memory,
+ // data_size_t num_all_data,
+ // const std::vector& local_used_indices,
+ // int group_id) {
+ // // TODO(shiyu1994)
+ // }
+
+ // /*!
+ // * \brief Constructor from definition in memory (without data)
+ // * \param memory Pointer of memory
+ // * \param local_used_indices Local used indices, empty means using all data
+ // */
+ // PairwiseRankingFeatureGroup(const void* memory, data_size_t num_data, int group_id): FeatureGroup(memory, num_data, group_id) {
+ // // TODO(shiyu1994)
+ // }
+
+ /*! \brief Destructor */
+ ~PairwiseRankingFeatureGroup() {}
+
+ /*!
+ * \brief Load the overall definition of the feature group from binary serialized data
+ * \param memory Pointer of memory
+ * \param group_id Id of group
+ */
+ const char* LoadDefinitionFromMemory(const void* /*memory*/, int /*group_id*/) {
+ // TODO(shiyu1994)
+ return nullptr;
+ }
+
+ inline BinIterator* SubFeatureIterator(int /*sub_feature*/) {
+ // TODO(shiyu1994)
+ return nullptr;
+ }
+
+ inline void FinishLoad() {
+ CHECK(!is_multi_val_);
+ bin_data_->FinishLoad();
+ }
+
+ inline BinIterator* FeatureGroupIterator() {
+ if (is_multi_val_) {
+ return nullptr;
+ }
+ uint32_t min_bin = bin_offsets_[0];
+ uint32_t max_bin = bin_offsets_.back() - 1;
+ uint32_t most_freq_bin = 0;
+ return bin_data_->GetUnpairedIterator(min_bin, max_bin, most_freq_bin);
+ }
+
+ /*!
+ * \brief Push one record, will auto convert to bin and push to bin data
+ * \param tid Thread id
+ * \param sub_feature_idx Index of the subfeature
+ * \param line_idx Index of record
+ * \param bin feature bin value of record
+ */
+ inline void PushBinData(int tid, int sub_feature_idx, data_size_t line_idx, uint32_t bin) {
+ if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
+ return;
+ }
+ if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
+ bin -= 1;
+ }
+ if (is_multi_val_) {
+ multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
+ } else {
+ bin += bin_offsets_[sub_feature_idx];
+ bin_data_->Push(tid, line_idx, bin);
+ }
+ }
+
+ protected:
+ void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override;
+
+ /*! \brief Pairwise data index to original data indices for ranking with pairwise features */
+ const std::pair* paired_ranking_item_index_map_;
+ /*! \brief Number of pairwise data */
+ data_size_t num_data_;
+ /*! \brief Mark whether features in this group belong to the first or second element in the pairing */
+ const int is_first_or_second_in_pairing_;
+};
+
+
+/*! \brief One differential feature group in pairwise ranking */
+class PairwiseRankingDifferentialFeatureGroup: public PairwiseRankingFeatureGroup {
+ public:
+ /*!
+ * \brief Constructor
+ * \param num_feature number of features of this group
+ * \param bin_mappers Bin mapper for features
+ * \param num_data Total number of data
+ * \param is_enable_sparse True if enable sparse feature
+ * \param is_first_or_second_in_pairing Mark whether features in this group belong to the first or second element in the pairing
+ */
+
+ PairwiseRankingDifferentialFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map, std::vector>& diff_feature_bin_mappers, std::vector>& ori_feature_bin_mappers);
+
+ virtual inline BinIterator* SubFeatureIterator(int sub_feature) const override;
+
+ virtual inline BinIterator* FeatureGroupIterator() override;
+
+ /*! \brief Destructor */
+ ~PairwiseRankingDifferentialFeatureGroup() {}
+
+ private:
+ void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override;
+
+ std::vector> diff_feature_bin_mappers_;
+ std::vector> ori_feature_bin_mappers_;
+ std::vector original_bin_offsets_;
+};
+
+
+} // namespace LightGBM
+
+#endif // LIGHTGBM_PAIRWISE_RANKING_FEATURE_GROUP_H_
diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h
index 4ea5cfc5f436..d2c26877c8ee 100644
--- a/include/LightGBM/sample_strategy.h
+++ b/include/LightGBM/sample_strategy.h
@@ -55,6 +55,10 @@ class SampleStrategy {
bool NeedResizeGradients() const { return need_resize_gradients_; }
+ virtual data_size_t num_sampled_queries() const { return 0; }
+
+ virtual const data_size_t* sampled_query_indices() const { return nullptr; }
+
protected:
const Config* config_;
const Dataset* train_data_;
diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h
index 6c3ebf5d0096..309eba7979ad 100644
--- a/include/LightGBM/utils/common.h
+++ b/include/LightGBM/utils/common.h
@@ -1255,6 +1255,54 @@ inline static std::string ArrayToString(const std::vector& arr, size_t n) {
return str_buf.str();
}
+class SigmoidCache {
+public:
+ SigmoidCache(){}
+
+ void Init(double sigmoid) {
+ sigmoid_ = sigmoid;
+ // get boundary
+ min_sigmoid_input_ = min_sigmoid_input_ / sigmoid_ / 2;
+ max_sigmoid_input_ = -min_sigmoid_input_;
+ sigmoid_table_.resize(_sigmoid_bins);
+ // get score to bin factor
+ sigmoid_table_idx_factor_ =
+ _sigmoid_bins / (max_sigmoid_input_ - min_sigmoid_input_);
+ // cache
+ for (size_t i = 0; i < _sigmoid_bins; ++i) {
+ const double score = i / sigmoid_table_idx_factor_ + min_sigmoid_input_;
+ sigmoid_table_[i] = 1.0f / (1.0f + std::exp(score * sigmoid_));
+ }
+ }
+
+ double compute(double score) const {
+ if (score <= min_sigmoid_input_) {
+ // too small, use lower bound
+ return sigmoid_table_[0];
+ }
+ else if (score >= max_sigmoid_input_) {
+ // too large, use upper bound
+ return sigmoid_table_[_sigmoid_bins - 1];
+ }
+ else {
+ return sigmoid_table_[static_cast((score - min_sigmoid_input_) *
+ sigmoid_table_idx_factor_)];
+ }
+ }
+private:
+ /*! \brief Sigmoid param */
+ double sigmoid_;
+ /*! \brief Cache result for sigmoid transform to speed up */
+ std::vector sigmoid_table_;
+  /*! \brief Number of bins in sigmoid table */
+ size_t _sigmoid_bins = 1024 * 1024;
+ /*! \brief Minimal input of sigmoid table */
+ double min_sigmoid_input_ = -50;
+ /*! \brief Maximal input of Sigmoid table */
+ double max_sigmoid_input_ = 50;
+  /*! \brief Factor that converts score to bin in Sigmoid table */
+ double sigmoid_table_idx_factor_;
+};
} // namespace CommonC
diff --git a/src/application/application.cpp b/src/application/application.cpp
index 42f707f0c801..f7ba45f139c2 100644
--- a/src/application/application.cpp
+++ b/src/application/application.cpp
@@ -118,6 +118,14 @@ void Application::LoadData() {
train_data_->SaveBinaryFile(nullptr);
}
// create training metric
+ const Dataset* ref_train_data = nullptr;
+ if (config_.objective == std::string("pairwise_lambdarank")) {
+ ref_train_data = train_data_.release();
+ train_data_.reset(new Dataset());
+ train_data_->CreatePairWiseRankingData(ref_train_data, false, config_);
+ } else {
+ ref_train_data = train_data_.get();
+ }
if (config_.is_provide_training_metric) {
for (auto metric_type : config_.metric) {
auto metric = std::unique_ptr(Metric::CreateMetric(metric_type, config_));
@@ -138,7 +146,12 @@ void Application::LoadData() {
auto new_dataset = std::unique_ptr(
dataset_loader.LoadFromFileAlignWithOtherDataset(
config_.valid[i].c_str(),
- train_data_.get()));
+ ref_train_data));
+ if (config_.objective == std::string("pairwise_lambdarank")) {
+ const Dataset* original_dataset = new_dataset.release();
+ new_dataset.reset(new Dataset());
+ new_dataset->CreatePairWiseRankingData(original_dataset, true, config_);
+ }
valid_datas_.push_back(std::move(new_dataset));
// need save binary file
if (config_.save_binary) {
diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp
index 4c2c81553e7c..f50c7f3160c5 100644
--- a/src/boosting/bagging.hpp
+++ b/src/boosting/bagging.hpp
@@ -7,6 +7,7 @@
#define LIGHTGBM_BOOSTING_BAGGING_HPP_
#include
+#include
namespace LightGBM {
@@ -17,8 +18,15 @@ class BaggingSampleStrategy : public SampleStrategy {
config_ = config;
train_data_ = train_data;
num_data_ = train_data->num_data();
+ num_queries_ = train_data->metadata().num_queries();
+ if (config->objective == std::string("pairwise_lambdarank")) {
+ query_boundaries_ = train_data->metadata().pairwise_query_boundaries();
+ } else {
+ query_boundaries_ = train_data->metadata().query_boundaries();
+ }
objective_function_ = objective_function;
num_tree_per_iteration_ = num_tree_per_iteration;
+ num_threads_ = OMP_NUM_THREADS();
}
~BaggingSampleStrategy() {}
@@ -27,9 +35,10 @@ class BaggingSampleStrategy : public SampleStrategy {
Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer);
// if need bagging
if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) ||
- need_re_bagging_) {
+ need_re_bagging_) {
need_re_bagging_ = false;
- auto left_cnt = bagging_runner_.Run(
+ if (!config_->bagging_by_query) {
+ auto left_cnt = bagging_runner_.Run(
num_data_,
[=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left,
data_size_t*) {
@@ -43,7 +52,60 @@ class BaggingSampleStrategy : public SampleStrategy {
return cur_left_count;
},
bag_data_indices_.data());
- bag_data_cnt_ = left_cnt;
+ bag_data_cnt_ = left_cnt;
+ } else {
+ num_sampled_queries_ = bagging_runner_.Run(
+ num_queries_,
+ [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left,
+ data_size_t*) {
+ data_size_t cur_left_count = 0;
+ cur_left_count = BaggingHelper(cur_start, cur_cnt, left);
+ return cur_left_count;
+ }, bag_query_indices_.data());
+
+ sampled_query_boundaries_[0] = 0;
+ OMP_INIT_EX();
+ #pragma omp parallel for schedule(static) num_threads(num_threads_)
+ for (data_size_t i = 0; i < num_queries_; ++i) {
+ OMP_LOOP_EX_BEGIN();
+ sampled_query_boundaries_[i + 1] = query_boundaries_[bag_query_indices_[i] + 1] - query_boundaries_[bag_query_indices_[i]];
+ OMP_LOOP_EX_END();
+ }
+ OMP_THROW_EX();
+
+ const int num_blocks = Threading::For(0, num_queries_ + 1, 128, [this](int thread_index, data_size_t start_index, data_size_t end_index) {
+ for (data_size_t i = start_index + 1; i < end_index; ++i) {
+ sampled_query_boundaries_[i] += sampled_query_boundaries_[i - 1];
+ }
+ sampled_query_boundaires_thread_buffer_[thread_index] = sampled_query_boundaries_[end_index - 1];
+ });
+
+ for (int thread_index = 1; thread_index < num_blocks; ++thread_index) {
+ sampled_query_boundaires_thread_buffer_[thread_index] += sampled_query_boundaires_thread_buffer_[thread_index - 1];
+ }
+
+ Threading::For(0, num_queries_ + 1, 128, [this](int thread_index, data_size_t start_index, data_size_t end_index) {
+ if (thread_index > 0) {
+ for (data_size_t i = start_index; i < end_index; ++i) {
+ sampled_query_boundaries_[i] += sampled_query_boundaires_thread_buffer_[thread_index - 1];
+ }
+ }
+ });
+
+ bag_data_cnt_ = sampled_query_boundaries_[num_sampled_queries_];
+
+ Threading::For(0, num_queries_, 1, [this](int /*thread_index*/, data_size_t start_index, data_size_t end_index) {
+ for (data_size_t sampled_query_id = start_index; sampled_query_id < end_index; ++sampled_query_id) {
+ const data_size_t query_index = bag_query_indices_[sampled_query_id];
+ const data_size_t data_index_start = query_boundaries_[query_index];
+ const data_size_t data_index_end = query_boundaries_[query_index + 1];
+ const data_size_t sampled_query_start = sampled_query_boundaries_[sampled_query_id];
+ for (data_size_t i = data_index_start; i < data_index_end; ++i) {
+ bag_data_indices_[sampled_query_start + i - data_index_start] = i;
+ }
+ }
+ });
+ }
Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
// set bagging data to tree learner
if (!is_use_subset_) {
@@ -60,6 +122,7 @@ class BaggingSampleStrategy : public SampleStrategy {
} else {
// get subset
tmp_subset_->ReSize(bag_data_cnt_);
+ Log::Warning("bag_data_indices_.size() = %ld, bag_data_cnt_ = %d", bag_data_indices_.size(), bag_data_cnt_);
tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
bag_data_cnt_, false);
#ifdef USE_CUDA
@@ -108,7 +171,14 @@ class BaggingSampleStrategy : public SampleStrategy {
cuda_bag_data_indices_.Resize(num_data_);
}
#endif // USE_CUDA
- bagging_runner_.ReSize(num_data_);
+ if (!config_->bagging_by_query) {
+ bagging_runner_.ReSize(num_data_);
+ } else {
+ bagging_runner_.ReSize(num_queries_);
+ sampled_query_boundaries_.resize(num_queries_ + 1, 0);
+ sampled_query_boundaires_thread_buffer_.resize(num_threads_, 0);
+ bag_query_indices_.resize(num_data_);
+ }
bagging_rands_.clear();
for (int i = 0;
i < (num_data_ + bagging_rand_block_ - 1) / bagging_rand_block_; ++i) {
@@ -118,9 +188,11 @@ class BaggingSampleStrategy : public SampleStrategy {
double average_bag_rate =
(static_cast(bag_data_cnt_) / num_data_) / config_->bagging_freq;
is_use_subset_ = false;
- if (config_->device_type != std::string("cuda")) {
- const int group_threshold_usesubset = 100;
+ if (config_->device_type != std::string("cuda") && !config_->bagging_by_query) {
+ const int group_threshold_usesubset = 200;
const double average_bag_rate_threshold = 0.5;
+ Log::Warning("train_data_->num_feature_groups() = %d", train_data_->num_feature_groups());
+ Log::Warning("average_bag_rate = %f", average_bag_rate);
if (average_bag_rate <= average_bag_rate_threshold
&& (train_data_->num_feature_groups() < group_threshold_usesubset)) {
if (tmp_subset_ == nullptr || is_change_dataset) {
@@ -153,6 +225,14 @@ class BaggingSampleStrategy : public SampleStrategy {
return false;
}
+ data_size_t num_sampled_queries() const override {
+ return num_sampled_queries_;
+ }
+
+ const data_size_t* sampled_query_indices() const override {
+ return bag_query_indices_.data();
+ }
+
private:
data_size_t BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) {
if (cnt <= 0) {
@@ -202,6 +282,20 @@ class BaggingSampleStrategy : public SampleStrategy {
/*! \brief whether need restart bagging in continued training */
bool need_re_bagging_;
+ /*! \brief number of threads */
+ int num_threads_;
+ /*! \brief query boundaries of the in-bag queries */
+ std::vector sampled_query_boundaries_;
+ /*! \brief buffer for calculating sampled_query_boundaries_ */
+ std::vector sampled_query_boundaires_thread_buffer_;
+ /*! \brief in-bag query indices */
+ std::vector> bag_query_indices_;
+ /*! \brief number of queries in the training dataset */
+ data_size_t num_queries_;
+ /*! \brief number of in-bag queries */
+ data_size_t num_sampled_queries_;
+ /*! \brief query boundaries of the whole training dataset */
+ const data_size_t* query_boundaries_;
};
} // namespace LightGBM
diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp
index 937b44fcc8aa..c22ecfc561ba 100644
--- a/src/boosting/gbdt.cpp
+++ b/src/boosting/gbdt.cpp
@@ -224,8 +224,14 @@ void GBDT::Boosting() {
}
// objective function will calculate gradients and hessians
int64_t num_score = 0;
- objective_function_->
- GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_);
+ if (config_->bagging_by_query) {
+ data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data());
+ objective_function_->
+ GetGradients(GetTrainingScore(&num_score), data_sample_strategy_->num_sampled_queries(), data_sample_strategy_->sampled_query_indices(), gradients_pointer_, hessians_pointer_);
+ } else {
+ objective_function_->
+ GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_);
+ }
}
void GBDT::Train(int snapshot_freq, const std::string& model_output_path) {
@@ -337,12 +343,15 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) {
bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer);
+ Log::Warning("TrainOneIter step -10");
std::vector init_scores(num_tree_per_iteration_, 0.0);
// boosting first
if (gradients == nullptr || hessians == nullptr) {
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true);
}
+ data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data());
+ objective_function_->SetDataIndices(data_sample_strategy_->bag_data_indices().data());
Boosting();
gradients = gradients_pointer_;
hessians = hessians_pointer_;
@@ -365,8 +374,11 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
}
}
+ Log::Warning("TrainOneIter step -9");
// bagging logic
- data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data());
+ if (!config_->bagging_by_query) {
+ data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data());
+ }
const bool is_use_subset = data_sample_strategy_->is_use_subset();
const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt();
const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices();
@@ -375,6 +387,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
ResetGradientBuffers();
}
+ Log::Warning("TrainOneIter step -8");
bool should_continue = false;
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
const size_t offset = static_cast(cur_tree_id) * num_data_;
@@ -392,9 +405,12 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
hess = hessians_pointer_ + offset;
}
bool is_first_tree = models_.size() < static_cast(num_tree_per_iteration_);
+ Log::Warning("TrainOneIter step -7");
new_tree.reset(tree_learner_->Train(grad, hess, is_first_tree));
}
+ Log::Warning("TrainOneIter step 0");
+
if (new_tree->num_leaves() > 1) {
should_continue = true;
auto score_ptr = train_score_updater_->score() + offset;
@@ -414,18 +430,24 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
if (objective_function_ != nullptr && !config_->boost_from_average && !train_score_updater_->has_init_score()) {
init_scores[cur_tree_id] = ObtainAutomaticInitialScore(objective_function_, cur_tree_id);
// updates scores
+ Log::Warning("TrainOneIter step 0.1");
train_score_updater_->AddScore(init_scores[cur_tree_id], cur_tree_id);
+ Log::Warning("TrainOneIter step 0.2");
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_scores[cur_tree_id], cur_tree_id);
}
+ Log::Warning("TrainOneIter step 0.3");
}
new_tree->AsConstantTree(init_scores[cur_tree_id]);
+ Log::Warning("TrainOneIter step 0.4");
}
}
// add model
models_.push_back(std::move(new_tree));
}
+ Log::Warning("TrainOneIter step 1");
+
if (!should_continue) {
Log::Warning("Stopped training because there are no more leaves that meet the split requirements");
if (models_.size() > static_cast(num_tree_per_iteration_)) {
@@ -436,6 +458,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
return true;
}
+ Log::Warning("TrainOneIter step 2");
++iter_;
return false;
}
@@ -480,7 +503,9 @@ bool GBDT::EvalAndCheckEarlyStopping() {
void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer);
// update training score
+ Log::Warning("before update score 0");
if (!data_sample_strategy_->is_use_subset()) {
+ Log::Warning("before update score 1");
train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id);
const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt();
@@ -496,16 +521,20 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
}
#endif // USE_CUDA
}
+ Log::Warning("before update score 2");
} else {
+ Log::Warning("before update score 3");
train_score_updater_->AddScore(tree, cur_tree_id);
}
+ Log::Warning("before update score 4");
// update validation score
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(tree, cur_tree_id);
}
+ Log::Warning("before update score 5");
}
#ifdef USE_CUDA
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 3d84599e6589..326df0ec5f79 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -15,8 +15,10 @@
#include "dense_bin.hpp"
#include "multi_val_dense_bin.hpp"
+#include "multi_val_pairwise_lambdarank_bin.hpp"
#include "multi_val_sparse_bin.hpp"
#include "sparse_bin.hpp"
+#include "pairwise_lambdarank_bin.hpp"
namespace LightGBM {
@@ -632,21 +634,94 @@ namespace LightGBM {
}
}
+ Bin* Bin::CreateDensePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) {
+ if (num_bin <= 16) {
+ return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data));
+ } else if (num_bin <= 256) {
+ return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data));
+ } else if (num_bin <= 65536) {
+ return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data));
+ } else {
+ return new DensePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data));
+ }
+ }
+
+ Bin* Bin::CreateDensePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) {
+ if (num_bin <= 16) {
+ return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data));
+ } else if (num_bin <= 256) {
+ return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data));
+ } else if (num_bin <= 65536) {
+ return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data));
+ } else {
+ return new DensePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data));
+ }
+ }
+
+ Bin* Bin::CreateSparsePairwiseRankingFirstBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) {
+ if (num_bin <= 256) {
+ return new SparsePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data));
+ } else if (num_bin <= 65536) {
+ return new SparsePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data));
+ } else {
+ return new SparsePairwiseRankingFirstBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data));
+ }
+ }
+
+ Bin* Bin::CreateSparsePairwiseRankingSecondBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) {
+ if (num_bin <= 256) {
+ return new SparsePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data));
+ } else if (num_bin <= 65536) {
+ return new SparsePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data));
+ } else {
+ return new SparsePairwiseRankingSecondBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data));
+ }
+ }
+
+ Bin* Bin::CreateDensePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets) {
+ if (num_bin <= 16) {
+ return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets);
+ } else if (num_bin <= 256) {
+ return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets);
+ } else if (num_bin <= 65536) {
+ return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets);
+ } else {
+ return new DensePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets);
+ }
+ }
+
+ Bin* Bin::CreateSparsePairwiseRankingDiffBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map, const std::vector>* diff_bin_mappers, const std::vector>* ori_bin_mappers, const std::vector* bin_offsets, const std::vector* diff_bin_offsets) {
+ if (num_bin <= 256) {
+ return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets);
+ } else if (num_bin <= 65536) {
+ return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets);
+ } else {
+ return new SparsePairwiseRankingDiffBin(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data), diff_bin_mappers, ori_bin_mappers, bin_offsets, diff_bin_offsets);
+ }
+ }
+
MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature,
- double sparse_rate, const std::vector& offsets) {
+ double sparse_rate, const std::vector& offsets, const bool use_pairwise_ranking, const std::pair* paired_ranking_item_global_index_map) {
if (sparse_rate >= multi_val_bin_sparse_threshold) {
const double average_element_per_row = (1.0 - sparse_rate) * num_feature;
- return CreateMultiValSparseBin(num_data, num_bin,
- average_element_per_row);
+ // if (use_pairwise_ranking) {
+      Log::Warning("Pairwise ranking with sparse row-wise bins is not supported yet.");
+ return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets, use_pairwise_ranking, paired_ranking_item_global_index_map);
+ // } else {
+ // return CreateMultiValSparseBin(num_data, num_bin,
+ // average_element_per_row, use_pairwise_ranking, paired_ranking_item_global_index_map);
+ // }
} else {
- return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets);
+ return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets, use_pairwise_ranking, paired_ranking_item_global_index_map);
}
}
MultiValBin* MultiValBin::CreateMultiValDenseBin(data_size_t num_data,
int num_bin,
int num_feature,
- const std::vector& offsets) {
+ const std::vector& offsets,
+ const bool use_pairwise_ranking,
+ const std::pair* paired_ranking_item_global_index_map) {
// calculate max bin of all features to select the int type in MultiValDenseBin
int max_bin = 0;
for (int i = 0; i < static_cast(offsets.size()) - 1; ++i) {
@@ -656,17 +731,31 @@ namespace LightGBM {
}
}
if (max_bin <= 256) {
- return new MultiValDenseBin(num_data, num_bin, num_feature, offsets);
+ if (use_pairwise_ranking) {
+ return new MultiValDensePairwiseLambdarankBin(num_data, num_bin, num_feature, offsets, paired_ranking_item_global_index_map);
+ } else {
+ return new MultiValDenseBin(num_data, num_bin, num_feature, offsets);
+ }
} else if (max_bin <= 65536) {
- return new MultiValDenseBin(num_data, num_bin, num_feature, offsets);
+ if (use_pairwise_ranking) {
+ return new MultiValDensePairwiseLambdarankBin(num_data, num_bin, num_feature, offsets, paired_ranking_item_global_index_map);
+ } else {
+ return new MultiValDenseBin(num_data, num_bin, num_feature, offsets);
+ }
} else {
- return new MultiValDenseBin(num_data, num_bin, num_feature, offsets);
+ if (use_pairwise_ranking) {
+ return new MultiValDensePairwiseLambdarankBin(num_data, num_bin, num_feature, offsets, paired_ranking_item_global_index_map);
+ } else {
+ return new MultiValDenseBin(num_data, num_bin, num_feature, offsets);
+ }
}
}
MultiValBin* MultiValBin::CreateMultiValSparseBin(data_size_t num_data,
int num_bin,
- double estimate_element_per_row) {
+ double estimate_element_per_row,
+ const bool /*use_pairwise_ranking*/,
+ const std::pair* /*paired_ranking_item_global_index_map*/) {
size_t estimate_total_entries =
static_cast(estimate_element_per_row * 1.1 * num_data);
if (estimate_total_entries <= std::numeric_limits::max()) {
diff --git a/src/io/config.cpp b/src/io/config.cpp
index c63de70fc16b..20d327ca2edb 100644
--- a/src/io/config.cpp
+++ b/src/io/config.cpp
@@ -466,6 +466,11 @@ void Config::CheckParamConflict(const std::unordered_map& Config::parameter_set() {
"neg_bagging_fraction",
"bagging_freq",
"bagging_seed",
+ "bagging_by_query",
"feature_fraction",
"feature_fraction_bynode",
"feature_fraction_seed",
@@ -306,6 +307,14 @@ const std::unordered_set& Config::parameter_set() {
"lambdarank_norm",
"label_gain",
"lambdarank_position_bias_regularization",
+ "use_differential_feature_in_pairwise_ranking",
+ "pairwise_lambdarank_model_indirect_comparison",
+ "pairwise_lambdarank_model_conditional_rel",
+ "pairwise_lambdarank_indirect_comparison_above_only",
+ "pairwise_lambdarank_logarithmic_discounts",
+ "pairwise_lambdarank_hard_pairwise_preference",
+ "pairwise_lambdarank_train_pairing_approach",
+ "pairwise_lambdarank_valid_pairing_approach",
"metric",
"metric_freq",
"is_provide_training_metric",
@@ -377,6 +386,8 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet
{"neg_bagging_fraction", {"neg_sub_row", "neg_subsample", "neg_bagging"}},
{"bagging_freq", {"subsample_freq"}},
{"bagging_seed", {"bagging_fraction_seed"}},
+ {"bagging_by_query", {}},
{"feature_fraction", {"sub_feature", "colsample_bytree"}},
{"feature_fraction_bynode", {"sub_feature_bynode", "colsample_bynode"}},
{"feature_fraction_seed", {}},
@@ -911,6 +948,14 @@ const std::unordered_map>& Config::paramet
{"lambdarank_norm", {}},
{"label_gain", {}},
{"lambdarank_position_bias_regularization", {}},
+ {"use_differential_feature_in_pairwise_ranking", {}},
+ {"pairwise_lambdarank_model_indirect_comparison", {}},
+ {"pairwise_lambdarank_model_conditional_rel", {}},
+ {"pairwise_lambdarank_indirect_comparison_above_only", {}},
+ {"pairwise_lambdarank_logarithmic_discounts", {}},
+ {"pairwise_lambdarank_hard_pairwise_preference", {}},
+ {"pairwise_lambdarank_train_pairing_approach", {}},
+ {"pairwise_lambdarank_valid_pairing_approach", {}},
{"metric", {"metrics", "metric_types"}},
{"metric_freq", {"output_freq"}},
{"is_provide_training_metric", {"training_metric", "is_training_metric", "train_metric"}},
@@ -957,6 +1002,7 @@ const std::unordered_map& Config::ParameterTypes() {
{"neg_bagging_fraction", "double"},
{"bagging_freq", "int"},
{"bagging_seed", "int"},
+ {"bagging_by_query", "bool"},
{"feature_fraction", "double"},
{"feature_fraction_bynode", "double"},
{"feature_fraction_seed", "int"},
@@ -1055,6 +1101,14 @@ const std::unordered_map& Config::ParameterTypes() {
{"lambdarank_norm", "bool"},
{"label_gain", "vector"},
{"lambdarank_position_bias_regularization", "double"},
+ {"use_differential_feature_in_pairwise_ranking", "bool"},
+ {"pairwise_lambdarank_model_indirect_comparison", "bool"},
+ {"pairwise_lambdarank_model_conditional_rel", "bool"},
+ {"pairwise_lambdarank_indirect_comparison_above_only", "bool"},
+ {"pairwise_lambdarank_logarithmic_discounts", "bool"},
+ {"pairwise_lambdarank_hard_pairwise_preference", "bool"},
+ {"pairwise_lambdarank_train_pairing_approach", "string"},
+ {"pairwise_lambdarank_valid_pairing_approach", "string"},
{"metric", "vector"},
{"metric_freq", "int"},
{"is_provide_training_metric", "bool"},
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 01687d95c747..358e4c54a8b3 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -6,6 +6,7 @@
#include
#include
+#include
#include
#include
#include
@@ -17,6 +18,8 @@
#include
#include
+#include
+
namespace LightGBM {
const int Dataset::kSerializedReferenceVersionLength = 2;
@@ -279,6 +282,7 @@ std::vector> FastFeatureBundling(
}
std::vector> tmp_indices;
+ std::vector tmp_indices_ptr(num_sample_col, nullptr);
std::vector tmp_num_per_col(num_sample_col, 0);
for (auto fidx : used_features) {
if (fidx >= num_sample_col) {
@@ -290,18 +294,19 @@ std::vector> FastFeatureBundling(
if (!ret.empty()) {
tmp_indices.push_back(ret);
tmp_num_per_col[fidx] = static_cast(ret.size());
- sample_indices[fidx] = tmp_indices.back().data();
+ tmp_indices_ptr[fidx] = tmp_indices.back().data();
} else {
tmp_num_per_col[fidx] = num_per_col[fidx];
+ tmp_indices_ptr[fidx] = sample_indices[fidx];
}
}
std::vector group_is_multi_val, group_is_multi_val2;
auto features_in_group =
- FindGroups(bin_mappers, used_features, sample_indices,
+ FindGroups(bin_mappers, used_features, tmp_indices_ptr.data(),
tmp_num_per_col.data(), num_sample_col, total_sample_cnt,
num_data, is_use_gpu, is_sparse, &group_is_multi_val);
auto group2 =
- FindGroups(bin_mappers, feature_order_by_cnt, sample_indices,
+ FindGroups(bin_mappers, feature_order_by_cnt, tmp_indices_ptr.data(),
tmp_num_per_col.data(), num_sample_col, total_sample_cnt,
num_data, is_use_gpu, is_sparse, &group_is_multi_val2);
@@ -352,12 +357,17 @@ void Dataset::Construct(std::vector>* bin_mappers,
auto is_sparse = io_config.is_enable_sparse;
if (io_config.device_type == std::string("cuda")) {
LGBM_config_::current_device = lgbm_device_cuda;
- if ((io_config.device_type == std::string("cuda")) && is_sparse) {
+ if (is_sparse) {
Log::Warning("Using sparse features with CUDA is currently not supported.");
is_sparse = false;
}
+ } else if ((io_config.objective == std::string("pairwise_lambdarank")) && is_sparse) {
+ Log::Warning("Using sparse features with pairwise_lambdarank is currently not supported.");
+ is_sparse = false;
}
+ is_sparse = false;
+
std::vector group_is_multi_val(used_features.size(), 0);
if (io_config.enable_bundle && !used_features.empty()) {
bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda");
@@ -438,6 +448,26 @@ void Dataset::Construct(std::vector>* bin_mappers,
}
device_type_ = io_config.device_type;
gpu_device_id_ = io_config.gpu_device_id;
+
+ if (io_config.objective == std::string("pairwise_lambdarank")) {
+ // store sampled values for constructing differential features
+ const int num_threads = OMP_NUM_THREADS();
+ sampled_values_.reset(new std::vector>());
+ sampled_indices_.reset(new std::vector>());
+ sampled_values_->resize(static_cast(num_sample_col));
+ sampled_indices_->resize(static_cast(num_sample_col));
+ #pragma omp parallel for schedule(static) num_threads(num_threads)
+ for (int col_idx = 0; col_idx < num_sample_col; ++col_idx) {
+ const int num_samples_in_col = num_per_col[col_idx];
+ sampled_values_->at(col_idx).resize(num_samples_in_col);
+ sampled_indices_->at(col_idx).resize(num_samples_in_col);
+ for (int i = 0; i < num_samples_in_col; ++i) {
+ sampled_values_->at(col_idx)[i] = sample_values[col_idx][i];
+ sampled_indices_->at(col_idx)[i] = sample_non_zero_indices[col_idx][i];
+ }
+ }
+ num_total_sampled_data_ = static_cast(total_sample_cnt);
+ }
}
void Dataset::FinishLoad() {
@@ -469,7 +499,9 @@ void PushDataToMultiValBin(
MultiValBin* ret) {
Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin",
global_timer);
+ Log::Warning("num_data = %d", num_data);
if (ret->IsSparse()) {
+ // Log::Fatal("pairwise ranking with sparse multi val bin is not supported.");
Threading::For(
0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) {
std::vector cur_data;
@@ -499,6 +531,7 @@ void PushDataToMultiValBin(
0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) {
std::vector cur_data(most_freq_bins.size(), 0);
for (size_t j = 0; j < most_freq_bins.size(); ++j) {
+ //Log::Warning("(*iters)[%d].size() = %d, j = %d, start = %d", tid, (*iters)[tid].size(), j, start);
(*iters)[tid][j]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
@@ -513,7 +546,7 @@ void PushDataToMultiValBin(
}
}
-MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector& offsets) const {
+MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector& offsets, const bool use_pairwise_ranking) const {
Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures",
global_timer);
int multi_group_id = -1;
@@ -551,13 +584,13 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector&
sum_sparse_rate);
std::unique_ptr ret;
ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(),
- num_feature, sum_sparse_rate, offsets));
+ num_feature, sum_sparse_rate, offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map()));
PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get());
ret->FinishLoad();
return ret.release();
}
-MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& offsets) const {
+MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& offsets, const bool use_pairwise_ranking) const {
Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures",
global_timer);
int num_threads = OMP_NUM_THREADS();
@@ -600,11 +633,57 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of
CHECK(static_cast(most_freq_bins.size()) == ncol);
Log::Debug("Dataset::GetMultiBinFromAllFeatures: sparse rate %f",
1.0 - sum_dense_ratio);
- ret.reset(MultiValBin::CreateMultiValBin(
- num_data_, offsets.back(), static_cast(most_freq_bins.size()),
- 1.0 - sum_dense_ratio, offsets));
- PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get());
+ if (use_pairwise_ranking) {
+
+ // for (size_t i = 0; i < iters.size(); ++i) {
+ // for (size_t j = 0; j < iters[i].size(); ++j) {
+ // Log::Warning("i = %ld, j = %ld, iters[i][j] = %d", i, j, static_cast(iters[i][j] == nullptr));
+ // }
+ // }
+
+ Log::Warning("most_freq_bins.size() = %d, num_groups_ = %d, num_used_differential_features_ = %d, num_used_differential_groups_ = %d, ncol = %d", static_cast(most_freq_bins.size()), num_groups_, num_used_differential_features_, num_used_differential_groups_, ncol);
+
+ const int num_original_features = (static_cast(most_freq_bins.size()) - num_used_differential_groups_) / 2;
+ std::vector original_most_freq_bins;
+ std::vector original_offsets;
+ for (int i = 0; i < num_original_features; ++i) {
+ original_most_freq_bins.push_back(most_freq_bins[i]);
+ original_offsets.push_back(offsets[i]);
+ }
+ original_offsets.push_back(offsets[num_original_features]);
+ std::ofstream fout("mutli_val_bin_meta_info_pairwise.txt");
+ fout << "original_most_freq_bins" << std::endl;
+ for (size_t i = 0; i < original_most_freq_bins.size(); ++i) {
+ fout << original_most_freq_bins[i] << std::endl;
+ }
+ fout << "original_offsets" << std::endl;
+ for (size_t i = 0; i < original_offsets.size(); ++i) {
+ fout << original_offsets[i] << std::endl;
+ }
+ fout.close();
+ const data_size_t num_original_data = metadata_.query_boundaries()[metadata_.num_queries()];
+ ret.reset(MultiValBin::CreateMultiValBin(
+ num_original_data, offsets.back(), num_original_features,
+ 1.0 - sum_dense_ratio, original_offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map()));
+ PushDataToMultiValBin(num_original_data, original_most_freq_bins, original_offsets, &iters, ret.get());
+ } else {
+ ret.reset(MultiValBin::CreateMultiValBin(
+ num_data_, offsets.back(), static_cast(most_freq_bins.size()),
+ 1.0 - sum_dense_ratio, offsets, use_pairwise_ranking, metadata_.paired_ranking_item_global_index_map()));
+ PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get());
+ std::ofstream fout("mutli_val_bin_meta_info_no_pairwise.txt");
+ fout << "original_most_freq_bins" << std::endl;
+ for (size_t i = 0; i < most_freq_bins.size(); ++i) {
+ fout << most_freq_bins[i] << std::endl;
+ }
+ fout << "original_offsets" << std::endl;
+ for (size_t i = 0; i < offsets.size(); ++i) {
+ fout << offsets[i] << std::endl;
+ }
+ fout.close();
+ }
ret->FinishLoad();
+ ret->DumpContent();
return ret.release();
}
@@ -613,7 +692,8 @@ TrainingShareStates* Dataset::GetShareStates(
score_t* gradients, score_t* hessians,
const std::vector& is_feature_used, bool is_constant_hessian,
bool force_col_wise, bool force_row_wise,
- const int num_grad_quant_bins) const {
+ const int num_grad_quant_bins,
+ const bool use_pairwise_ranking) const {
Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod",
global_timer);
if (force_col_wise && force_row_wise) {
@@ -632,7 +712,8 @@ TrainingShareStates* Dataset::GetShareStates(
std::vector offsets;
share_state->CalcBinOffsets(
feature_groups_, &offsets, true);
- share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets),
+ Log::Warning("feature_groups_.size() = %ld, offsets.size() = %ld", feature_groups_.size(), offsets.size());
+ share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets, use_pairwise_ranking),
num_data_, feature_groups_, false, true, num_grad_quant_bins);
share_state->is_col_wise = true;
share_state->is_constant_hessian = is_constant_hessian;
@@ -642,7 +723,8 @@ TrainingShareStates* Dataset::GetShareStates(
std::vector offsets;
share_state->CalcBinOffsets(
feature_groups_, &offsets, false);
- share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_,
+ Log::Warning("feature_groups_.size() = %ld, offsets.size() = %ld", feature_groups_.size(), offsets.size());
+ share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets, use_pairwise_ranking), num_data_,
feature_groups_, false, false, num_grad_quant_bins);
share_state->is_col_wise = false;
share_state->is_constant_hessian = is_constant_hessian;
@@ -659,14 +741,14 @@ TrainingShareStates* Dataset::GetShareStates(
auto start_time = std::chrono::steady_clock::now();
std::vector col_wise_offsets;
col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true);
- col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_,
+ col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets, use_pairwise_ranking), num_data_,
feature_groups_, false, true, num_grad_quant_bins);
col_wise_init_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now();
std::vector row_wise_offsets;
row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false);
- row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_,
+ row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets, use_pairwise_ranking), num_data_,
feature_groups_, false, false, num_grad_quant_bins);
row_wise_init_time = std::chrono::steady_clock::now() - start_time;
@@ -727,19 +809,22 @@ template TrainingShareStates* Dataset::GetShareStates(
score_t* gradients, score_t* hessians,
const std::vector& is_feature_used, bool is_constant_hessian,
bool force_col_wise, bool force_row_wise,
- const int num_grad_quant_bins) const;
+ const int num_grad_quant_bins,
+ const bool use_pairwise_ranking) const;
template TrainingShareStates* Dataset::GetShareStates(
score_t* gradients, score_t* hessians,
const std::vector& is_feature_used, bool is_constant_hessian,
bool force_col_wise, bool force_row_wise,
- const int num_grad_quant_bins) const;
+ const int num_grad_quant_bins,
+ const bool use_pairwise_ranking) const;
template TrainingShareStates* Dataset::GetShareStates(
score_t* gradients, score_t* hessians,
const std::vector& is_feature_used, bool is_constant_hessian,
bool force_col_wise, bool force_row_wise,
- const int num_grad_quant_bins) const;
+ const int num_grad_quant_bins,
+ const bool use_pairwise_ranking) const;
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
feature_groups_.clear();
@@ -819,6 +904,234 @@ void Dataset::CreateValid(const Dataset* dataset) {
gpu_device_id_ = dataset->gpu_device_id_;
}
+void Dataset::CreatePairWiseRankingData(const Dataset* dataset, const bool is_validation, const Config& config) {
+ const std::string& pairing_approach = is_validation ? config.pairwise_lambdarank_valid_pairing_approach : config.pairwise_lambdarank_train_pairing_approach;
+ num_data_ = metadata_.BuildPairwiseFeatureRanking(dataset->metadata(), pairing_approach);
+
+ feature_groups_.clear();
+ num_features_ = dataset->num_features_ * 2;
+ num_groups_ = dataset->num_groups_ * 2;
+ max_bin_ = dataset->max_bin_;
+ min_data_in_bin_ = dataset->min_data_in_bin_;
+ bin_construct_sample_cnt_ = dataset->bin_construct_sample_cnt_;
+ use_missing_ = dataset->use_missing_;
+ zero_as_missing_ = dataset->zero_as_missing_;
+ feature2group_.clear();
+ feature2subfeature_.clear();
+ has_raw_ = dataset->has_raw();
+ numeric_feature_map_ = dataset->numeric_feature_map_;
+ num_numeric_features_ = dataset->num_numeric_features_;
+ for (const int nuermic_feature_index : dataset->numeric_feature_map_) {
+ if (nuermic_feature_index != -1) {
+ numeric_feature_map_.push_back(num_numeric_features_);
+ ++num_numeric_features_;
+ } else {
+ numeric_feature_map_.push_back(-1);
+ }
+ }
+ // copy feature bin mapper data
+ feature_need_push_zeros_.clear();
+ group_bin_boundaries_.clear();
+ uint64_t num_total_bin = 0;
+ group_bin_boundaries_.push_back(num_total_bin);
+ group_feature_start_.resize(num_groups_);
+ group_feature_cnt_.resize(num_groups_);
+
+ sampled_values_ = dataset->sampled_values_;
+ sampled_indices_ = dataset->sampled_indices_;
+ num_total_sampled_data_ = dataset->num_total_sampled_data_;
+
+ // create differential features
+ std::vector> diff_feature_bin_mappers;
+ std::vector> original_bin_mappers;
+ std::vector diff_original_feature_index;
+ if (config.use_differential_feature_in_pairwise_ranking) {
+ for (int i = 0; i < dataset->num_total_features_; ++i) {
+ const int inner_feature_index = dataset->InnerFeatureIndex(i);
+ if (inner_feature_index >= 0) {
+ original_bin_mappers.emplace_back(new BinMapper(*dataset->FeatureBinMapper(inner_feature_index)));
+ } else {
+ original_bin_mappers.emplace_back(nullptr);
+ }
+ }
+
+ if (!is_validation) {
+ train_query_boundaries_ = metadata_.query_boundaries();
+ train_num_queries_ = metadata_.num_queries();
+ } else {
+ train_query_boundaries_ = dataset->train_query_boundaries_;
+ train_num_queries_ = dataset->train_num_queries_;
+ }
+ // TODO(shiyu1994): verify the difference in training and validation results even when they share the same dataset
+ CreatePairwiseRankingDifferentialFeatures(*sampled_values_, *sampled_indices_, original_bin_mappers, num_total_sampled_data_, train_query_boundaries_, train_num_queries_, &diff_feature_bin_mappers, &diff_original_feature_index, config);
+ }
+
+ used_feature_map_.clear();
+ used_feature_map_.reserve(2 * dataset->used_feature_map_.size());
+ used_feature_map_.insert(used_feature_map_.begin(), dataset->used_feature_map_.begin(), dataset->used_feature_map_.end());
+
+ for (int i = 0; i < dataset->num_total_features_; ++i) {
+ if (dataset->used_feature_map_[i] != -1) {
+ used_feature_map_.push_back(dataset->used_feature_map_[i] + dataset->num_features_);
+ } else {
+ used_feature_map_.push_back(-1);
+ }
+ }
+
+ std::vector used_diff_features;
+ if (config.use_differential_feature_in_pairwise_ranking) {
+ for (int diff_feature_index = 0; diff_feature_index < static_cast(diff_feature_bin_mappers.size()); ++diff_feature_index) {
+ if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) {
+ num_numeric_features_ += 1;
+ num_features_ += 1;
+ used_diff_features.push_back(diff_feature_index);
+ }
+ }
+ numeric_feature_map_.resize(num_features_, -1);
+ used_feature_map_.resize(2 * dataset->num_total_features_ + static_cast(diff_feature_bin_mappers.size()), -1);
+ }
+
+ const bool is_use_gpu = config.device_type == std::string("cuda") || config.device_type == std::string("gpu");
+ std::vector group_is_multi_val;
+ std::vector> diff_feature_groups =
+ FindGroups(diff_feature_bin_mappers, used_diff_features, Common::Vector2Ptr(sampled_indices_.get()).data(), Common::VectorSize(*sampled_indices_).data(), static_cast(sampled_indices_->size()), num_total_sampled_data_, num_data_, is_use_gpu, false, &group_is_multi_val);
+
+ if (is_validation) {
+ std::vector> flatten_feature_groups;
+ for (const auto& features_in_group : diff_feature_groups) {
+ for (const int feature_index : features_in_group) {
+ flatten_feature_groups.push_back(std::vector{feature_index});
+ }
+ }
+ diff_feature_groups = flatten_feature_groups;
+ }
+
+ int cur_feature_index = 0;
+ for (int i = 0; i < num_groups_; ++i) {
+ int original_group_index = i % dataset->num_groups_;
+ int original_group_feature_start = dataset->group_feature_start_[original_group_index];
+ const int is_first_or_second_in_pairing = i / dataset->num_groups_; // 0 for first, 1 for second
+ group_feature_start_[i] = cur_feature_index;
+ for (int feature_index_in_group = 0; feature_index_in_group < dataset->group_feature_cnt_[original_group_index]; ++feature_index_in_group) {
+ const BinMapper* feature_bin_mapper = dataset->FeatureBinMapper(original_group_feature_start + feature_index_in_group);
+ if (feature_bin_mapper->GetDefaultBin() != feature_bin_mapper->GetMostFreqBin()) {
+ feature_need_push_zeros_.push_back(cur_feature_index);
+ }
+ feature2group_.push_back(i);
+ feature2subfeature_.push_back(dataset->feature2subfeature_[original_group_feature_start + feature_index_in_group]);
+ cur_feature_index += 1;
+ }
+ feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), dataset->num_data(), is_first_or_second_in_pairing, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_global_index_map()));
+ num_total_bin += dataset->FeatureGroupNumBin(original_group_index);
+ group_bin_boundaries_.push_back(num_total_bin);
+ group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index];
+ }
+
+ Log::Warning("cur_feature_index = %d", cur_feature_index);
+
+ num_used_differential_features_ = 0;
+ num_used_differential_groups_ = static_cast(diff_feature_groups.size());
+ if (config.use_differential_feature_in_pairwise_ranking) {
+ for (size_t i = 0; i < diff_feature_groups.size(); ++i) {
+ const std::vector& features_in_group = diff_feature_groups[i];
+ group_feature_start_.push_back(cur_feature_index);
+ int num_features_in_group = 0;
+ std::vector> ori_bin_mappers;
+ std::vector> ori_bin_mappers_for_diff;
+ std::vector> diff_bin_mappers;
+ for (size_t j = 0; j < features_in_group.size(); ++j) {
+ const int diff_feature_index = features_in_group[j];
+ if (!diff_feature_bin_mappers[diff_feature_index]->is_trivial()) {
+ if (diff_feature_bin_mappers[diff_feature_index]->GetDefaultBin() != diff_feature_bin_mappers[diff_feature_index]->GetMostFreqBin()) {
+ feature_need_push_zeros_.push_back(cur_feature_index);
+ }
+ feature2group_.push_back(i + num_groups_);
+ feature2subfeature_.push_back(num_features_in_group);
+ numeric_feature_map_[cur_feature_index] = cur_feature_index;
+ used_feature_map_[diff_feature_index + dataset->num_total_features_ * 2] = cur_feature_index;
+ ++cur_feature_index;
+ ++num_features_in_group;
+ ++num_used_differential_features_;
+ const int ori_feature_index = dataset->InnerFeatureIndex(diff_original_feature_index[diff_feature_index]);
+ ori_bin_mappers.emplace_back(new BinMapper(*dataset->FeatureBinMapper(ori_feature_index)));
+ ori_bin_mappers_for_diff.emplace_back(new BinMapper(*dataset->FeatureBinMapper(ori_feature_index)));
+ diff_bin_mappers.emplace_back(new BinMapper(*diff_feature_bin_mappers[diff_feature_index]));
+ }
+ }
+
+ FeatureGroup feature_group(num_features_in_group, 0, &ori_bin_mappers, dataset->num_data(), i + num_groups_);
+
+ const int num_threads = OMP_NUM_THREADS();
+ #pragma omp parallel for schedule(static) num_threads(num_threads)
+ for (int j = 0; j < num_features_in_group; ++j) {
+ const int tid = omp_get_thread_num();
+ const int diff_feature_index = features_in_group[j];
+ const int original_feature_index = dataset->InnerFeatureIndex(diff_original_feature_index[diff_feature_index]);
+ const BinMapper* original_feature_bin_mapper = dataset->FeatureBinMapper(original_feature_index);
+ BinIterator* original_feature_iterator = dataset->FeatureIterator(original_feature_index);
+ original_feature_iterator->Reset(0);
+ for (int k = 0; k < dataset->num_data(); ++k) {
+ feature_group.PushData(tid, j, k, original_feature_bin_mapper->BinToValue(original_feature_iterator->Get(k)));
+ }
+ }
+
+ feature_group.FinishLoad();
+
+ feature_groups_.emplace_back(new PairwiseRankingDifferentialFeatureGroup(feature_group, dataset->num_data(), 2, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_global_index_map(), diff_bin_mappers, ori_bin_mappers_for_diff));
+
+ group_feature_cnt_.push_back(cur_feature_index - group_feature_start_.back());
+ num_total_bin += feature_groups_.back()->num_total_bin_;
+ group_bin_boundaries_.push_back(num_total_bin);
+ }
+
+ num_groups_ += static_cast(diff_feature_groups.size());
+ }
+
+ Log::Warning("cur_feature_index = %d", cur_feature_index);
+
+ feature_groups_.shrink_to_fit();
+
+ feature_names_.clear();
+ for (const std::string& feature_name : dataset->feature_names_) {
+ feature_names_.push_back(feature_name + std::string("_i"));
+ }
+ for (const std::string& feature_name : dataset->feature_names_) {
+ feature_names_.push_back(feature_name + std::string("_j"));
+ }
+ if (config.use_differential_feature_in_pairwise_ranking) {
+ for (const int real_feature_index : diff_original_feature_index) {
+ feature_names_.push_back(dataset->feature_names_[real_feature_index] + std::string("_k"));
+ }
+ }
+
+ real_feature_idx_.clear();
+ for (const int idx : dataset->real_feature_idx_) {
+ real_feature_idx_.push_back(idx);
+ }
+ for (const int idx : dataset->real_feature_idx_) {
+ real_feature_idx_.push_back(idx + dataset->num_total_features_);
+ }
+ if (config.use_differential_feature_in_pairwise_ranking) {
+ for (const auto& features_in_diff_group : diff_feature_groups) {
+ for (const int idx : features_in_diff_group) {
+ real_feature_idx_.push_back(idx + 2 * dataset->num_total_features_);
+ }
+ }
+ }
+
+ num_total_features_ = dataset->num_total_features_ * 2 + static_cast(diff_feature_bin_mappers.size());
+
+ forced_bin_bounds_.clear();
+ forced_bin_bounds_.reserve(2 * dataset->num_total_features_);
+ forced_bin_bounds_.insert(forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end());
+ forced_bin_bounds_.insert(forced_bin_bounds_.begin() + dataset->forced_bin_bounds_.size(), dataset->forced_bin_bounds_.begin(), dataset->forced_bin_bounds_.end());
+ forced_bin_bounds_.resize(num_total_features_);
+
+ label_idx_ = dataset->label_idx_;
+ device_type_ = dataset->device_type_;
+ gpu_device_id_ = dataset->gpu_device_id_;
+}
+
void Dataset::ReSize(data_size_t num_data) {
if (num_data_ != num_data) {
num_data_ = num_data;
@@ -838,6 +1151,7 @@ void Dataset::CopySubrow(const Dataset* fullset,
data_size_t num_used_indices, bool need_meta_data) {
CHECK_EQ(num_used_indices, num_data_);
+ Log::Warning("copy subrow here !!!!");
std::vector group_ids, subfeature_ids;
group_ids.reserve(num_features_);
subfeature_ids.reserve(num_features_);
@@ -853,20 +1167,24 @@ void Dataset::CopySubrow(const Dataset* fullset,
subfeature_ids.emplace_back(-1);
}
}
+ Log::Warning("copy subrow step 0 !!!!");
int num_copy_tasks = static_cast(group_ids.size());
-
- OMP_INIT_EX();
- #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(dynamic)
+ // OMP_INIT_EX();
+ // #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(dynamic)
for (int task_id = 0; task_id < num_copy_tasks; ++task_id) {
- OMP_LOOP_EX_BEGIN();
+ // OMP_LOOP_EX_BEGIN();
+ Log::Warning("before copy sub row by col 0");
int group = group_ids[task_id];
int subfeature = subfeature_ids[task_id];
+ Log::Warning("before copy sub row by col 1");
feature_groups_[group]->CopySubrowByCol(fullset->feature_groups_[group].get(),
used_indices, num_used_indices, subfeature);
- OMP_LOOP_EX_END();
+ Log::Warning("after copy sub row by col");
+ // OMP_LOOP_EX_END();
}
- OMP_THROW_EX();
+ // OMP_THROW_EX();
+ Log::Warning("copy subrow step 1 !!!!");
if (need_meta_data) {
metadata_.Init(fullset->metadata_, used_indices, num_used_indices);
}
@@ -886,6 +1204,8 @@ void Dataset::CopySubrow(const Dataset* fullset,
device_type_ = fullset->device_type_;
gpu_device_id_ = fullset->gpu_device_id_;
+ Log::Warning("copy subrow step 2 !!!!");
+
#ifdef USE_CUDA
if (device_type_ == std::string("cuda")) {
if (cuda_column_data_ == nullptr) {
@@ -1331,6 +1651,7 @@ void Dataset::ConstructHistogramsInner(
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
const int num_bin = feature_groups_[group]->num_total_bin_;
+ feature_groups_[group]->bin_data_->group_index_ = gi;
if (USE_QUANT_GRAD) {
if (HIST_BITS == 16) {
auto data_ptr = reinterpret_cast(reinterpret_cast