Skip to content

Commit

Permalink
templates for bins in pairwise ranking dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
shiyu1994 committed Dec 5, 2023
1 parent 9d0afd9 commit 0cb436d
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 60 deletions.
4 changes: 2 additions & 2 deletions include/LightGBM/bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ class Bin {
* \return The bin data object
*/
template <template<typename> typename PAIRWISE_BIN_TYPE>
static Bin* CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin);
static Bin* CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map);

/*!
* \brief Create object for bin data of one feature, used for pairwise ranking, for an original sparse bin
Expand All @@ -484,7 +484,7 @@ class Bin {
* \return The bin data object
*/
template <template<typename> typename PAIRWISE_BIN_TYPE>
static Bin* CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin);
static Bin* CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map);

/*!
* \brief Deep copy the bin
Expand Down
4 changes: 4 additions & 0 deletions include/LightGBM/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,10 @@ class Metadata {
}
}

/*! \brief Number of entries in paired_ranking_item_index_map_, converted to data_size_t */
inline data_size_t paired_ranking_item_index_map_size() const {
  const auto num_entries = paired_ranking_item_index_map_.size();
  return static_cast<data_size_t>(num_entries);
}

/*!
* \brief Get data boundaries on queries; returns nullptr if they do not exist
* we assume data is ordered by query,
Expand Down
12 changes: 6 additions & 6 deletions include/LightGBM/pairwise_ranking_feature_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ class PairwiseRankingFeatureGroup: public FeatureGroup {
* \param is_first_or_second_in_pairing Mark whether features in this group belong to the first or second element in the pairing
*/

PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_data, const int is_first_or_second_in_pairing):
FeatureGroup(other, num_data), is_first_or_second_in_pairing_(is_first_or_second_in_pairing) {}
PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map):
FeatureGroup(other, num_original_data), paired_ranking_item_index_map_(paired_ranking_item_index_map), num_data_(num_pairs), is_first_or_second_in_pairing_(is_first_or_second_in_pairing) {}

/*!
* \brief Constructor from memory when data is present
Expand Down Expand Up @@ -62,11 +62,11 @@ class PairwiseRankingFeatureGroup: public FeatureGroup {
* \param memory Pointer of memory
* \param group_id Id of group
*/
const char* LoadDefinitionFromMemory(const void* memory, int group_id) {
const char* LoadDefinitionFromMemory(const void* /*memory*/, int /*group_id*/) {
// TODO(shiyu1994)
// NOTE(review): the return type is const char* but the body has no return statement;
// calling this and reaching the closing brace is undefined behavior until the TODO
// above is implemented.
// NOTE(review): the two signature lines above differ only in commenting out the
// parameter names; presumably they are the before/after lines of a commit diff - confirm.
}

inline BinIterator* SubFeatureIterator(int sub_feature) {
inline BinIterator* SubFeatureIterator(int /*sub_feature*/) {
// TODO(shiyu1994)
// NOTE(review): the return type is BinIterator* but the body has no return statement;
// calling this and reaching the closing brace is undefined behavior until the TODO
// above is implemented.
// NOTE(review): the two signature lines above differ only in commenting out the
// parameter name; presumably they are the before/after lines of a commit diff - confirm.
}

Expand All @@ -79,11 +79,11 @@ class PairwiseRankingFeatureGroup: public FeatureGroup {
}

private:
template <typename PAIRWISE_BIN_TYPE>
template <template<typename> typename PAIRWISE_BIN_TYPE>
void CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse);

void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override;

/*! \brief Pairwise data index to original data indices for ranking with pairwise features */
const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map_;
/*! \brief Number of pairwise data */
Expand Down
26 changes: 13 additions & 13 deletions src/io/bin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -634,36 +634,36 @@ namespace LightGBM {
}

/*!
* \brief Build a pairwise ranking bin on top of a freshly allocated dense bin.
*        The dense storage type is picked from num_bin: <= 16 gives DenseBin<uint8_t, true>,
*        <= 256 gives DenseBin<uint8_t, false>, <= 65536 gives DenseBin<uint16_t, false>,
*        anything larger gives DenseBin<uint32_t, false>.
* NOTE(review): the signature and every return statement appear twice below, once taking
*        (num_data, ..., min_bin, max_bin, most_freq_bin) and once taking
*        (num_original_data, num_bin, num_pairs, ...); presumably these are the before/after
*        lines of a commit diff rather than one compilable body - confirm before editing.
* \param num_original_data Size passed to the inner DenseBin constructor (num_pairs variant)
* \param num_bin Number of bins, used only to choose the storage type
* \param num_pairs First argument passed to the PAIRWISE_BIN_TYPE constructor (num_pairs variant)
* \param paired_ranking_item_index_map Pointer forwarded unchanged to the PAIRWISE_BIN_TYPE constructor
* \return Pointer to the PAIRWISE_BIN_TYPE object allocated with new
*/
template <template<typename> typename PAIRWISE_BIN_TYPE>
Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) {
Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map) {
if (num_bin <= 16) {
return new PAIRWISE_BIN_TYPE<DenseBin<uint8_t, true>>(num_data, paired_ranking_item_index_map, new DenseBin<uint8_t, true>(num_data), min_bin, max_bin, most_freq_bin);
return new PAIRWISE_BIN_TYPE<DenseBin<uint8_t, true>>(num_pairs, paired_ranking_item_index_map, new DenseBin<uint8_t, true>(num_original_data));
} else if (num_bin <= 256) {
return new PAIRWISE_BIN_TYPE<DenseBin<uint8_t, false>>(num_data, paired_ranking_item_index_map, new DenseBin<uint8_t, false>(num_data), min_bin, max_bin, most_freq_bin);
return new PAIRWISE_BIN_TYPE<DenseBin<uint8_t, false>>(num_pairs, paired_ranking_item_index_map, new DenseBin<uint8_t, false>(num_original_data));
} else if (num_bin <= 65536) {
return new PAIRWISE_BIN_TYPE<DenseBin<uint16_t, false>>(num_data, paired_ranking_item_index_map, new DenseBin<uint16_t, false>(num_data), min_bin, max_bin, most_freq_bin);
return new PAIRWISE_BIN_TYPE<DenseBin<uint16_t, false>>(num_pairs, paired_ranking_item_index_map, new DenseBin<uint16_t, false>(num_original_data));
} else {
return new PAIRWISE_BIN_TYPE<DenseBin<uint32_t, false>>(num_data, paired_ranking_item_index_map, new DenseBin<uint32_t, false>(num_data), min_bin, max_bin, most_freq_bin);
return new PAIRWISE_BIN_TYPE<DenseBin<uint32_t, false>>(num_pairs, paired_ranking_item_index_map, new DenseBin<uint32_t, false>(num_original_data));
}
}

/*!
* \brief Build a pairwise ranking bin on top of a freshly allocated sparse bin.
*        The sparse storage type is picked from num_bin: <= 256 gives SparseBin<uint8_t>,
*        <= 65536 gives SparseBin<uint16_t>, anything larger gives SparseBin<uint32_t>.
* NOTE(review): the signature and every return statement appear twice below, once taking
*        (num_data, ..., min_bin, max_bin, most_freq_bin) and once taking
*        (num_original_data, num_bin, num_pairs, ...); presumably these are the before/after
*        lines of a commit diff rather than one compilable body - confirm before editing.
* \param num_original_data Size passed to the inner SparseBin constructor (num_pairs variant)
* \param num_bin Number of bins, used only to choose the storage type
* \param num_pairs First argument passed to the PAIRWISE_BIN_TYPE constructor (num_pairs variant)
* \param paired_ranking_item_index_map Pointer forwarded unchanged to the PAIRWISE_BIN_TYPE constructor
* \return Pointer to the PAIRWISE_BIN_TYPE object allocated with new
*/
template <template<typename> typename PAIRWISE_BIN_TYPE>
Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) {
Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map) {
if (num_bin <= 256) {
return new PAIRWISE_BIN_TYPE<SparseBin<uint8_t>>(num_data, paired_ranking_item_index_map, new SparseBin<uint8_t>(num_data), min_bin, max_bin, most_freq_bin);
return new PAIRWISE_BIN_TYPE<SparseBin<uint8_t>>(num_pairs, paired_ranking_item_index_map, new SparseBin<uint8_t>(num_original_data));
} else if (num_bin <= 65536) {
return new PAIRWISE_BIN_TYPE<SparseBin<uint16_t>>(num_data, paired_ranking_item_index_map, new SparseBin<uint16_t>(num_data), min_bin, max_bin, most_freq_bin);
return new PAIRWISE_BIN_TYPE<SparseBin<uint16_t>>(num_pairs, paired_ranking_item_index_map, new SparseBin<uint16_t>(num_original_data));
} else {
return new PAIRWISE_BIN_TYPE<SparseBin<uint32_t>>(num_data, paired_ranking_item_index_map, new SparseBin<uint32_t>(num_data), min_bin, max_bin, most_freq_bin);
return new PAIRWISE_BIN_TYPE<SparseBin<uint32_t>>(num_pairs, paired_ranking_item_index_map, new SparseBin<uint32_t>(num_original_data));
}
}

// Explicit instantiation definitions of CreateSparsePairwiseRankingBin and
// CreateDensePairwiseRankingBin for the template arguments PairwiseRankingFirstBin
// and PairwiseRankingSecondBin.
// NOTE(review): each instantiation appears twice, once with the
// (num_data, ..., min_bin, max_bin, most_freq_bin) parameter list and once with the
// (num_original_data, num_bin, num_pairs, ...) list; presumably these are the
// before/after lines of a commit diff - confirm before editing.
template Bin* Bin::CreateSparsePairwiseRankingBin<PairwiseRankingFirstBin>(data_size_t num_data, int num_bin, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin);
template Bin* Bin::CreateSparsePairwiseRankingBin<PairwiseRankingFirstBin>(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map);

template Bin* Bin::CreateSparsePairwiseRankingBin<PairwiseRankingSecondBin>(data_size_t num_data, int num_bin, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin);
template Bin* Bin::CreateSparsePairwiseRankingBin<PairwiseRankingSecondBin>(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map);

template Bin* Bin::CreateDensePairwiseRankingBin<PairwiseRankingFirstBin>(data_size_t num_data, int num_bin, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin);
template Bin* Bin::CreateDensePairwiseRankingBin<PairwiseRankingFirstBin>(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map);

template Bin* Bin::CreateDensePairwiseRankingBin<PairwiseRankingSecondBin>(data_size_t num_data, int num_bin, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin);
template Bin* Bin::CreateDensePairwiseRankingBin<PairwiseRankingSecondBin>(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map);

MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature,
double sparse_rate, const std::vector<uint32_t>& offsets) {
Expand Down
4 changes: 2 additions & 2 deletions src/io/dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ void Dataset::CreateValid(const Dataset* dataset) {
gpu_device_id_ = dataset->gpu_device_id_;
}

void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vector<std::pair<data_size_t, data_size_t>> pair_index_map) {
void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vector<std::pair<data_size_t, data_size_t>> /* TODO(shiyu1994) pair_index_map*/) {
metadata_.BuildPairwiseFeatureRanking(dataset->metadata());

feature_groups_.clear();
Expand Down Expand Up @@ -859,7 +859,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vector<std:
feature2subfeature_.push_back(dataset->feature2subfeature_[original_group_feature_start + feature_index_in_group]);
cur_feature_index += 1;
}
feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), num_data_, is_first_or_second_in_pairing));
feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), num_data_, is_first_or_second_in_pairing, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_index_map()));
num_total_bin += dataset->FeatureGroupNumBin(original_group_index);
group_bin_boundaries_.push_back(num_total_bin);
group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index];
Expand Down
37 changes: 13 additions & 24 deletions src/io/pairwise_lambdarank_bin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,44 +109,33 @@ class PairwiseRankingSecondIterator: public BinIterator {
data_size_t prev_index_;
};

template <typename BIN_TYPE>
class PairwiseRankingFirstBin: public BIN_TYPE {
template <typename BIN_TYPE, template<typename> typename ITERATOR_TYPE>
class PairwiseRankingBin: public BIN_TYPE {
public:
PairwiseRankingFirstBin(data_size_t num_data, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin): BIN_TYPE(0), unpaired_bin_(unpaired_bin), min_bin_(min_bin), max_bin_(max_bin), most_freq_bin_(most_freq_bin) {
paired_ranking_item_index_map_ = paired_ranking_item_index_map;
PairwiseRankingBin(data_size_t num_data, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): BIN_TYPE(0), paired_ranking_item_index_map_(paired_ranking_item_index_map), unpaired_bin_(unpaired_bin) {
num_data_ = num_data;
}

BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override {
return new PairwiseRankingFirstIterator<BIN_TYPE>(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin_, max_bin_, most_freq_bin_);
return new ITERATOR_TYPE<BIN_TYPE>(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin);
}

private:
protected:
const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map_;
const std::shared_ptr<const BIN_TYPE> unpaired_bin_;
const uint32_t min_bin_;
const uint32_t max_bin_;
const uint32_t most_freq_bin_;
data_size_t num_data_;
};

template <typename BIN_TYPE>
class PairwiseRankingSecondBin: public BIN_TYPE {
class PairwiseRankingFirstBin: public PairwiseRankingBin<BIN_TYPE, PairwiseRankingFirstIterator> {
public:
PairwiseRankingSecondBin(data_size_t num_data, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin): BIN_TYPE(0), unpaired_bin_(unpaired_bin), min_bin_(min_bin), max_bin_(max_bin), most_freq_bin_(most_freq_bin) {
paired_ranking_item_index_map_ = paired_ranking_item_index_map;
num_data_ = num_data;
}

BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override {
return new PairwiseRankingSecondIterator<BIN_TYPE>(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin_, max_bin_, most_freq_bin_);
}
PairwiseRankingFirstBin(data_size_t num_data, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): PairwiseRankingBin<BIN_TYPE, PairwiseRankingFirstIterator>(num_data, paired_ranking_item_index_map, unpaired_bin) {}
};

private:
const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map_;
const std::shared_ptr<const BIN_TYPE> unpaired_bin_;
const uint32_t min_bin_;
const uint32_t max_bin_;
const uint32_t most_freq_bin_;
template <typename BIN_TYPE>
class PairwiseRankingSecondBin: public PairwiseRankingBin<BIN_TYPE, PairwiseRankingSecondIterator> {
public:
PairwiseRankingSecondBin(data_size_t num_data, const std::pair<data_size_t, data_size_t>* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): PairwiseRankingBin<BIN_TYPE, PairwiseRankingSecondIterator>(num_data, paired_ranking_item_index_map, unpaired_bin) {}
};

} // LightGBM
Expand Down
19 changes: 6 additions & 13 deletions src/io/pairwise_ranking_feature_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,17 @@ namespace LightGBM {

template <template<typename> typename PAIRWISE_BIN_TYPE>
void PairwiseRankingFeatureGroup::CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) {
CHECK(!is_multi_val); // do not support multi-value bin for now
if (is_multi_val) {
multi_bin_data_.clear();
for (int i = 0; i < num_feature_; ++i) {
uint32_t most_freq_bin = bin_mappers_[i]->GetMostFreqBin();
int addi = most_freq_bin == 0 ? 0 : 1;
if (!is_multi_val) {
uint32_t min_bin = bin_offsets_[i];
uint32_t max_bin = bin_offsets_[i + 1] - 1;
} else {
uint32_t min_bin = 1;
uint32_t max_bin = bin_mappers_[i]->num_bin() - 1 + addi;
}
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingBin<PAIRWISE_BIN_TYPE>(
num_data, bin_mappers_[i]->num_bin() + addi, paired_ranking_item_index_map_));
num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_));
} else {
multi_bin_data_.emplace_back(
Bin::CreateDensePairwiseRankingBin<PAIRWISE_BIN_TYPE>(num_data, bin_mappers_[i]->num_bin() + addi, paired_ranking_item_index_map_));
Bin::CreateDensePairwiseRankingBin<PAIRWISE_BIN_TYPE>(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_));
}
}
is_multi_val_ = true;
Expand All @@ -37,10 +30,10 @@ void PairwiseRankingFeatureGroup::CreateBinDataInner(int num_data, bool is_multi
(!force_dense && num_feature_ == 1 &&
bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
is_sparse_ = true;
bin_data_.reset(Bin::CreateSparsePairwiseRankingBin<PAIRWISE_BIN_TYPE>(num_data, num_total_bin_, paired_ranking_item_index_map_));
bin_data_.reset(Bin::CreateSparsePairwiseRankingBin<PAIRWISE_BIN_TYPE>(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_));
} else {
is_sparse_ = false;
bin_data_.reset(Bin::CreateDensePairwiseRankingBin<PAIRWISE_BIN_TYPE>(num_data, num_total_bin_, paired_ranking_item_index_map_));
bin_data_.reset(Bin::CreateDensePairwiseRankingBin<PAIRWISE_BIN_TYPE>(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_));
}
is_multi_val_ = false;
}
Expand Down

0 comments on commit 0cb436d

Please sign in to comment.