Skip to content

Commit

Permalink
[c++] Fixed Predictor lifecycle and trees initialization in Contrib mode (#6778)
Browse files Browse the repository at this point in the history

* 1) Fixed Predictor lifecycle
2) Fixed Boosting trees initialization

#5482

* Added tests for LGBM_BoosterPredictForMat in Contrib mode

* #6778 Reverted indentation to 4 spaces

---------

Co-authored-by: James Lamb <[email protected]>
Co-authored-by: Nikita Titov <[email protected]>
  • Loading branch information
3 people authored Jan 20, 2025
1 parent 226e7f7 commit 3654eca
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 17 deletions.
13 changes: 12 additions & 1 deletion src/boosting/gbdt.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,11 +433,18 @@ class GBDT : public GBDTBase {
num_iteration_for_pred_ = num_iteration_for_pred_ - start_iteration;
}
start_iteration_for_pred_ = start_iteration;
if (is_pred_contrib) {

if (is_pred_contrib && !models_initialized_) {
std::lock_guard<std::mutex> lock(instance_mutex_);
if (models_initialized_)
return;

#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < static_cast<int>(models_.size()); ++i) {
models_[i]->RecomputeMaxDepth();
}

models_initialized_ = true;
}
}

Expand Down Expand Up @@ -548,6 +555,10 @@ class GBDT : public GBDTBase {
int max_feature_idx_;
/*! \brief Parser config file content */
std::string parser_config_str_ = "";
/*! \brief Are the models initialized (passed RecomputeMaxDepth phase) */
bool models_initialized_ = false;
/*! \brief Mutex for exclusive models initialization */
std::mutex instance_mutex_;

#ifdef USE_CUDA
/*! \brief First order derivative of training data */
Expand Down
10 changes: 5 additions & 5 deletions src/c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ class Booster {
*out_len = single_row_predictor->num_pred_in_one_row;
}

Predictor CreatePredictor(int start_iteration, int num_iteration, int predict_type, int ncol, const Config& config) const {
std::shared_ptr<Predictor> CreatePredictor(int start_iteration, int num_iteration, int predict_type, int ncol, const Config& config) const {
if (!config.predict_disable_shape_check && ncol != boosting_->MaxFeatureIdx() + 1) {
Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n" \
"You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", ncol, boosting_->MaxFeatureIdx() + 1);
Expand All @@ -478,7 +478,7 @@ class Booster {
is_raw_score = false;
}

return Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
return std::make_shared<Predictor>(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin);
}

Expand All @@ -496,7 +496,7 @@ class Booster {
predict_contrib = true;
}
int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(start_iteration, num_iteration, is_predict_leaf, predict_contrib);
auto pred_fun = predictor.GetPredictFunction();
auto pred_fun = predictor->GetPredictFunction();
OMP_INIT_EX();
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
for (int i = 0; i < nrow; ++i) {
Expand All @@ -517,7 +517,7 @@ class Booster {
int32_t** out_indices, void** out_data, int data_type,
bool* is_data_float32_ptr, int num_matrices) const {
auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config);
auto pred_sparse_fun = predictor.GetPredictSparseFunction();
auto pred_sparse_fun = predictor->GetPredictSparseFunction();
std::vector<std::vector<std::unordered_map<int, double>>>& agg = *agg_ptr;
OMP_INIT_EX();
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
Expand Down Expand Up @@ -652,7 +652,7 @@ class Booster {
// Get the number of trees per iteration (for multiclass scenario we output multiple sparse matrices)
int num_matrices = boosting_->NumModelPerIteration();
auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config);
auto pred_sparse_fun = predictor.GetPredictSparseFunction();
auto pred_sparse_fun = predictor->GetPredictSparseFunction();
bool is_col_ptr_int32 = false;
bool is_data_float32 = false;
int num_output_cols = ncol + 1;
Expand Down
66 changes: 55 additions & 11 deletions tests/cpp_tests/test_single_row.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

using LightGBM::TestUtils;

TEST(SingleRow, JustWorks) {
void test_predict_type(int predict_type, int num_predicts) {
// Load some test data
int result;

Expand All @@ -37,17 +37,19 @@ TEST(SingleRow, JustWorks) {
booster_handle,
&n_features);
EXPECT_EQ(0, result) << "LGBM_BoosterGetNumFeature result code: " << result;
EXPECT_EQ(28, n_features) << "LGBM_BoosterGetNumFeature number of features: " << n_features;

// Run a single row prediction and compare with regular Mat prediction:
int64_t output_size;
result = LGBM_BoosterCalcNumPredict(
booster_handle,
1,
C_API_PREDICT_NORMAL, // predict_type
predict_type, // predict_type
0, // start_iteration
-1, // num_iteration
&output_size);
EXPECT_EQ(0, result) << "LGBM_BoosterCalcNumPredict result code: " << result;
EXPECT_EQ(num_predicts, output_size) << "LGBM_BoosterCalcNumPredict output size: " << output_size;

std::ifstream test_file("examples/binary_classification/binary.test");
std::vector<double> test;
Expand Down Expand Up @@ -77,21 +79,55 @@ TEST(SingleRow, JustWorks) {
test_set_size, // nrow
n_features, // ncol
1, // is_row_major
C_API_PREDICT_NORMAL, // predict_type
predict_type, // predict_type
0, // start_iteration
-1, // num_iteration
"",
&written,
&mat_output[0]);
EXPECT_EQ(0, result) << "LGBM_BoosterPredictForMat result code: " << result;

// Now let's run with the single row fast prediction API:
// Test LGBM_BoosterPredictForMat in multi-threaded mode
const int kNThreads = 10;
const int numIterations = 5;
std::vector<std::thread> predict_for_mat_threads(kNThreads);
for (int i = 0; i < kNThreads; i++) {
predict_for_mat_threads[i] = std::thread(
[
i, test_set_size, output_size, n_features,
test = &test[0], booster_handle, predict_type, numIterations
]() {
for (int j = 0; j < numIterations; j++) {
int result;
std::vector<double> mat_output(output_size * test_set_size, -1);
int64_t written;
result = LGBM_BoosterPredictForMat(
booster_handle,
&test[0],
C_API_DTYPE_FLOAT64,
test_set_size, // nrow
n_features, // ncol
1, // is_row_major
predict_type, // predict_type
0, // start_iteration
-1, // num_iteration
"",
&written,
&mat_output[0]);
EXPECT_EQ(0, result) << "LGBM_BoosterPredictForMat result code: " << result;
}
});
}
for (std::thread& t : predict_for_mat_threads) {
t.join();
}

// Now let's run with the single row fast prediction API:
FastConfigHandle fast_configs[kNThreads];
for (int i = 0; i < kNThreads; i++) {
result = LGBM_BoosterPredictForMatSingleRowFastInit(
booster_handle,
C_API_PREDICT_NORMAL, // predict_type
predict_type, // predict_type
0, // start_iteration
-1, // num_iteration
C_API_DTYPE_FLOAT64,
Expand All @@ -102,14 +138,14 @@ TEST(SingleRow, JustWorks) {
}

std::vector<double> single_row_output(output_size * test_set_size, -1);
std::vector<std::thread> threads(kNThreads);
std::vector<std::thread> single_row_threads(kNThreads);
int batch_size = (test_set_size + kNThreads - 1) / kNThreads; // round up
for (int i = 0; i < kNThreads; i++) {
threads[i] = std::thread(
single_row_threads[i] = std::thread(
[
i, batch_size, test_set_size, output_size, n_features,
test = &test[0], fast_configs = &fast_configs[0], single_row_output = &single_row_output[0]
](){
test = &test[0], fast_configs = &fast_configs[0], single_row_output = &single_row_output[0]
]() {
int result;
int64_t written;
for (int j = i * batch_size; j < std::min((i + 1) * batch_size, test_set_size); j++) {
Expand All @@ -122,8 +158,8 @@ TEST(SingleRow, JustWorks) {
EXPECT_EQ(written, output_size) << "LGBM_BoosterPredictForMatSingleRowFast unexpected written output size";
}
});
}
for (std::thread &t : threads) {
}
for (std::thread& t : single_row_threads) {
t.join();
}

Expand All @@ -141,3 +177,11 @@ TEST(SingleRow, JustWorks) {
result = LGBM_DatasetFree(train_dataset);
EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
}

// Normal (class-probability/score) prediction: expects exactly one output
// value per input row (second argument of test_predict_type).
TEST(SingleRow, Normal) {
test_predict_type(C_API_PREDICT_NORMAL, 1);
}

// Feature-contribution (SHAP) prediction: expects 29 output values per row —
// the dataset has 28 features (asserted earlier via LGBM_BoosterGetNumFeature),
// presumably plus one extra term for the expected value / bias. This path
// exercises the Contrib-mode Predictor lifecycle fix from #6778, including the
// multi-threaded LGBM_BoosterPredictForMat calls in test_predict_type.
TEST(SingleRow, Contrib) {
test_predict_type(C_API_PREDICT_CONTRIB, 29);
}

0 comments on commit 3654eca

Please sign in to comment.