diff --git a/.gitignore b/.gitignore
index f0404a7..61d7627 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,6 @@ Cargo.lock
 lightgbm-sys/target
 
 # example
-examples/target
+examples/binary_classification/target/
+examples/multiclass_classification/target/
+examples/regression/target/
diff --git a/Cargo.toml b/Cargo.toml
index 39223c0..dde4452 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lightgbm"
-version = "0.1.1"
+version = "0.1.2"
 authors = ["vaaaaanquish <6syun9@gmail.com>"]
 license = "MIT"
 repository = "https://github.com/vaaaaanquish/LightGBM"
@@ -11,3 +11,5 @@ exclude = [".gitignore", ".gitmodules", "examples", "lightgbm-sys"]
 [dependencies]
 lightgbm-sys = "0.1.0"
 libc = "0.2.81"
+derive_builder = "0.5.1"
+serde_json = "1.0.59"
diff --git a/examples/binary_classification/Cargo.toml b/examples/binary_classification/Cargo.toml
new file mode 100644
index 0000000..72ee4fc
--- /dev/null
+++ b/examples/binary_classification/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "lightgbm-example-binary-classification"
+version = "0.1.0"
+authors = ["vaaaaanquish <6syun9@gmail.com>"]
+publish = false
+
+[dependencies]
+lightgbm = { path = "../../" }
+csv = "1.1.5"
+itertools = "0.9.0"
+serde_json = "1.0.59"
diff --git a/examples/binary_classification/src/main.rs b/examples/binary_classification/src/main.rs
new file mode 100644
index 0000000..8d2bf35
--- /dev/null
+++ b/examples/binary_classification/src/main.rs
@@ -0,0 +1,55 @@
+extern crate lightgbm;
+extern crate csv;
+extern crate serde_json;
+extern crate itertools;
+
+
+use itertools::zip;
+use lightgbm::{Dataset, Booster};
+use serde_json::json;
+
+
+fn load_file(file_path: &str) -> (Vec<Vec<f64>>, Vec<f32>) {
+    let rdr = csv::ReaderBuilder::new().has_headers(false).delimiter(b'\t').from_path(file_path);
+    let mut labels: Vec<f32> = Vec::new();
+    let mut features: Vec<Vec<f64>> = Vec::new();
+    for result in rdr.unwrap().records() {
+        let record = result.unwrap();
+        let label = record[0].parse::<f32>().unwrap();
+        let feature: Vec<f64> = record.iter().map(|x| x.parse::<f64>().unwrap()).collect::<Vec<f64>>()[1..].to_vec();
+        labels.push(label);
+        features.push(feature);
+    }
+    (features, labels)
+}
+
+
+fn main() -> std::io::Result<()> {
+    let (train_features, train_labels) = load_file("../../lightgbm-sys/lightgbm/examples/binary_classification/binary.train");
+    let (test_features, test_labels) = load_file("../../lightgbm-sys/lightgbm/examples/binary_classification/binary.test");
+    let train_dataset = Dataset::from_mat(train_features, train_labels).unwrap();
+
+    let params = json!{
+        {
+            "num_iterations": 100,
+            "objective": "binary",
+            "metric": "auc"
+        }
+    };
+
+    let booster = Booster::train(train_dataset, &params).unwrap();
+    let result = booster.predict(test_features).unwrap();
+
+
+    let mut tp = 0;
+    for (label, pred) in zip(&test_labels, &result[0]){
+        if label == &(1 as f32) && pred > &(0.5 as f64) {
+            tp = tp + 1;
+        } else if label == &(0 as f32) && pred <= &(0.5 as f64) {
+            tp = tp + 1;
+        }
+        println!("{}, {}", label, pred)
+    }
+    println!("{} / {}", &tp, result[0].len());
+    Ok(())
+}
diff --git a/examples/multiclass_classification/Cargo.toml b/examples/multiclass_classification/Cargo.toml
new file mode 100644
index 0000000..e1bca0c
--- /dev/null
+++ b/examples/multiclass_classification/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "lightgbm-example-multiclass-classification"
+version = "0.1.0"
+authors = ["vaaaaanquish <6syun9@gmail.com>"]
+publish = false
+
+[dependencies]
+lightgbm = { path = "../../" }
+csv = "1.1.5"
+itertools = "0.9.0"
+serde_json = "1.0.59"
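Each example binary in this change repeats the same `load_file` helper, which parses every column of a row (label included) into a `Vec<f64>` and then slices the label back off with `[1..]`. The variant below is illustrative only and not part of this diff; it shows the same TSV-to-matrix loading with the label column skipped up front instead.

```rust
// Illustrative only (not part of this diff): an equivalent loader that skips
// the label column rather than parsing it into the feature row and slicing it off.
fn load_file(file_path: &str) -> (Vec<Vec<f64>>, Vec<f32>) {
    let mut labels: Vec<f32> = Vec::new();
    let mut features: Vec<Vec<f64>> = Vec::new();
    let mut rdr = csv::ReaderBuilder::new()
        .has_headers(false)
        .delimiter(b'\t')
        .from_path(file_path)
        .unwrap();
    for result in rdr.records() {
        let record = result.unwrap();
        // column 0 is the label; the remaining columns are features
        labels.push(record[0].parse::<f32>().unwrap());
        features.push(
            record
                .iter()
                .skip(1)
                .map(|x| x.parse::<f64>().unwrap())
                .collect(),
        );
    }
    (features, labels)
}
```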
"1.0.59" diff --git a/examples/multiclass_classification/src/main.rs b/examples/multiclass_classification/src/main.rs new file mode 100644 index 0000000..efbbd92 --- /dev/null +++ b/examples/multiclass_classification/src/main.rs @@ -0,0 +1,72 @@ +extern crate lightgbm; +extern crate csv; +extern crate serde_json; +extern crate itertools; + + +use itertools::zip; +use lightgbm::{Dataset, Booster}; +use serde_json::json; + + +fn load_file(file_path: &str) -> (Vec>, Vec) { + let rdr = csv::ReaderBuilder::new().has_headers(false).delimiter(b'\t').from_path(file_path); + let mut labels: Vec = Vec::new(); + let mut features: Vec> = Vec::new(); + for result in rdr.unwrap().records() { + let record = result.unwrap(); + let label = record[0].parse::().unwrap(); + let feature: Vec = record.iter().map(|x| x.parse::().unwrap()).collect::>()[1..].to_vec(); + labels.push(label); + features.push(feature); + } + (features, labels) +} + +fn argmax(xs: &[T]) -> usize { + if xs.len() == 1 { + 0 + } else { + let mut maxval = &xs[0]; + let mut max_ixs: Vec = vec![0]; + for (i, x) in xs.iter().enumerate().skip(1) { + if x > maxval { + maxval = x; + max_ixs = vec![i]; + } else if x == maxval { + max_ixs.push(i); + } + } + max_ixs[0] + } +} + +fn main() -> std::io::Result<()> { + let (train_features, train_labels) = load_file("../../lightgbm-sys/lightgbm/examples/multiclass_classification/multiclass.train"); + let (test_features, test_labels) = load_file("../../lightgbm-sys/lightgbm/examples/multiclass_classification/multiclass.test"); + let train_dataset = Dataset::from_mat(train_features, train_labels).unwrap(); + + let params = json!{ + { + "num_iterations": 100, + "objective": "multiclass", + "metric": "multi_logloss", + "num_class": 5, + } + }; + + let booster = Booster::train(train_dataset, ¶ms).unwrap(); + let result = booster.predict(test_features).unwrap(); + + + let mut tp = 0; + for (label, pred) in zip(&test_labels, &result){ + let argmax_pred = argmax(&pred); + if *label == argmax_pred as f32 { + tp = tp + 1; + } + println!("{}, {}, {:?}", label, argmax_pred, &pred); + } + println!("{} / {}", &tp, result.len()); + Ok(()) +} diff --git a/examples/Cargo.toml b/examples/regression/Cargo.toml similarity index 60% rename from examples/Cargo.toml rename to examples/regression/Cargo.toml index b303bc3..0289846 100644 --- a/examples/Cargo.toml +++ b/examples/regression/Cargo.toml @@ -1,10 +1,11 @@ [package] -name = "lightgbm-example" +name = "lightgbm-example-regression" version = "0.1.0" authors = ["vaaaaanquish <6syun9@gmail.com>"] publish = false [dependencies] -lightgbm = "0.1.1" +lightgbm = { path = "../../" } csv = "1.1.5" itertools = "0.9.0" +serde_json = "1.0.59" diff --git a/examples/regression/src/main.rs b/examples/regression/src/main.rs new file mode 100644 index 0000000..ddf5f41 --- /dev/null +++ b/examples/regression/src/main.rs @@ -0,0 +1,55 @@ +extern crate lightgbm; +extern crate csv; +extern crate serde_json; +extern crate itertools; + + +use itertools::zip; +use lightgbm::{Dataset, Booster}; +use serde_json::json; + + +fn load_file(file_path: &str) -> (Vec>, Vec) { + let rdr = csv::ReaderBuilder::new().has_headers(false).delimiter(b'\t').from_path(file_path); + let mut labels: Vec = Vec::new(); + let mut features: Vec> = Vec::new(); + for result in rdr.unwrap().records() { + let record = result.unwrap(); + let label = record[0].parse::().unwrap(); + let feature: Vec = record.iter().map(|x| x.parse::().unwrap()).collect::>()[1..].to_vec(); + labels.push(label); + features.push(feature); + } 
diff --git a/examples/Cargo.toml b/examples/regression/Cargo.toml
similarity index 60%
rename from examples/Cargo.toml
rename to examples/regression/Cargo.toml
index b303bc3..0289846 100644
--- a/examples/Cargo.toml
+++ b/examples/regression/Cargo.toml
@@ -1,10 +1,11 @@
 [package]
-name = "lightgbm-example"
+name = "lightgbm-example-regression"
 version = "0.1.0"
 authors = ["vaaaaanquish <6syun9@gmail.com>"]
 publish = false
 
 [dependencies]
-lightgbm = "0.1.1"
+lightgbm = { path = "../../" }
 csv = "1.1.5"
 itertools = "0.9.0"
+serde_json = "1.0.59"
diff --git a/examples/regression/src/main.rs b/examples/regression/src/main.rs
new file mode 100644
index 0000000..ddf5f41
--- /dev/null
+++ b/examples/regression/src/main.rs
@@ -0,0 +1,55 @@
+extern crate lightgbm;
+extern crate csv;
+extern crate serde_json;
+extern crate itertools;
+
+
+use itertools::zip;
+use lightgbm::{Dataset, Booster};
+use serde_json::json;
+
+
+fn load_file(file_path: &str) -> (Vec<Vec<f64>>, Vec<f32>) {
+    let rdr = csv::ReaderBuilder::new().has_headers(false).delimiter(b'\t').from_path(file_path);
+    let mut labels: Vec<f32> = Vec::new();
+    let mut features: Vec<Vec<f64>> = Vec::new();
+    for result in rdr.unwrap().records() {
+        let record = result.unwrap();
+        let label = record[0].parse::<f32>().unwrap();
+        let feature: Vec<f64> = record.iter().map(|x| x.parse::<f64>().unwrap()).collect::<Vec<f64>>()[1..].to_vec();
+        labels.push(label);
+        features.push(feature);
+    }
+    (features, labels)
+}
+
+
+fn main() -> std::io::Result<()> {
+    let (train_features, train_labels) = load_file("../../lightgbm-sys/lightgbm/examples/regression/regression.train");
+    let (test_features, test_labels) = load_file("../../lightgbm-sys/lightgbm/examples/regression/regression.test");
+    let train_dataset = Dataset::from_mat(train_features, train_labels).unwrap();
+
+    let params = json!{
+        {
+            "num_iterations": 100,
+            "objective": "regression",
+            "metric": "l2"
+        }
+    };
+
+    let booster = Booster::train(train_dataset, &params).unwrap();
+    let result = booster.predict(test_features).unwrap();
+
+
+    let mut tp = 0;
+    for (label, pred) in zip(&test_labels, &result[0]){
+        if label == &(1 as f32) && pred > &(0.5 as f64) {
+            tp = tp + 1;
+        } else if label == &(0 as f32) && pred <= &(0.5 as f64) {
+            tp = tp + 1;
+        }
+        println!("{}, {}", label, pred)
+    }
+    println!("{} / {}", &tp, result[0].len());
+    Ok(())
+}
diff --git a/examples/src/main.rs b/examples/src/main.rs
deleted file mode 100644
index fed42c3..0000000
--- a/examples/src/main.rs
+++ /dev/null
@@ -1,56 +0,0 @@
-extern crate lightgbm;
-extern crate csv;
-extern crate itertools;
-
-use itertools::zip;
-use lightgbm::{Dataset, Booster};
-
-fn main() -> std::io::Result<()> {
-    // let feature = vec![vec![1.0, 0.1, 0.2, 0.1],
-    //                    vec![0.7, 0.4, 0.5, 0.1],
-    //                    vec![0.9, 0.8, 0.5, 0.1],
-    //                    vec![0.2, 0.2, 0.8, 0.7],
-    //                    vec![0.1, 0.7, 1.0, 0.9]];
-    // let label = vec![0.0, 0.0, 0.0, 1.0, 1.0];
-    // let train_dataset = Dataset::from_mat(feature, label).unwrap();
-
-    // let train_dataset = Dataset::from_file("../lightgbm-sys/lightgbm/examples/binary_classification/binary.train".to_string()).unwrap();
-
-    let mut train_rdr = csv::ReaderBuilder::new().has_headers(false).delimiter(b'\t').from_path("../lightgbm-sys/lightgbm/examples/binary_classification/binary.train")?;
-    let mut train_labels: Vec<f32> = Vec::new();
-    let mut train_feature: Vec<Vec<f64>> = Vec::new();
-    for result in train_rdr.records() {
-        let record = result?;
-        let label = record[0].parse::<f32>().unwrap();
-        let feature: Vec<f64> = record.iter().map(|x| x.parse::<f64>().unwrap()).collect::<Vec<f64>>()[1..].to_vec();
-        train_labels.push(label);
-        train_feature.push(feature);
-    }
-    let train_dataset = Dataset::from_mat(train_feature, train_labels).unwrap();
-
-    let mut rdr = csv::ReaderBuilder::new().has_headers(false).delimiter(b'\t').from_path("../lightgbm-sys/lightgbm/examples/binary_classification/binary.test")?;
-    let mut test_labels: Vec<f32> = Vec::new();
-    let mut test_feature: Vec<Vec<f64>> = Vec::new();
-    for result in rdr.records() {
-        let record = result?;
-        let label = record[0].parse::<f32>().unwrap();
-        let feature: Vec<f64> = record.iter().map(|x| x.parse::<f64>().unwrap()).collect::<Vec<f64>>()[1..].to_vec();
-        test_labels.push(label);
-        test_feature.push(feature);
-    }
-
-    let booster = Booster::train(train_dataset).unwrap();
-    let result = booster.predict(test_feature).unwrap();
-
-    let mut tp = 0;
-    for (label, pred) in zip(&test_labels, &result){
-        if label == &(1 as f32) && pred > &(0.5 as f64) {
-            tp = tp + 1;
-        } else if label == &(0 as f32) && pred <= &(0.5 as f64) {
-            tp = tp + 1;
-        }
-        println!("{}, {}", label, pred)
-    }
-    println!("{} / {}", &tp, result.len());
-    Ok(())
-}
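The deleted example above targeted the old API, in which `Booster::train` took only a `Dataset` and the objective/metric were hard-coded inside the crate. The sketch below is illustrative only (`train_and_score` is a hypothetical helper, not part of this diff); it shows how a caller migrates to the new signature introduced in `src/booster.rs` below, where parameters travel as a `serde_json::Value` and `predict` returns one `Vec<f64>` per class, so binary and regression scores live in `result[0]`.

```rust
use lightgbm::{Booster, Dataset};
use serde_json::json;

// Before this change: `Booster::train(train_dataset)` with fixed parameters.
// After this change:  parameters are passed as JSON, and predict() returns
//                     Vec<Vec<f64>> (one inner Vec per class).
fn train_and_score(train_dataset: Dataset, test_features: Vec<Vec<f64>>) -> Vec<f64> {
    let params = json!({
        "num_iterations": 100,
        "objective": "binary",
        "metric": "auc"
    });
    let booster = Booster::train(train_dataset, &params).unwrap();
    let result = booster.predict(test_features).unwrap();
    // single-class output: the scores are the first (and only) inner vector
    result.into_iter().next().unwrap()
}
```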
diff --git a/src/booster.rs b/src/booster.rs
index b8994c7..4d464c9 100644
--- a/src/booster.rs
+++ b/src/booster.rs
@@ -1,51 +1,111 @@
-use lightgbm_sys;
-
 use libc::{c_char, c_double, c_void, c_long};
 use std::ffi::CString;
 use std;
 
-use super::{LGBMResult, Dataset};
+use serde_json::Value;
+
+use lightgbm_sys;
+
+
+use super::{LGBMResult, Dataset, LGBMError};
 
+/// Core model in LightGBM, containing functions for training, evaluating and predicting.
 pub struct Booster {
-    pub(super) handle: lightgbm_sys::BoosterHandle
+    pub(super) handle: lightgbm_sys::BoosterHandle,
+    num_class: i64
 }
 
+
 impl Booster {
-    fn new(handle: lightgbm_sys::BoosterHandle) -> LGBMResult<Self> {
-        Ok(Booster{handle})
+    fn new(handle: lightgbm_sys::BoosterHandle, num_class: i64) -> LGBMResult<Self> {
+        Ok(Booster{handle, num_class})
     }
 
-    pub fn train(dataset: Dataset) -> LGBMResult<Self> {
-        let params = CString::new("objective=binary metric=auc").unwrap();
+    /// Create a new Booster model with given Dataset and parameters.
+    ///
+    /// Example
+    /// ```
+    /// extern crate serde_json;
+    /// use lightgbm::{Dataset, Booster};
+    /// use serde_json::json;
+    ///
+    /// let data = vec![vec![1.0, 0.1, 0.2, 0.1],
+    ///                 vec![0.7, 0.4, 0.5, 0.1],
+    ///                 vec![0.9, 0.8, 0.5, 0.1],
+    ///                 vec![0.2, 0.2, 0.8, 0.7],
+    ///                 vec![0.1, 0.7, 1.0, 0.9]];
+    /// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0];
+    /// let dataset = Dataset::from_mat(data, label).unwrap();
+    /// let params = json!{
+    ///     {
+    ///         "num_iterations": 3,
+    ///         "objective": "binary",
+    ///         "metric": "auc"
+    ///     }
+    /// };
+    /// let bst = Booster::train(dataset, &params).unwrap();
+    /// ```
+    pub fn train(dataset: Dataset, parameter: &Value) -> LGBMResult<Self> {
+
+        // get num_iterations
+        let num_iterations: i64;
+        if parameter["num_iterations"].is_null(){
+            num_iterations = 100;
+        } else {
+            num_iterations = parameter["num_iterations"].as_i64().unwrap();
+        }
+
+        // get num_class
+        let num_class: i64;
+        if parameter["num_class"].is_null(){
+            num_class = 1;
+        } else {
+            num_class = parameter["num_class"].as_i64().unwrap();
+        }
+
+        // exchange params {"x": "y", "z": 1} => "x=y z=1"
+        let params_string = parameter.as_object().unwrap().iter().map(|(k, v)| format!("{}={}", k, v)).collect::<Vec<String>>().join(" ");
+        let params_cstring = CString::new(params_string).unwrap();
+
         let mut handle = std::ptr::null_mut();
-        unsafe {
+        lgbm_call!(
             lightgbm_sys::LGBM_BoosterCreate(
                 dataset.handle,
-                params.as_ptr() as *const c_char,
+                params_cstring.as_ptr() as *const c_char,
                 &mut handle
-            );
-        }
+            )
+        )?;
 
-        // train
         let mut is_finished: i32 = 0;
-        unsafe{
-            for _ in 1..100 {
-                lightgbm_sys::LGBM_BoosterUpdateOneIter(handle, &mut is_finished);
-            }
+        for _ in 1..num_iterations {
+            lgbm_call!(lightgbm_sys::LGBM_BoosterUpdateOneIter(handle, &mut is_finished))?;
         }
 
-        Ok(Booster::new(handle)?)
+        Ok(Booster::new(handle, num_class)?)
     }
 
-    pub fn predict(&self, data: Vec<Vec<f64>>) -> LGBMResult<Vec<f64>> {
+    /// Predict results for given data.
+    ///
+    /// Input data example
+    /// ```
+    /// let data = vec![vec![1.0, 0.1, 0.2],
+    ///                 vec![0.7, 0.4, 0.5],
+    ///                 vec![0.1, 0.7, 1.0]];
+    /// ```
+    ///
+    /// Output data example
+    /// ```
+    /// let output = vec![vec![1.0, 0.109, 0.433]];
+    /// ```
+    pub fn predict(&self, data: Vec<Vec<f64>>) -> LGBMResult<Vec<Vec<f64>>> {
         let data_length = data.len();
         let feature_length = data[0].len();
         let params = CString::new("").unwrap();
         let mut out_length: c_long = 0;
-        let out_result: Vec<f64> = vec![Default::default(); data.len()];
+        let out_result: Vec<f64> = vec![Default::default(); data.len() * self.num_class as usize];
         let flat_data = data.into_iter().flatten().collect::<Vec<f64>>();
 
-        unsafe {
+        lgbm_call!(
             lightgbm_sys::LGBM_BoosterPredictForMat(
                 self.handle,
                 flat_data.as_ptr() as *const c_void,
@@ -59,8 +119,58 @@ impl Booster {
                 params.as_ptr() as *const c_char,
                 &mut out_length,
                 out_result.as_ptr() as *mut c_double
-            );
+            )
+        )?;
+
+        // reshape for multiclass [1,2,3,4,5,6] -> [[1,2,3], [4,5,6]] # 3 class
+        let reshaped_output;
+        if self.num_class > 1{
+            reshaped_output = out_result.chunks(self.num_class as usize).map(|x| x.to_vec()).collect();
+        } else {
+            reshaped_output = vec![out_result];
+        }
+        Ok(reshaped_output)
+    }
+}
+
+
+impl Drop for Booster {
+    fn drop(&mut self) {
+        lgbm_call!(lightgbm_sys::LGBM_BoosterFree(self.handle)).unwrap();
+    }
+}
+
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+    fn read_train_file() -> LGBMResult<Dataset> {
+        Dataset::from_file("lightgbm-sys/lightgbm/examples/binary_classification/binary.train".to_string())
+    }
+
+    #[test]
+    fn predict() {
+        let dataset = read_train_file().unwrap();
+        let params = json!{
+            {
+                "num_iterations": 10,
+                "objective": "binary",
+                "metric": "auc",
+                "data_random_seed": 0
+            }
+        };
+        let bst = Booster::train(dataset, &params).unwrap();
+        let feature = vec![vec![0.5; 28], vec![0.0; 28], vec![0.9; 28]];
+        let result = bst.predict(feature).unwrap();
+        let mut normalized_result = Vec::new();
+        for r in &result[0]{
+            if *r > 0.5{
+                normalized_result.push(1);
+            } else {
+                normalized_result.push(0);
+            }
         }
-        Ok(out_result)
+        assert_eq!(normalized_result, vec![0, 0, 1]);
     }
 }
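The new `booster.rs` (and `dataset.rs` below) relies on an `lgbm_call!` macro whose definition is not shown in this diff; it presumably lives alongside `LGBMError`/`LGBMResult` in `src/lib.rs`. A plausible shape is sketched below purely as a reading aid: wrap the unsafe C call, treat LightGBM's non-zero return code as a failure, and surface it as an `Err`. The `LGBMError::from_last_error` constructor is a hypothetical name, not a confirmed API.

```rust
// Assumed shape of the `lgbm_call!` macro used above (definition not part of this hunk).
// LightGBM's C API functions return 0 on success and -1 on failure, in which case
// the last error message can be retrieved and wrapped into an LGBMError.
macro_rules! lgbm_call {
    ($call:expr) => {
        if unsafe { $call } == 0 {
            Ok(())
        } else {
            // hypothetical constructor; the real error type lives in src/lib.rs
            Err(LGBMError::from_last_error())
        }
    };
}
```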
diff --git a/src/dataset.rs b/src/dataset.rs
index 59449a5..63a4ddc 100644
--- a/src/dataset.rs
+++ b/src/dataset.rs
@@ -1,21 +1,64 @@
-use libc::{c_void,c_char};
-
 use std;
 use std::ffi::CString;
+use libc::{c_void,c_char};
 
 use lightgbm_sys;
-use super::LGBMResult;
+use super::{LGBMResult, LGBMError};
 
+
+/// Dataset used throughout LightGBM for training.
+///
+/// # Examples
+///
+/// ## from mat
+///
+/// ```
+/// use lightgbm::Dataset;
+///
+/// let data = vec![vec![1.0, 0.1, 0.2, 0.1],
+///                 vec![0.7, 0.4, 0.5, 0.1],
+///                 vec![0.9, 0.8, 0.5, 0.1],
+///                 vec![0.2, 0.2, 0.8, 0.7],
+///                 vec![0.1, 0.7, 1.0, 0.9]];
+/// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0];
+/// let dataset = Dataset::from_mat(data, label).unwrap();
+/// ```
+///
+/// ## from file
+///
+/// ```
+/// use lightgbm::Dataset;
+///
+/// let dataset = Dataset::from_file(
+///     "lightgbm-sys/lightgbm/examples/binary_classification/binary.train"
+///     .to_string()).unwrap();
+/// ```
 pub struct Dataset {
     pub(super) handle: lightgbm_sys::DatasetHandle
 }
 
+
 #[link(name = "c")]
 impl Dataset {
     fn new(handle: lightgbm_sys::DatasetHandle) -> LGBMResult<Self> {
         Ok(Dataset{handle})
     }
 
+    /// Create a new `Dataset` from dense array in row-major order.
+    ///
+    /// Example
+    /// ```
+    /// use lightgbm::Dataset;
+    ///
+    /// let data = vec![vec![1.0, 0.1, 0.2, 0.1],
+    ///                 vec![0.7, 0.4, 0.5, 0.1],
+    ///                 vec![0.9, 0.8, 0.5, 0.1],
+    ///                 vec![0.2, 0.2, 0.8, 0.7],
+    ///                 vec![0.1, 0.7, 1.0, 0.9]];
+    /// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0];
+    /// let dataset = Dataset::from_mat(data, label).unwrap();
+    /// ```
     pub fn from_mat(data: Vec<Vec<f64>>, label: Vec<f32>) -> LGBMResult<Self> {
         let data_length = data.len();
         let feature_length = data[0].len();
@@ -25,7 +68,7 @@ impl Dataset {
         let mut handle = std::ptr::null_mut();
         let flat_data = data.into_iter().flatten().collect::<Vec<f64>>();
 
-        unsafe{
+        lgbm_call!(
             lightgbm_sys::LGBM_DatasetCreateFromMat(
                 flat_data.as_ptr() as *const c_void,
                 lightgbm_sys::C_API_DTYPE_FLOAT64 as i32,
@@ -35,33 +78,88 @@ impl Dataset {
                 params.as_ptr() as *const c_char,
                 reference,
                 &mut handle
-            );
+            )
+        )?;
+
+        lgbm_call!(
             lightgbm_sys::LGBM_DatasetSetField(
                 handle,
                 label_str.as_ptr() as *const c_char,
                 label.as_ptr() as *const c_void,
                 data_length as i32,
                 lightgbm_sys::C_API_DTYPE_FLOAT32 as i32
-            );
-        }
+            )
+        )?;
+
         Ok(Dataset::new(handle)?)
     }
 
+    /// Create a new `Dataset` from file.
+    ///
+    /// file is `tsv`.
+    /// ```text
+    ///