diff --git a/Cargo.lock b/Cargo.lock index b0feaed..08d552d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1094,6 +1094,18 @@ dependencies = [ "thiserror", ] +[[package]] +name = "linfa-trees" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "464029f9a6ffd8016539a9c2a73e569b1118d50eb44638549f8cd5807ea77f8c" +dependencies = [ + "linfa", + "ndarray", + "ndarray-rand", + "serde", +] + [[package]] name = "linked-hash-map" version = "0.5.6" @@ -1292,6 +1304,7 @@ dependencies = [ "num-integer", "num-traits", "rawpointer", + "rayon", "serde", ] @@ -1936,6 +1949,7 @@ dependencies = [ "hyper", "linfa", "linfa-logistic", + "linfa-trees", "meval", "mongodb", "ndarray", diff --git a/Cargo.toml b/Cargo.toml index eb18447..5a27fa2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ sysinfo = "0.18.0" # ML Packages for Model Training linfa = "0.6.1" linfa-logistic = "0.6.1" +linfa-trees = { version = "0.6.1", features = ["serde"] } csv = "1.2.0" ndarray = "0.15.6" ciborium = "0.2.0" diff --git a/src/linfa_train/decision_tree/mod.rs b/src/linfa_train/decision_tree/mod.rs new file mode 100644 index 0000000..d4ac67f --- /dev/null +++ b/src/linfa_train/decision_tree/mod.rs @@ -0,0 +1,183 @@ +//! Decision tree module. + +use ciborium::{cbor, value}; +use colored::Colorize; +use csv::Reader; +use linfa::prelude::*; +use linfa::Dataset; +use linfa_trees::DecisionTree; +use ndarray::{Array, Array1, Array2}; +use std::io::Read; +use std::path::Path; +use std::{env::args, fs, fs::File}; + +/// The entry point of the program. +pub fn main() { + LinfaTrainDecisionTree::new(); +} + +/// Input arguments of the program. +struct InuputArguments { + max_depth: usize, +} + +struct LinfaTrainDecisionTree; + +/// Source: https://github.com/DataPsycho/data-pipelines-in-rust/blob/main/diabetes_ml_pipeline/Cargo.toml +impl LinfaTrainDecisionTree { + /// Program constructor. 
+ fn new() -> LinfaTrainDecisionTree { + let mut program = LinfaTrainDecisionTree; + program.init(); + program + } + + /// Initializes the program. + fn init(&mut self) { + println!("\n{}", "Linfa train initialized.".blue().bold()); + + let args = self.args(); + + self.train(args.max_depth); + self.load_model(); + } + + /// Parses arguments passed to the program. + fn args(&mut self) -> InuputArguments { + let arguments: Vec = args().collect(); + + println!("\n{}:\n{:?}", "Arguments".cyan().bold(), arguments); + + let max_depth = arguments + .get(2) + .cloned() + .unwrap_or_default() + .trim() + .parse::() + .unwrap_or(500); + + InuputArguments { max_depth } + } + + /// The dataset headers + fn headers(&mut self, reader: &mut Reader) -> Vec { + let result = reader + .headers() + .unwrap() + .iter() + .map(|r| r.to_owned()) + .collect(); + println!("\n{} {:?}", "Header collected, result:".yellow(), result); + result + } + + /// The dataset data + fn data(&mut self, reader: &mut Reader) -> Vec> { + let result = reader + .records() + .map(|r| { + r.unwrap() + .iter() + .map(|field| field.parse::().unwrap()) + .collect::>() + }) + .collect::>>(); + println!( + "\n{} {:?}", + "Data collected, length:".yellow(), + result.len() + ); + result + } + + /// The dataset records + fn records(&mut self, data: &Vec>, target_index: usize) -> Array2 { + let mut records: Vec = vec![]; + for record in data.iter() { + records.extend_from_slice(&record[0..target_index]); + } + + let result = Array::from(records) + .into_shape((data.len(), target_index)) + .unwrap(); + let record_shape = result.shape(); + println!( + "\n{} {:?} x {:?}", + "Records collected, shape:".yellow(), + record_shape[0], + record_shape[1] + ); + result + } + + /// The dataset targets + fn targets(&mut self, data: &[Vec], target_index: usize) -> Array1 { + let targets = data + .iter() + .map(|r| r[target_index] as usize) + .collect::>(); + println!( + "\n{} {:?}", + "Target collected, length:".yellow(), + 
targets.len() + ); + Array::from(targets) + } + + /// The dataset + /// Data source: https://github.com/plotly/datasets/blob/master/diabetes.csv + fn dataset(&mut self) -> Dataset> { + let file_path = ".data/input/diabetes.csv"; + let mut reader = Reader::from_path(file_path).unwrap(); + let headers = self.headers(&mut reader); + let data = self.data(&mut reader); + let target_index = headers.len() - 1; + let features = headers[0..target_index].to_vec(); + let records = self.records(&data, target_index); + let targets = self.targets(&data, target_index); + Dataset::new(records, targets).with_feature_names(features) + } + + /// Trains the model + fn train(&mut self, max_depth: usize) { + println!("\n{}", "Training the model...".yellow().bold()); + let dataset = self.dataset(); + let model = DecisionTree::params() + .max_depth(Some(max_depth)) + .fit(&dataset) + .expect("Can not train the model"); + let value = cbor!(model).unwrap(); + let mut vec_model = Vec::new(); + ciborium::ser::into_writer(&value, &mut vec_model).unwrap(); + // debug: start + let prediction = model.predict(&dataset.records); + println!("{:?}", prediction); + // debug: end + let output = Path::new(".data") + .join("output") + .join("diabetes_model.decision_tree.cbor"); + fs::write(output.clone(), vec_model).unwrap(); + println!("\n{} {:?}", "Model saved, path:".yellow(), output.as_path()); + } + + /// Loads the model + fn load_model(&mut self) { + println!("\n{}", "Testing the model...".yellow().bold()); + let dataset = self.dataset(); + let mut data: Vec = Vec::new(); + let path = Path::new(".data") + .join("output") + .join("diabetes_model.decision_tree.cbor"); + let mut file = File::open(path).unwrap(); + file.read_to_end(&mut data).unwrap(); + let value = ciborium::de::from_reader::(&data[..]).unwrap(); + let model: DecisionTree = value.deserialized().unwrap(); + println!("\n{} {:?}", "Model loaded:".yellow(), model); + let prediction = model.predict(dataset.records); + println!( + "\n{} 
{:?}", + "Prediction test with the model success:".green().bold(), + prediction + ); + } +} diff --git a/src/linfa_train/logistic_regression/mod.rs b/src/linfa_train/logistic_regression/mod.rs new file mode 100644 index 0000000..67c8e7b --- /dev/null +++ b/src/linfa_train/logistic_regression/mod.rs @@ -0,0 +1,185 @@ +//! Logistic regression module. + +use ciborium::{cbor, value}; +use colored::Colorize; +use csv::Reader; +use linfa::prelude::*; +use linfa::Dataset; +use linfa_logistic::FittedLogisticRegression; +use linfa_logistic::LogisticRegression; +use ndarray::{Array, Array1, Array2}; +use std::io::Read; +use std::path::Path; +use std::{env::args, fs, fs::File}; + +/// The entry point of the program. +pub fn main() { + LinfaTrainLogisticRegression::new(); +} + +/// Input arguments of the program. +struct InuputArguments { + max_iterations: u64, +} + +struct LinfaTrainLogisticRegression; + +/// Source: https://github.com/DataPsycho/data-pipelines-in-rust/blob/main/diabetes_ml_pipeline/Cargo.toml +impl LinfaTrainLogisticRegression { + /// Program constructor. + fn new() -> LinfaTrainLogisticRegression { + let mut program = LinfaTrainLogisticRegression; + program.init(); + program + } + + /// Initializes the program. + fn init(&mut self) { + println!("\n{}", "Linfa train initialized.".blue().bold()); + + let args = self.args(); + + self.train(args.max_iterations); + self.load_model(); + } + + /// Parses arguments passed to the program. 
+ fn args(&mut self) -> InuputArguments { + let arguments: Vec = args().collect(); + + println!("\n{}:\n{:?}", "Arguments".cyan().bold(), arguments); + + let max_iterations = arguments + .get(2) + .cloned() + .unwrap_or_default() + .trim() + .parse::() + .unwrap_or(500); + + InuputArguments { max_iterations } + } + + /// The dataset headers + fn headers(&mut self, reader: &mut Reader) -> Vec { + let result = reader + .headers() + .unwrap() + .iter() + .map(|r| r.to_owned()) + .collect(); + println!("\n{} {:?}", "Header collected, result:".yellow(), result); + result + } + + /// The dataset data + fn data(&mut self, reader: &mut Reader) -> Vec> { + let result = reader + .records() + .map(|r| { + r.unwrap() + .iter() + .map(|field| field.parse::().unwrap()) + .collect::>() + }) + .collect::>>(); + println!( + "\n{} {:?}", + "Data collected, length:".yellow(), + result.len() + ); + result + } + + /// The dataset records + fn records(&mut self, data: &Vec>, target_index: usize) -> Array2 { + let mut records: Vec = vec![]; + for record in data.iter() { + records.extend_from_slice(&record[0..target_index]); + } + + let result = Array::from(records) + .into_shape((data.len(), target_index)) + .unwrap(); + let record_shape = result.shape(); + println!( + "\n{} {:?} x {:?}", + "Records collected, shape:".yellow(), + record_shape[0], + record_shape[1] + ); + result + } + + /// The dataset targets + fn targets(&mut self, data: &[Vec], target_index: usize) -> Array1 { + let targets = data + .iter() + .map(|r| r[target_index] as i32) + .collect::>(); + println!( + "\n{} {:?}", + "Target collected, length:".yellow(), + targets.len() + ); + Array::from(targets) + } + + /// The dataset + /// Data source: https://github.com/plotly/datasets/blob/master/diabetes.csv + fn dataset(&mut self) -> Dataset> { + let file_path = ".data/input/diabetes.csv"; + let mut reader = Reader::from_path(file_path).unwrap(); + let headers = self.headers(&mut reader); + let data = self.data(&mut 
reader); + let target_index = headers.len() - 1; + let features = headers[0..target_index].to_vec(); + let records = self.records(&data, target_index); + let targets = self.targets(&data, target_index); + Dataset::new(records, targets).with_feature_names(features) + } + + /// Trains the model + fn train(&mut self, max_iterations: u64) { + println!("\n{}", "Training the model...".yellow().bold()); + let dataset = self.dataset(); + let model = LogisticRegression::default() + .max_iterations(max_iterations) + .gradient_tolerance(0.0001) + .fit(&dataset) + .expect("Can not train the model"); + let value = cbor!(model).unwrap(); + let mut vec_model = Vec::new(); + ciborium::ser::into_writer(&value, &mut vec_model).unwrap(); + // debug: start + let prediction = model.predict(&dataset.records); + println!("{:?}", prediction); + // debug: end + let output = Path::new(".data") + .join("output") + .join("diabetes_model.logistic_regression.cbor"); + fs::write(output.clone(), vec_model).unwrap(); + println!("\n{} {:?}", "Model saved, path:".yellow(), output.as_path()); + } + + /// Loads the model + fn load_model(&mut self) { + println!("\n{}", "Testing the model...".yellow().bold()); + let dataset = self.dataset(); + let mut data: Vec = Vec::new(); + let path = Path::new(".data") + .join("output") + .join("diabetes_model.logistic_regression.cbor"); + let mut file = File::open(path).unwrap(); + file.read_to_end(&mut data).unwrap(); + let value = ciborium::de::from_reader::(&data[..]).unwrap(); + let model: FittedLogisticRegression = value.deserialized().unwrap(); + println!("\n{} {:?}", "Model loaded:".yellow(), model); + let prediction = model.predict(dataset.records); + println!( + "\n{} {:?}", + "Prediction test with the model success:".green().bold(), + prediction + ); + } +} diff --git a/src/linfa_train/mod.rs b/src/linfa_train/mod.rs index 2817cb4..8d73be5 100644 --- a/src/linfa_train/mod.rs +++ b/src/linfa_train/mod.rs @@ -1,16 +1,14 @@ //! Linfa train module. 
-use ciborium::{cbor, value}; use colored::Colorize; -use csv::Reader; -use linfa::prelude::*; -use linfa::Dataset; -use linfa_logistic::FittedLogisticRegression; -use linfa_logistic::LogisticRegression; -use ndarray::{Array, Array1, Array2}; -use std::io::Read; -use std::path::Path; -use std::{env::args, fs, fs::File}; +use std::cmp::Ordering; +use std::env::args; +use std::io; + +type LinfaTrainPrograms<'a> = [&'a str; 2]; + +mod decision_tree; +mod logistic_regression; /// The entry point of the program. pub fn main() { @@ -19,7 +17,7 @@ pub fn main() { /// Input arguments of the program. struct InuputArguments { - max_iterations: u64, + program_index: Option, } struct LinfaTrain; @@ -39,8 +37,17 @@ impl LinfaTrain { let args = self.args(); - self.train(args.max_iterations); - self.load_model(); + let programs: LinfaTrainPrograms = ["Logistic regression", "Decision tree"]; + + let program_index = self.choose_program(programs, args.program_index); + + match program_index { + 0 => logistic_regression::main(), + 1 => decision_tree::main(), + _ => { + panic!("Program does not exist"); + } + } } /// Parses arguments passed to the program. @@ -49,138 +56,79 @@ impl LinfaTrain { println!("\n{}:\n{:?}", "Arguments".cyan().bold(), arguments); - let max_iterations = arguments - .get(2) - .cloned() - .unwrap_or_default() - .trim() - .parse::() - .unwrap_or(500); + let program_index = arguments.get(3).cloned(); - InuputArguments { max_iterations } + InuputArguments { program_index } } - /// The dataset headers - fn headers(&mut self, reader: &mut Reader) -> Vec { - let result = reader - .headers() - .unwrap() - .iter() - .map(|r| r.to_owned()) - .collect(); - println!("\n{} {:?}", "Header collected, result:".yellow(), result); - result + /// Prompts input from the user, processes it, and returns the index of the selected program. 
+ fn choose_program(&self, programs: LinfaTrainPrograms, program_arg: Option) -> usize { + let is_some = program_arg.is_some(); + let mut program_arg_input = if is_some { + match program_arg.unwrap().trim().parse::() { + Ok(value) => value.to_string(), + Err(_) => String::new(), + } + } else { + String::new() + }; + + loop { + let mut program_input = String::new(); + + if program_arg_input.is_empty() { + self.print_instructions(programs); + + io::stdin() + .read_line(&mut program_input) + .expect("Failed to read line"); + } else { + program_input = program_arg_input.to_string(); + } + + let program_index = match program_input.trim().parse::() { + Ok(num) => num, + Err(_) => continue, + }; + + match program_index.cmp(&programs.len()) { + Ordering::Less => { + return self.select_program(programs, program_index); + } + Ordering::Greater => program_arg_input = self.reset_input_arg(), + Ordering::Equal => program_arg_input = self.reset_input_arg(), + } + } } - /// The dataset data - fn data(&mut self, reader: &mut Reader) -> Vec> { - let result = reader - .records() - .map(|r| { - r.unwrap() - .iter() - .map(|field| field.parse::().unwrap()) - .collect::>() - }) - .collect::>>(); - println!( - "\n{} {:?}", - "Data collected, length:".yellow(), - result.len() - ); - result - } + /// Prints the program selection instructions. 
+ fn print_instructions(&self, programs: LinfaTrainPrograms) { + println!("\n{}", "Available programs:".yellow().bold()); - /// The dataset records - fn records(&mut self, data: &Vec>, target_index: usize) -> Array2 { - let mut records: Vec = vec![]; - for record in data.iter() { - records.extend_from_slice(&record[0..target_index]); + let max_i = programs.len() - 1; + let mut i = 0; + while i <= max_i { + println!("{}: {}", i, programs[i]); + i += 1; } - let result = Array::from(records) - .into_shape((data.len(), target_index)) - .unwrap(); - let record_shape = result.shape(); println!( - "\n{} {:?} x {:?}", - "Records collected, shape:".yellow(), - record_shape[0], - record_shape[1] + "\n{}, [0-{}]:", + "Please select a program".yellow().bold(), + max_i ); - result } - /// The dataset targets - fn targets(&mut self, data: &[Vec], target_index: usize) -> Array1 { - let targets = data - .iter() - .map(|r| r[target_index] as i32) - .collect::>(); - println!( - "\n{} {:?}", - "Target collected, length:".yellow(), - targets.len() - ); - Array::from(targets) - } - - /// The dataset - /// - /// Data source: https:///github.com/plotly/datasets/blob/master/diabetes.csv - fn dataset(&mut self) -> Dataset> { - let file_path = ".data/input/diabetes.csv"; - let mut reader = Reader::from_path(file_path).unwrap(); - let headers = self.headers(&mut reader); - let data = self.data(&mut reader); - let target_index = headers.len() - 1; - let features = headers[0..target_index].to_vec(); - let records = self.records(&data, target_index); - let targets = self.targets(&data, target_index); - Dataset::new(records, targets).with_feature_names(features) + /// Resets the input argument to start over if the program does not exist. 
+ fn reset_input_arg(&self) -> String { + println!("\n{}", "The subprogram does not exist.".red()); + String::new() } - /// Trains the model - fn train(&mut self, max_iterations: u64) { - println!("\n{}", "Training the model...".yellow().bold()); - let dataset = self.dataset(); - let model = LogisticRegression::default() - .max_iterations(max_iterations) - .gradient_tolerance(0.0001) - .fit(&dataset) - .expect("Can not train the model"); - let value = cbor!(model).unwrap(); - let mut vec_model = Vec::new(); - ciborium::ser::into_writer(&value, &mut vec_model).unwrap(); - // debug: start - let prediction = model.predict(&dataset.records); - println!("{:?}", prediction); - // debug: end - let output = Path::new(".data") - .join("output") - .join("diabetes_model.cbor"); - fs::write(output.clone(), vec_model).unwrap(); - println!("\n{} {:?}", "Model saved, path:".yellow(), output.as_path()); - } - - /// Loads the model - fn load_model(&mut self) { - println!("\n{}", "Testing the model...".yellow().bold()); - let dataset = self.dataset(); - let mut data: Vec = Vec::new(); - let path = Path::new(".data") - .join("output") - .join("diabetes_model.cbor"); - let mut file = File::open(path).unwrap(); - file.read_to_end(&mut data).unwrap(); - let value = ciborium::de::from_reader::(&data[..]).unwrap(); - let model: FittedLogisticRegression = value.deserialized().unwrap(); - println!("\n{} {:?}", "Model loaded:".yellow(), model); - let prediction = model.predict(dataset.records); - println!( - "\n{} {:?}", - "Prediction test with the model success:".green().bold(), - prediction - ); + /// Prints selected program and returns the program index. + fn select_program(&self, programs: LinfaTrainPrograms, program_index: usize) -> usize { + let program = programs[program_index]; + println!("You selected: {}", program); + program_index } }