From a5a479fdc2cbc769215136684926ac6639f4a2a1 Mon Sep 17 00:00:00 2001
From: Ivano Donadi
Date: Wed, 28 Apr 2021 16:50:05 +0200
Subject: [PATCH] Fit trait modification and cross validation proposal (#122)

* change fit signature cross validation POC
* fmt
* fix merge issues
* concat to from_shape_vec
* with labels tests
* Move linfa-pls to new Lapack bound (#3)
* Move linfa-pls to new Lapack bound
* More cleanups
* Playing around with `cross_validation`
* Make generic over dimension
* Run rustfmt
* Add simple test for multi target cv
* Run rustfmt
* Rename cross validation multi target to `cross_validate_multi`
* Run rustfmt
* docs
* update table of contents
* fix pls segmentation fault
* update contribution guide
* snippet

Co-authored-by: Lorenz Schmidt
---
 CONTRIBUTE.md | 29 +-
 Cargo.toml | 3 +-
 README.md | 1 +
 algorithms/linfa-bayes/src/gaussian_nb.rs | 10 +-
 algorithms/linfa-clustering/Cargo.toml | 2 +-
 .../src/appx_dbscan/hyperparameters.rs | 10 +-
 .../src/gaussian_mixture/algorithm.rs | 14 +-
 .../src/gaussian_mixture/errors.rs | 61 ++--
 .../linfa-clustering/src/k_means/algorithm.rs | 8 +-
 .../linfa-clustering/src/k_means/errors.rs | 22 +-
 .../examples/elasticnet_cv.rs | 26 ++
 algorithms/linfa-elasticnet/src/algorithm.rs | 6 +-
 algorithms/linfa-ica/Cargo.toml | 3 +-
 algorithms/linfa-ica/src/error.rs | 36 +-
 algorithms/linfa-ica/src/fast_ica.rs | 46 ++-
 algorithms/linfa-linear/src/error.rs | 6 +
 algorithms/linfa-linear/src/glm.rs | 8 +-
 .../linfa-linear/src/glm/distribution.rs | 91 +++---
 algorithms/linfa-linear/src/glm/link.rs | 24 +-
 algorithms/linfa-linear/src/ols.rs | 29 +-
 algorithms/linfa-logistic/Cargo.toml | 1 +
 .../linfa-logistic/examples/logistic_cv.rs | 34 ++
 algorithms/linfa-logistic/src/error.rs | 22 ++
 algorithms/linfa-logistic/src/lib.rs | 180 +++++-----
 algorithms/linfa-pls/Cargo.toml | 4 +-
 algorithms/linfa-pls/src/errors.rs | 39 +--
 algorithms/linfa-pls/src/lib.rs | 6 +-
 algorithms/linfa-pls/src/pls_generic.rs | 47 ++-
 algorithms/linfa-pls/src/pls_svd.rs | 17 +-
 algorithms/linfa-pls/src/utils.rs | 34 +-
 .../examples/count_vectorization.rs | 15 +-
 .../examples/tfidf_vectorization.rs | 15 +-
 .../src/count_vectorization.rs | 2 +
 algorithms/linfa-preprocessing/src/error.rs | 2 +
 .../linfa-preprocessing/src/linear_scaling.rs | 7 +-
 .../linfa-preprocessing/src/whitening.rs | 7 +-
 algorithms/linfa-reduction/Cargo.toml | 1 +
 algorithms/linfa-reduction/examples/pca.rs | 2 +-
 algorithms/linfa-reduction/src/error.rs | 12 +
 algorithms/linfa-reduction/src/lib.rs | 1 +
 algorithms/linfa-reduction/src/pca.rs | 32 +-
 algorithms/linfa-svm/src/classification.rs | 30 +-
 algorithms/linfa-svm/src/regression.rs | 16 +-
 .../src/decision_trees/algorithm.rs | 16 +-
 .../linfa-trees/src/decision_trees/tikz.rs | 3 +-
 algorithms/linfa-tsne/examples/tsne.rs | 2 +-
 build.rs | 9 +
 .../content/snippets/cross-validation.md | 37 +--
 docs/website/content/snippets/k-folding.md | 23 ++
 src/dataset/impl_dataset.rs | 309 +++++++++++++++---
 src/dataset/impl_targets.rs | 30 +-
 src/dataset/mod.rs | 295 ++++++++++++++++-
 src/error.rs | 2 +
 src/lib.rs | 3 +
 src/metrics_classification.rs | 199 ++++++-----
 src/traits.rs | 7 +-
 56 files changed, 1234 insertions(+), 662 deletions(-)
 create mode 100644 algorithms/linfa-elasticnet/examples/elasticnet_cv.rs
 create mode 100644 algorithms/linfa-logistic/examples/logistic_cv.rs
 create mode 100644 algorithms/linfa-logistic/src/error.rs
 create mode 100644 algorithms/linfa-reduction/src/error.rs
 create mode 100644 build.rs
 create mode 100644 docs/website/content/snippets/k-folding.md
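Before diving into the per-file changes, a quick orientation: the two user-visible pieces of this patch are the reworked `Fit` trait, which now carries an error type and returns a `Result`, and the new `cross_validate` method on datasets. The sketch below condenses the calling convention from the `elasticnet_cv` example added further down; the penalty and ratio values here are arbitrary, while the diabetes loader, `cross_validate`, and `r2` all appear in the patch itself:

```rust
use linfa::prelude::*;
use linfa_elasticnet::{ElasticNet, Result};

fn main() -> Result<()> {
    // the dataset is mutable to allow fast k-folding, as in the example below
    let mut dataset = linfa_datasets::diabetes();

    // one candidate model per hyperparameter value under comparison
    let models: Vec<_> = vec![0.1, 0.5, 1.0]
        .into_iter()
        .map(|ratio| ElasticNet::params().penalty(0.3).l1_ratio(ratio))
        .collect();

    // 5-fold cross validation: the closure scores each (prediction, truth)
    // pair, and the scores are averaged across the folds, one mean per model
    let r2_values = dataset.cross_validate(5, &models, |pred, truth| pred.r2(&truth))?;
    println!("{:?}", r2_values);

    Ok(())
}
```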
diff --git a/CONTRIBUTE.md b/CONTRIBUTE.md
index 4cda44a1b..06d5621fa 100644
--- a/CONTRIBUTE.md
+++ b/CONTRIBUTE.md
@@ -6,38 +6,19 @@ This document should be used as a reference when contributing to Linfa. It descr
 
 An important part of the Linfa ecosystem is how to organize data for the training and estimation process. A [Dataset](src/dataset/mod.rs) serves this purpose. It is a small wrapper of data and targets types and should be used as argument for the [Fit](src/traits.rs) trait. Its parametrization is generic, with [Records](src/dataset/mod.rs) representing input data (atm only implemented for `ndarray::ArrayBase`) and [Targets](src/dataset/mod.rs) for targets.
 
-You can find traits for different classes of algorithms [here](src/traits.rs). For example, to implement a fittable algorithm, which takes an `Array2` as input data and boolean array as targets:
+You can find traits for different classes of algorithms [here](src/traits.rs). For example, to implement a fittable algorithm, which takes an `Array2` as input data and boolean array as targets and could fail with an `Error` struct:
 ```rust
-impl<'a, F: Float> Fit<'a, Array2<F>, Array1<bool>> for SvmParams<F, Pr> {
+impl<F: Float> Fit<Array2<F>, Array1<bool>, Error> for SvmParams<F, Pr> {
     type Object = Svm<F, Pr>;
 
-    fn fit(&self, dataset: &Dataset<Array2<F>, Array1<bool>>) -> Self::Object {
+    fn fit(&self, dataset: &Dataset<Array2<F>, Array1<bool>>) -> Result<Self::Object> {
         ...
     }
 }
 ```
-the type of the dataset is `&Dataset<Array2<F>, Array1<bool>>`, and lifetime `'a` is the required lifetime for the fitted state. It produces a fitted state, called `Svm<F, Pr>` with probability type `Pr`.
+where the type of the input dataset is `&Dataset<Array2<F>, Array1<bool>>`. It produces a result with a fitted state, called `Svm<F, Pr>` with probability type `Pr`, or an error of type `Error` in case of failure.
 
-The [Predict](src/traits.rs) should be implemented with dataset arguments, as well as arrays. If a dataset is provided, then predict takes its ownership and returns a new dataset with predicted targets. For an array, predict takes a reference and returns predicted targets. In the same context, SVM implemented predict like this:
-```rust
-impl<F: Float, T: Targets> Predict<Dataset<Array2<F>, T>, Dataset<Array2<F>, Vec<Pr>>>
-    for Svm<F, Pr>
-{
-    fn predict(&self, data: Dataset<Array2<F>, T>) -> Dataset<Array2<F>, Vec<Pr>> {
-        ...
-    }
-}
-```
-and
-```rust
-impl<F: Float, D: Data<Elem = F>> Predict<ArrayBase<D, Ix2>, Vec<Pr>> for Svm<F, Pr> {
-    fn predict(&self, data: ArrayBase<D, Ix2>) -> Vec<Pr> {
-        ...
-    }
-}
-```
-
-For an example of a `Transformer` please look into the [linfa-kernel](linfa-kernel/src/lib.rs) implementation.
+The [Predict](src/traits.rs) trait has its own section later in this document, while for an example of a `Transformer` please look into the [linfa-kernel](linfa-kernel/src/lib.rs) implementation.
 
 ## Parameters and builder
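To make the contribution-guide snippet above concrete, here is a self-contained sketch of a toy estimator written against the new fallible `Fit`. Only the trait shape and the `linfa::Error::Parameters` variant are taken from the patch; `MeanParams` and `MeanModel` are invented placeholder names:

```rust
use linfa::dataset::DatasetBase;
use linfa::traits::Fit;
use linfa::Float;
use ndarray::{Array1, ArrayBase, Axis, Data, Ix2};

/// Hypothetical estimator that learns the per-feature mean of the records
struct MeanParams;
struct MeanModel<F> {
    means: Array1<F>,
}

impl<F: Float, D: Data<Elem = F>, T> Fit<ArrayBase<D, Ix2>, T, linfa::Error> for MeanParams {
    type Object = MeanModel<F>;

    // Fitting now returns a `Result`, so bad input surfaces as an error
    // value instead of a panic inside the algorithm
    fn fit(
        &self,
        dataset: &DatasetBase<ArrayBase<D, Ix2>, T>,
    ) -> Result<Self::Object, linfa::Error> {
        let means = dataset
            .records()
            .mean_axis(Axis(0))
            .ok_or_else(|| linfa::Error::Parameters("empty dataset".to_string()))?;

        Ok(MeanModel { means })
    }
}
```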
diff --git a/Cargo.toml b/Cargo.toml
index 3331dae85..de8686d3a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -54,14 +54,13 @@ features = ["cblas"]
 default-features = false
 
 [dependencies.openblas-src]
-version = "0.9.0"
+version = "0.10.4"
 optional = true
 default-features = false
 features = ["cblas"]
 
 [dev-dependencies]
 ndarray-rand = "0.13"
-
 linfa-datasets = { path = "datasets", features = ["winequality", "iris", "diabetes"] }
 
 [workspace]
diff --git a/README.md b/README.md
index add7af611..4e2a180a3 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,7 @@ Where does `linfa` stand right now? [Are we learning yet?](http://www.arewelearningyet.com)
 | [ica](algorithms/linfa-ica/) | Independent component analysis | Tested | Unsupervised learning | Contains FastICA implementation |
 | [pls](algorithms/linfa-pls/) | Partial Least Squares | Tested | Supervised learning | Contains PLS estimators for dimensionality reduction and regression |
 | [tsne](algorithms/linfa-tsne/) | Dimensionality reduction| Tested | Unsupervised learning | Contains exact solution and Barnes-Hut approximation t-SNE |
+| [preprocessing](algorithms/linfa-preprocessing/) |Normalization & Vectorization| Tested | Pre-processing | Contains data normalization/whitening and count vectorization/tf-idf |
 
 We believe that only a significant community effort can nurture, build, and sustain a machine learning ecosystem in Rust - there is no other way forward.
diff --git a/algorithms/linfa-bayes/src/gaussian_nb.rs b/algorithms/linfa-bayes/src/gaussian_nb.rs
index b7c4b39e4..304df69e6 100644
--- a/algorithms/linfa-bayes/src/gaussian_nb.rs
+++ b/algorithms/linfa-bayes/src/gaussian_nb.rs
@@ -2,7 +2,7 @@ use ndarray::{s, Array1, Array2, ArrayBase, ArrayView1, ArrayView2, Axis, Data,
 use ndarray_stats::QuantileExt;
 use std::collections::HashMap;
 
-use crate::error::Result;
+use crate::error::{BayesError, Result};
 use linfa::dataset::{AsTargets, DatasetBase, Labels};
 use linfa::traits::{Fit, IncrementalFit, PredictRef};
 use linfa::Float;
@@ -40,13 +40,13 @@ impl GaussianNbParams {
     }
 }
 
-impl<F, D, L> Fit<'_, ArrayBase<D, Ix2>, L> for GaussianNbParams
+impl<F, D, L> Fit<ArrayBase<D, Ix2>, L, BayesError> for GaussianNbParams
 where
     F: Float,
     D: Data<Elem = F>,
     L: AsTargets<Elem = usize> + Labels<Elem = usize>,
 {
-    type Object = Result<GaussianNb<F>>;
+    type Object = GaussianNb<F>;
 
     /// Fit the model
     ///
     /// # Example
     ///
     /// # Ok(())
     /// # }
     /// ```
-    fn fit(&self, dataset: &DatasetBase<ArrayBase<D, Ix2>, L>) -> Self::Object {
+    fn fit(&self, dataset: &DatasetBase<ArrayBase<D, Ix2>, L>) -> Result<Self::Object> {
         // We extract the unique classes in sorted order
         let mut unique_classes = dataset.targets.labels();
         unique_classes.sort_unstable();
@@ -303,7 +303,7 @@
     ///
     /// __Panics__ if the input is empty or if pairwise orderings are undefined
     /// (this occurs in presence of NaN values)
-    fn predict_ref<'a>(&'a self, x: &ArrayBase<D, Ix2>) -> Array1<usize> {
+    fn predict_ref(&self, x: &ArrayBase<D, Ix2>) -> Array1<usize> {
         let joint_log_likelihood = self.joint_log_likelihood(x.view());
 
         // We store the classes and likelihood info in a vec and matrix
diff --git a/algorithms/linfa-clustering/Cargo.toml b/algorithms/linfa-clustering/Cargo.toml
index d12006dc3..bd8d38005 100644
--- a/algorithms/linfa-clustering/Cargo.toml
+++ b/algorithms/linfa-clustering/Cargo.toml
@@ -34,8 +34,8 @@ ndarray-rand = "0.13"
 ndarray-stats = "0.4"
 num-traits = "0.2"
 rand_isaac = "0.3"
+thiserror = "1"
 partitions = "0.2.4"
-
 linfa = { version = "0.3.1", path = "../..", features = ["ndarray-linalg"] }
 
 [dev-dependencies]
diff --git a/algorithms/linfa-clustering/src/appx_dbscan/hyperparameters.rs b/algorithms/linfa-clustering/src/appx_dbscan/hyperparameters.rs
index c9476ae5e..4b624bf6a 100644
--- a/algorithms/linfa-clustering/src/appx_dbscan/hyperparameters.rs
+++ b/algorithms/linfa-clustering/src/appx_dbscan/hyperparameters.rs
@@ -99,7 +99,7 @@ impl<F: Float> AppxDbscanHyperParams<F> {
     }
 
     fn build(tolerance: F, min_points: usize, slack: F) -> Self {
-        if tolerance <= F::cast(0.) {
+        if tolerance <= F::zero() {
             panic!("`tolerance` must be greater than 0!");
         }
         // There is always at least one neighbor to a point (itself)
         if min_points <= 1 {
             panic!("`min_points` must be greater than 1!");
         }
-        if slack <= F::cast(0.) {
+        if slack <= F::zero() {
             panic!("`slack` must be greater than 0!");
         }
         Self {
-            tolerance: tolerance,
-            min_points: min_points,
-            slack: slack,
+            tolerance,
+            min_points,
+            slack,
             appx_tolerance: tolerance * (F::one() + slack),
         }
     }
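The gaussian_mixture and k_means error types that follow (and the ica, pls, and logistic ones later) are all migrated with the same recipe: derive `thiserror::Error`, move the `Display` strings into `#[error(...)]` attributes, and add `#[from]` conversions so fallible calls such as `nk.min()?` can bubble up through `Fit::fit`. A minimal sketch of the recipe, using a placeholder `MyError` whose variants mirror those in the diff:

```rust
use thiserror::Error;

// Placeholder enum mirroring the shape of the migrated GmmError/KMeansError
#[derive(Error, Debug)]
pub enum MyError {
    // the display string lives in the attribute, replacing a manual `Display` impl
    #[error("Invalid value encountered: {0}")]
    InvalidValue(String),
    // `#[from]` generates the `From` impl that makes `?` work on upstream errors
    #[error(transparent)]
    LinfaError(#[from] linfa::error::Error),
    #[error(transparent)]
    MinMaxError(#[from] ndarray_stats::errors::MinMaxError),
}

pub type Result<T> = std::result::Result<T, MyError>;
```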
diff --git a/algorithms/linfa-clustering/src/gaussian_mixture/algorithm.rs b/algorithms/linfa-clustering/src/gaussian_mixture/algorithm.rs
index 70e6802ed..2388d3006 100644
--- a/algorithms/linfa-clustering/src/gaussian_mixture/algorithm.rs
+++ b/algorithms/linfa-clustering/src/gaussian_mixture/algorithm.rs
@@ -215,10 +215,10 @@ impl<F: Float> GaussianMixtureModel<F> {
         reg_covar: F,
     ) -> Result<(Array1<F>, Array2<F>, Array3<F>)> {
         let nk = resp.sum_axis(Axis(0));
-        if nk.min().unwrap() < &(F::cast(10.) * F::epsilon()) {
+        if nk.min()? < &(F::cast(10.) * F::epsilon()) {
             return Err(GmmError::EmptyCluster(format!(
                 "Cluster #{} has no more point. Consider decreasing number of clusters or change initialization.",
-                nk.argmin().unwrap() + 1
+                nk.argmin()? + 1
             )));
         }
@@ -400,12 +400,12 @@ impl<F: Float> GaussianMixtureModel<F> {
     }
 }
 
-impl<'a, F: Float, R: Rng + SeedableRng + Clone, D: Data<Elem = F>, T> Fit<'a, ArrayBase<D, Ix2>, T>
-    for GmmHyperParams<F, R>
+impl<F: Float, R: Rng + SeedableRng + Clone, D: Data<Elem = F>, T>
+    Fit<ArrayBase<D, Ix2>, T, GmmError> for GmmHyperParams<F, R>
 {
-    type Object = Result<GaussianMixtureModel<F>>;
+    type Object = GaussianMixtureModel<F>;
 
-    fn fit(&self, dataset: &DatasetBase<ArrayBase<D, Ix2>, T>) -> Self::Object {
+    fn fit(&self, dataset: &DatasetBase<ArrayBase<D, Ix2>, T>) -> Result<Self::Object> {
         self.validate()?;
         let observations = dataset.records().view();
         let mut gmm = GaussianMixtureModel::<F>::new(self, dataset, self.rng())?;
@@ -488,7 +488,7 @@ mod tests {
     }
     impl MultivariateNormal {
         pub fn new(mean: &ArrayView1<f64>, covariance: &ArrayView2<f64>) -> LAResult<Self> {
-            let lower = covariance.cholesky(UPLO::Lower).unwrap();
+            let lower = covariance.cholesky(UPLO::Lower)?;
             Ok(MultivariateNormal {
                 mean: mean.to_owned(),
                 covariance: covariance.to_owned(),
diff --git a/algorithms/linfa-clustering/src/gaussian_mixture/errors.rs b/algorithms/linfa-clustering/src/gaussian_mixture/errors.rs
index 0ae0859f8..9b62101b1 100644
--- a/algorithms/linfa-clustering/src/gaussian_mixture/errors.rs
+++ b/algorithms/linfa-clustering/src/gaussian_mixture/errors.rs
@@ -1,58 +1,37 @@
 use crate::k_means::KMeansError;
 use ndarray_linalg::error::LinalgError;
-use std::error::Error;
-use std::fmt::{self, Display};
-
+use thiserror::Error;
 pub type Result<T> = std::result::Result<T, GmmError>;
 
 /// An error when modeling a GMM algorithm
-#[derive(Debug)]
+#[derive(Error, Debug)]
 pub enum GmmError {
     /// When any of the hyperparameters are set the wrong value
+    #[error("Invalid value encountered: {0}")]
     InvalidValue(String),
     /// Errors encountered during linear algebra operations
-    LinalgError(LinalgError),
+    #[error(
+        "Linalg Error: \
+    Fitting the mixture model failed because some components have \
+    ill-defined empirical covariance (for instance caused by singleton \
+    or collapsed samples). Try to decrease the number of components, \
+    or increase reg_covar. 
Error: {0}" + )] + LinalgError(#[from] LinalgError), /// When a cluster has no more data point while fitting GMM + #[error("Fitting failed: {0}")] EmptyCluster(String), /// When lower bound computation fails + #[error("Fitting failed: {0}")] LowerBoundError(String), /// When fitting EM algorithm does not converge + #[error("Fitting failed: {0}")] NotConverged(String), /// When initial KMeans fails - KMeansError(String), -} - -impl Display for GmmError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::InvalidValue(message) => write!(f, "Invalid value encountered: {}", message), - Self::LinalgError(error) => write!( - f, - "Linalg Error: \ - Fitting the mixture model failed because some components have \ - ill-defined empirical covariance (for instance caused by singleton \ - or collapsed samples). Try to decrease the number of components, \ - or increase reg_covar. Error: {}", - error - ), - Self::EmptyCluster(message) => write!(f, "Fitting failed: {}", message), - Self::LowerBoundError(message) => write!(f, "Fitting failed: {}", message), - Self::NotConverged(message) => write!(f, "Fitting failed: {}", message), - Self::KMeansError(message) => write!(f, "Initial KMeans failed: {}", message), - } - } -} - -impl Error for GmmError {} - -impl From for GmmError { - fn from(error: LinalgError) -> GmmError { - GmmError::LinalgError(error) - } -} - -impl From for GmmError { - fn from(error: KMeansError) -> GmmError { - GmmError::KMeansError(error.to_string()) - } + #[error("Initial KMeans failed: {0}")] + KMeansError(#[from] KMeansError), + #[error(transparent)] + LinfaError(#[from] linfa::error::Error), + #[error(transparent)] + MinMaxError(#[from] ndarray_stats::errors::MinMaxError), } diff --git a/algorithms/linfa-clustering/src/k_means/algorithm.rs b/algorithms/linfa-clustering/src/k_means/algorithm.rs index 8f5aabea0..aefa62c9b 100644 --- a/algorithms/linfa-clustering/src/k_means/algorithm.rs +++ b/algorithms/linfa-clustering/src/k_means/algorithm.rs @@ -215,17 +215,17 @@ impl KMeans { } } -impl<'a, F: Float, R: Rng + Clone + SeedableRng, D: Data, T> Fit<'a, ArrayBase, T> - for KMeansHyperParams +impl, T> + Fit, T, KMeansError> for KMeansHyperParams { - type Object = Result>; + type Object = KMeans; /// Given an input matrix `observations`, with shape `(n_observations, n_features)`, /// `fit` identifies `n_clusters` centroids based on the training data distribution. /// /// An instance of `KMeans` is returned. 
/// - fn fit(&self, dataset: &DatasetBase, T>) -> Self::Object { + fn fit(&self, dataset: &DatasetBase, T>) -> Result { let mut rng = self.rng(); let observations = dataset.records().view(); let n_samples = dataset.nsamples(); diff --git a/algorithms/linfa-clustering/src/k_means/errors.rs b/algorithms/linfa-clustering/src/k_means/errors.rs index cad2f8548..f63474296 100644 --- a/algorithms/linfa-clustering/src/k_means/errors.rs +++ b/algorithms/linfa-clustering/src/k_means/errors.rs @@ -1,27 +1,19 @@ -use std::error::Error; -use std::fmt::{self, Display}; +use thiserror::Error; pub type Result = std::result::Result; /// An error when modeling a KMeans algorithm -#[derive(Debug)] +#[derive(Error, Debug)] pub enum KMeansError { /// When any of the hyperparameters are set the wrong value + #[error("Invalid value encountered: {0}")] InvalidValue(String), /// When inertia computation fails + #[error("Fitting failed: {0}")] InertiaError(String), /// When fitting algorithm does not converge + #[error("Fitting failed: {0}")] NotConverged(String), + #[error(transparent)] + LinfaError(#[from] linfa::error::Error), } - -impl Display for KMeansError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::InvalidValue(message) => write!(f, "Invalid value encountered: {}", message), - Self::InertiaError(message) => write!(f, "Fitting failed: {}", message), - Self::NotConverged(message) => write!(f, "Fitting failed: {}", message), - } - } -} - -impl Error for KMeansError {} diff --git a/algorithms/linfa-elasticnet/examples/elasticnet_cv.rs b/algorithms/linfa-elasticnet/examples/elasticnet_cv.rs new file mode 100644 index 000000000..751ff9529 --- /dev/null +++ b/algorithms/linfa-elasticnet/examples/elasticnet_cv.rs @@ -0,0 +1,26 @@ +use linfa::prelude::*; +use linfa_elasticnet::{ElasticNet, Result}; + +fn main() -> Result<()> { + // load Diabetes dataset (mutable to allow fast k-folding) + let mut dataset = linfa_datasets::diabetes(); + + // parameters to compare + let ratios = vec![0.1, 0.2, 0.5, 0.7, 1.0]; + + // create a model for each parameter + let models = ratios + .iter() + .map(|ratio| ElasticNet::params().penalty(0.3).l1_ratio(*ratio)) + .collect::>(); + + // get the mean r2 validation score across all folds for each model + let r2_values = + dataset.cross_validate(5, &models, |prediction, truth| prediction.r2(&truth))?; + + for (ratio, r2) in ratios.iter().zip(r2_values.iter()) { + println!("L1 ratio: {}, r2 score: {}", ratio, r2); + } + + Ok(()) +} diff --git a/algorithms/linfa-elasticnet/src/algorithm.rs b/algorithms/linfa-elasticnet/src/algorithm.rs index 8316fea22..a63ef1ee7 100644 --- a/algorithms/linfa-elasticnet/src/algorithm.rs +++ b/algorithms/linfa-elasticnet/src/algorithm.rs @@ -10,13 +10,13 @@ use linfa::{ use super::{ElasticNet, ElasticNetParams, Error, Result}; -impl<'a, F, D, T> Fit<'a, ArrayBase, T> for ElasticNetParams +impl Fit, T, crate::error::Error> for ElasticNetParams where F: Float + Lapack, D: Data, T: AsTargets, { - type Object = Result>; + type Object = ElasticNet; /// Fit an elastic net model given a feature matrix `x` and a target /// variable `y`. @@ -28,7 +28,7 @@ where /// Returns a `FittedElasticNet` object which contains the fitted /// parameters and can be used to `predict` values of the target variable /// for new feature values. 
- fn fit(&self, dataset: &DatasetBase, T>) -> Result> { + fn fit(&self, dataset: &DatasetBase, T>) -> Result { self.validate_params()?; let target = dataset.try_single_target()?; diff --git a/algorithms/linfa-ica/Cargo.toml b/algorithms/linfa-ica/Cargo.toml index 91ca5330f..4eb0f7290 100644 --- a/algorithms/linfa-ica/Cargo.toml +++ b/algorithms/linfa-ica/Cargo.toml @@ -30,8 +30,9 @@ ndarray-rand = "0.13" ndarray-stats = "0.4" num-traits = "0.2" rand_isaac = "0.3" +thiserror = "1" -linfa = { version = "0.3.1", path = "../.." } +linfa = { version = "0.3.1", path = "../..", features = ["ndarray-linalg"] } [dev-dependencies] ndarray-npy = { version = "0.7", default-features = false } diff --git a/algorithms/linfa-ica/src/error.rs b/algorithms/linfa-ica/src/error.rs index 3f14bd73c..7d8c614ce 100644 --- a/algorithms/linfa-ica/src/error.rs +++ b/algorithms/linfa-ica/src/error.rs @@ -1,38 +1,24 @@ use ndarray_linalg::error::LinalgError; -use std::error::Error; -use std::fmt::{self, Display}; +use thiserror::Error; pub type Result = std::result::Result; /// An error when modeling FastICA algorithm -#[derive(Debug)] +#[derive(Error, Debug)] pub enum FastIcaError { + /// When there are no samples in the provided dataset + #[error("Dataset must contain at least one sample")] + NotEnoughSamples, /// When any of the hyperparameters are set the wrong value + #[error("Invalid value encountered: {0}")] InvalidValue(String), /// If we fail to compute any components of the SVD decomposition /// due to an Ill-Conditioned matrix + #[error("SVD Decomposition failed, X could be an Ill-Conditioned matrix")] SvdDecomposition, /// Errors encountered during linear algebra operations - Linalg(LinalgError), -} - -impl Display for FastIcaError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::InvalidValue(message) => write!(f, "Invalid value encountered: {}", message), - Self::SvdDecomposition => write!( - f, - "SVD Decomposition failed, X could be an Ill-Conditioned matrix", - ), - Self::Linalg(error) => write!(f, "Linalg Error: {}", error), - } - } -} - -impl Error for FastIcaError {} - -impl From for FastIcaError { - fn from(error: LinalgError) -> FastIcaError { - FastIcaError::Linalg(error) - } + #[error("Linalg Error: {0}")] + Linalg(#[from] LinalgError), + #[error(transparent)] + LinfaError(#[from] linfa::error::Error), } diff --git a/algorithms/linfa-ica/src/fast_ica.rs b/algorithms/linfa-ica/src/fast_ica.rs index 657823bf3..839693e41 100644 --- a/algorithms/linfa-ica/src/fast_ica.rs +++ b/algorithms/linfa-ica/src/fast_ica.rs @@ -1,8 +1,12 @@ //! 
Fast algorithm for Independent Component Analysis (ICA) -use linfa::{dataset::DatasetBase, traits::*, Float}; +use linfa::{ + dataset::{DatasetBase, Records, WithLapack, WithoutLapack}, + traits::*, + Float, +}; use ndarray::{Array, Array1, Array2, ArrayBase, Axis, Data, Ix2}; -use ndarray_linalg::{eigh::Eigh, solveh::UPLO, svd::SVD, Lapack}; +use ndarray_linalg::{eigh::Eigh, solveh::UPLO, svd::SVD}; use ndarray_rand::{rand::SeedableRng, rand_distr::Uniform, RandomExt}; use ndarray_stats::QuantileExt; use rand_isaac::Isaac64Rng; @@ -75,8 +79,8 @@ impl FastIca { } } -impl<'a, F: Float + Lapack, D: Data, T> Fit<'a, ArrayBase, T> for FastIca { - type Object = Result>; +impl, T> Fit, T, FastIcaError> for FastIca { + type Object = FittedFastIca; /// Fit the model /// @@ -87,9 +91,12 @@ impl<'a, F: Float + Lapack, D: Data, T> Fit<'a, ArrayBase, T> /// /// If the `alpha` value set for [`GFunc::Logcosh`] is not between 1 and 2 /// inclusive - fn fit(&self, dataset: &DatasetBase, T>) -> Result> { + fn fit(&self, dataset: &DatasetBase, T>) -> Result { let x = &dataset.records; - let (nsamples, nfeatures) = (x.nrows(), x.ncols()); + let (nsamples, nfeatures) = (x.nsamples(), x.nfeatures()); + if dataset.nsamples() == 0 { + return Err(FastIcaError::NotEnoughSamples); + } // If the number of components is not set, we take the minimum of // the number of rows and columns @@ -105,6 +112,7 @@ impl<'a, F: Float + Lapack, D: Data, T> Fit<'a, ArrayBase, T> } // We center the input by subtracting the mean of its features + // safe unwrap because we already returned an error on zero samples let xmean = x.mean_axis(Axis(0)).unwrap(); let mut xcentered = x - &xmean.view().insert_axis(Axis(0)); @@ -113,20 +121,23 @@ impl<'a, F: Float + Lapack, D: Data, T> Fit<'a, ArrayBase, T> // We whiten the matrix to remove any potential correlation between // the components + let xcentered = xcentered.with_lapack(); let k = match xcentered.svd(true, false)? 
{ (Some(u), s, _) => { - let s = s.mapv(|x| F::cast(x)); - (u.slice(s![.., ..nsamples.min(nfeatures)]).to_owned() / s) + let s = s.mapv(|x| F::Lapack::cast(x)); + (u.slice_move(s![.., ..nsamples.min(nfeatures)]) / s) .t() .slice(s![..ncomponents, ..]) .to_owned() } _ => return Err(FastIcaError::SvdDecomposition), }; - let mut xwhitened = k.dot(&xcentered); + + let mut xwhitened = k.dot(&xcentered).without_lapack(); + let k = k.without_lapack(); // We multiply the matrix with root of the number of records - let nsamples_sqrt = F::cast((nsamples as f64).sqrt()); + let nsamples_sqrt = F::cast(nsamples).sqrt(); xwhitened.mapv_inplace(|x| x * nsamples_sqrt); // We initialize the de-mixing matrix with a uniform distribution @@ -152,7 +163,7 @@ impl<'a, F: Float + Lapack, D: Data, T> Fit<'a, ArrayBase, T> } } -impl FastIca { +impl FastIca { // Parallel FastICA, Optimization step fn ica_parallel(&self, x: &Array2, w: &Array2) -> Result> { let mut w = Self::sym_decorrelation(&w)?; @@ -173,9 +184,9 @@ impl FastIca { .zip(w.outer_iter()) .map(|(a, b)| a.dot(&b)) .collect::>() - .mapv(num_traits::Float::abs) + .mapv(|x| x.abs()) .mapv(|x| x - F::cast(1.)) - .mapv(num_traits::Float::abs) + .mapv(|x| x.abs()) .max() .unwrap(); @@ -193,17 +204,18 @@ impl FastIca { // // W <- (W * W.T)^{-1/2} * W fn sym_decorrelation(w: &Array2) -> Result> { - let (eig_val, eig_vec) = w.dot(&w.t()).eigh(UPLO::Upper)?; + let (eig_val, eig_vec) = w.dot(&w.t()).with_lapack().eigh(UPLO::Upper)?; let eig_val = eig_val.mapv(|x| F::cast(x)); + let eig_vec = eig_vec.without_lapack(); let tmp = &eig_vec - * &(eig_val.mapv(num_traits::Float::sqrt).mapv(|x| { + * &(eig_val.mapv(|x| x.sqrt()).mapv(|x| { // We lower bound the float value at 1e-7 when taking the reciprocal let lower_bound = F::cast(1e-7); if x < lower_bound { - return num_traits::Float::recip(lower_bound); + return lower_bound.recip(); } - num_traits::Float::recip(x) + x.recip() })) .insert_axis(Axis(0)); diff --git a/algorithms/linfa-linear/src/error.rs b/algorithms/linfa-linear/src/error.rs index 45dd2a51c..b996943e8 100644 --- a/algorithms/linfa-linear/src/error.rs +++ b/algorithms/linfa-linear/src/error.rs @@ -11,4 +11,10 @@ pub enum LinearError { Argmin(#[from] argmin::core::Error), #[error(transparent)] BaseCrate(#[from] linfa::Error), + #[error("At least one sample needed")] + NotEnoughSamples, + #[error("At least one target needed")] + NotEnoughTargets, + #[error(transparent)] + LinalgError(#[from] ndarray_linalg::error::LinalgError), } diff --git a/algorithms/linfa-linear/src/glm.rs b/algorithms/linfa-linear/src/glm.rs index c51c29c7c..a7009f283 100644 --- a/algorithms/linfa-linear/src/glm.rs +++ b/algorithms/linfa-linear/src/glm.rs @@ -3,7 +3,7 @@ mod distribution; mod link; -use crate::error::Result; +use crate::error::{LinearError, Result}; use crate::float::{ArgminParam, Float}; use distribution::TweedieDistribution; pub use link::Link; @@ -119,12 +119,12 @@ impl TweedieRegressor { } } -impl, T: AsTargets> Fit<'_, ArrayBase, T> +impl, T: AsTargets> Fit, T, LinearError> for TweedieRegressor { - type Object = Result>; + type Object = FittedTweedieRegressor; - fn fit(&self, ds: &DatasetBase, T>) -> Result> { + fn fit(&self, ds: &DatasetBase, T>) -> Result { let (x, y) = (ds.records(), ds.try_single_target()?); let dist = TweedieDistribution::new(self.power)?; diff --git a/algorithms/linfa-linear/src/glm/distribution.rs b/algorithms/linfa-linear/src/glm/distribution.rs index d91a8cc8c..c16554d3a 100644 --- a/algorithms/linfa-linear/src/glm/distribution.rs +++ 
b/algorithms/linfa-linear/src/glm/distribution.rs @@ -1,4 +1,4 @@ -use crate::float::Float; +use linfa::Float; use ndarray::Zip; use ndarray::{Array1, ArrayView1}; @@ -42,48 +42,43 @@ impl TweedieDistribution { } // Returns `true` if y is in the valid range - pub fn in_range(&self, y: &ArrayView1) -> bool { + pub fn in_range(&self, y: &ArrayView1) -> bool { if self.inclusive { - return y.iter().all(|&x| x >= A::from(self.lower_bound).unwrap()); + return y.iter().all(|&x| x >= F::cast(self.lower_bound)); } - y.iter().all(|&x| x > A::from(self.lower_bound).unwrap()) + y.iter().all(|&x| x > F::cast(self.lower_bound)) } - fn unit_variance(&self, ypred: ArrayView1) -> Array1 { + fn unit_variance(&self, ypred: ArrayView1) -> Array1 { // ypred ^ power - ypred.mapv(|x| num_traits::Float::powf(x, A::from(self.power).unwrap())) + ypred.mapv(|x| x.powf(F::cast(self.power))) } - fn unit_deviance(&self, y: ArrayView1, ypred: ArrayView1) -> Result> { + fn unit_deviance(&self, y: ArrayView1, ypred: ArrayView1) -> Result> { match self.power { power if power < 0. => { let mut left = y.mapv(|x| { - if x < A::from(0.).unwrap() { - return A::from(0.).unwrap(); + if x.is_negative() { + return F::zero(); } x }); left.mapv_inplace(|x| { - num_traits::Float::powf(x, A::from(2. - self.power).unwrap()) - / A::from((1. - self.power) * (2. - self.power)).unwrap() + x.powf(F::cast(2. - self.power)) + / F::cast((1. - self.power) * (2. - self.power)) }); - let middle = &y - * &ypred.mapv(|x| { - num_traits::Float::powf(x, A::from(1. - self.power).unwrap()) - / A::from(1. - power).unwrap() - }); + let middle = + &y * &ypred.mapv(|x| x.powf(F::cast(1. - self.power)) / F::cast(1. - power)); - let right = ypred.mapv(|x| { - num_traits::Float::powf(x, A::from(2. - self.power).unwrap()) - / A::from(2. - self.power).unwrap() - }); + let right = + ypred.mapv(|x| x.powf(F::cast(2. - self.power)) / F::cast(2. - self.power)); - Ok((left - middle + right).mapv(|x| A::from(2.).unwrap() * x)) + Ok((left - middle + right).mapv(|x| F::cast(2.) * x)) } // Normal distribution // (y - ypred)^2 - power if power == 0. => Ok((&y - &ypred).mapv(|x| num_traits::Float::powi(x, 2))), + power if power == 0. => Ok((&y - &ypred).mapv(|x| x * x)), power if power < 1. => Err(linfa::Error::Parameters(format!( "Power value cannot be between 0 and 1, got: {}", power @@ -93,10 +88,10 @@ impl TweedieDistribution { power if (power - 1.).abs() < 1e-6 => { let mut div = &y / &ypred; Zip::from(&mut div).and(y).apply(|y, &x| { - if x == A::from(0.).unwrap() { - *y = A::from(0.).unwrap(); + if x == F::zero() { + *y = F::zero(); } else { - *y = A::from(2.).unwrap() * (x * num_traits::Float::ln(*y)); + *y = F::cast(2.) * (x * y.ln()); } }); Ok(div - y + ypred) @@ -104,49 +99,41 @@ impl TweedieDistribution { // Gamma distribution // 2 * (log(ypred / y) + (y / ypred) - 1) power if (power - 2.).abs() < 1e-6 => { - let mut temp = (&ypred / &y).mapv(num_traits::Float::ln) + (&y / &ypred); - temp.mapv_inplace(|x| x - A::from(1.).unwrap()); - Ok(temp.mapv(|x| A::from(2.).unwrap() * x)) + let mut temp = (&ypred / &y).mapv(|x| x.ln()) + (&y / &ypred); + temp.mapv_inplace(|x| x - F::one()); + Ok(temp.mapv(|x| F::cast(2.) * x)) } power => { - let left = y.mapv(|x| { - num_traits::Float::powf(x, A::from(2. - power).unwrap()) - / A::from((1. - power) * (2. - power)).unwrap() - }); + let left = + y.mapv(|x| x.powf(F::cast(2. - power)) / F::cast((1. - power) * (2. - power))); - let middle = &y - * &ypred.mapv(|x| { - num_traits::Float::powf(x, A::from(1. 
- power).unwrap()) - / A::from(1. - power).unwrap() - }); + let middle = + &y * &ypred.mapv(|x| x.powf(F::cast(1. - power)) / F::cast(1. - power)); - let right = ypred.mapv(|x| { - num_traits::Float::powf(x, A::from(2. - power).unwrap()) - / A::from(2. - power).unwrap() - }); + let right = ypred.mapv(|x| x.powf(F::cast(2. - power)) / F::cast(2. - power)); - Ok((left - middle + right).mapv(|x| A::from(2.).unwrap() * x)) + Ok((left - middle + right).mapv(|x| F::cast(2.) * x)) } } } - fn unit_deviance_derivative( + fn unit_deviance_derivative( &self, - y: ArrayView1, - ypred: ArrayView1, - ) -> Array1 { - ((&y - &ypred) / &self.unit_variance(ypred)).mapv(|x| A::from(-2.).unwrap() * x) + y: ArrayView1, + ypred: ArrayView1, + ) -> Array1 { + ((&y - &ypred) / &self.unit_variance(ypred)).mapv(|x| F::cast(-2.) * x) } - pub fn deviance(&self, y: ArrayView1, ypred: ArrayView1) -> Result { + pub fn deviance(&self, y: ArrayView1, ypred: ArrayView1) -> Result { Ok(self.unit_deviance(y, ypred)?.sum()) } - pub fn deviance_derivative( + pub fn deviance_derivative( &self, - y: ArrayView1, - ypred: ArrayView1, - ) -> Array1 { + y: ArrayView1, + ypred: ArrayView1, + ) -> Array1 { self.unit_deviance_derivative(y, ypred) } } diff --git a/algorithms/linfa-linear/src/glm/link.rs b/algorithms/linfa-linear/src/glm/link.rs index ebde1a8b5..4184fd5b0 100644 --- a/algorithms/linfa-linear/src/glm/link.rs +++ b/algorithms/linfa-linear/src/glm/link.rs @@ -89,9 +89,9 @@ impl LinkFn for IdentityLink { struct LogLink; -impl LinkFn for LogLink { +impl LinkFn for LogLink { fn link(ypred: &Array1) -> Array1 { - ypred.mapv(|x| num_traits::Float::ln(x)) + ypred.mapv(|x| x.ln()) } fn link_derivative(ypred: &Array1) -> Array1 { @@ -106,40 +106,36 @@ impl LinkFn for LogLink { } fn inverse(lin_pred: &Array1) -> Array1 { - lin_pred.mapv(|x| num_traits::Float::exp(x)) + lin_pred.mapv(|x| x.exp()) } fn inverse_derivative(lin_pred: &Array1) -> Array1 { - lin_pred.mapv(|x| num_traits::Float::exp(x)) + lin_pred.mapv(|x| x.exp()) } } struct LogitLink; -impl LinkFn for LogitLink { +impl LinkFn for LogitLink { fn link(ypred: &Array1) -> Array1 { // logit(ypred) - ypred.mapv(|x| num_traits::Float::ln(x / (A::from(1.).unwrap() - x))) + ypred.mapv(|x| (x / (A::one() - x)).ln()) } fn link_derivative(ypred: &Array1) -> Array1 { // 1 / (ypred * (1-ypred) - ypred.mapv(|x| A::from(1.).unwrap() / (x * (A::from(1.).unwrap() - x))) + ypred.mapv(|x| A::one() / (x * (A::one() - x))) } fn inverse(lin_pred: &Array1) -> Array1 { // expit(lin_pred) - lin_pred.mapv(|x| { - A::from(1.).unwrap() / (A::from(1.).unwrap() + num_traits::Float::exp(x.neg())) - }) + lin_pred.mapv(|x| A::one() / (A::one() + x.neg().exp())) } fn inverse_derivative(lin_pred: &Array1) -> Array1 { // expit(lin_pred) * (1 - expit(lin_pred)) - let expit = lin_pred.mapv(|x| { - A::from(1.).unwrap() / (A::from(1.).unwrap() + num_traits::Float::exp(x.neg())) - }); - let one_minus_expit = expit.mapv(|x| A::from(1.).unwrap() - x); + let expit = lin_pred.mapv(|x| A::one() / (A::one() + x.neg().exp())); + let one_minus_expit = expit.mapv(|x| A::one() - x); expit * one_minus_expit } } diff --git a/algorithms/linfa-linear/src/ols.rs b/algorithms/linfa-linear/src/ols.rs index 00190c182..de3cb6cc4 100644 --- a/algorithms/linfa-linear/src/ols.rs +++ b/algorithms/linfa-linear/src/ols.rs @@ -1,5 +1,6 @@ //! 
Ordinary Least Squares #![allow(non_snake_case)] +use crate::error::{LinearError, Result}; use ndarray::{Array1, Array2, ArrayBase, Axis, Data, Ix1, Ix2}; use ndarray_linalg::{Lapack, Scalar, Solve}; use ndarray_stats::SummaryStatisticsExt; @@ -117,10 +118,10 @@ impl LinearRegression { } } -impl<'a, F: Float, D: Data, T: AsTargets> Fit<'a, ArrayBase, T> +impl, T: AsTargets> Fit, T, LinearError> for LinearRegression { - type Object = Result, String>; + type Object = FittedLinearRegression; /// Fit a linear regression model given a feature matrix `X` and a target /// variable `y`. @@ -132,12 +133,9 @@ impl<'a, F: Float, D: Data, T: AsTargets> Fit<'a, ArrayBase< /// Returns a `FittedLinearRegression` object which contains the fitted /// parameters and can be used to `predict` values of the target variable /// for new feature values. - fn fit( - &self, - dataset: &DatasetBase, T>, - ) -> Result, String> { + fn fit(&self, dataset: &DatasetBase, T>) -> Result { let X = dataset.records(); - let y = dataset.try_single_target().unwrap(); + let y = dataset.try_single_target()?; let (n_samples, _) = X.dim(); @@ -151,11 +149,9 @@ impl<'a, F: Float, D: Data, T: AsTargets> Fit<'a, ArrayBase< // to the X_offset and y_offset let X_offset: Array1 = X .mean_axis(Axis(0)) - .ok_or_else(|| String::from("cannot compute mean of X"))?; + .ok_or_else(|| LinearError::NotEnoughSamples)?; let X_centered: Array2 = X - &X_offset; - let y_offset: F = y - .mean() - .ok_or_else(|| String::from("cannot compute mean of y"))?; + let y_offset: F = y.mean().ok_or_else(|| LinearError::NotEnoughTargets)?; let y_centered: Array1 = &y - y_offset; let params: Array1 = compute_params(&X_centered, &y_centered, self.options.should_normalize())?; @@ -176,7 +172,7 @@ fn compute_params( X: &ArrayBase, y: &ArrayBase, normalize: bool, -) -> Result, String> +) -> Result> where F: Float, B: Data, @@ -196,10 +192,7 @@ where /// Solve the overconstrained model Xb = y by solving X^T X b = X^t y, /// this is (mathematically, not numerically) equivalent to computing /// the solution with the Moore-Penrose pseudo-inverse. -fn solve_normal_equation( - X: &ArrayBase, - y: &ArrayBase, -) -> Result, String> +fn solve_normal_equation(X: &ArrayBase, y: &ArrayBase) -> Result> where F: Float, B: Data, @@ -207,9 +200,7 @@ where { let rhs = X.t().dot(y); let linear_operator = X.t().dot(X); - linear_operator - .solve_into(rhs) - .map_err(|err| format! {"{}", err}) + linear_operator.solve_into(rhs).map_err(|err| err.into()) } /// View the fitted parameters and make predictions with a fitted diff --git a/algorithms/linfa-logistic/Cargo.toml b/algorithms/linfa-logistic/Cargo.toml index 144c0261a..495533637 100644 --- a/algorithms/linfa-logistic/Cargo.toml +++ b/algorithms/linfa-logistic/Cargo.toml @@ -19,6 +19,7 @@ ndarray-linalg = "0.13" num-traits = "0.2" argmin = { version = "0.4", features = ["ndarrayl"] } serde = "1.0" +thiserror = "1" linfa = { version = "0.3.1", path = "../.." } diff --git a/algorithms/linfa-logistic/examples/logistic_cv.rs b/algorithms/linfa-logistic/examples/logistic_cv.rs new file mode 100644 index 000000000..0219ead31 --- /dev/null +++ b/algorithms/linfa-logistic/examples/logistic_cv.rs @@ -0,0 +1,34 @@ +use linfa::prelude::*; +use linfa_logistic::error::Result; +use linfa_logistic::LogisticRegression; + +fn main() -> Result<()> { + // Load dataset. 
Mutability is needed for fast cross validation + let mut dataset = + linfa_datasets::winequality().map_targets(|x| if *x > 6 { "good" } else { "bad" }); + + // define a sequence of models to compare. In this case the + // models will differ by the amount of l2 regularization + let alphas = vec![0.1, 1., 10.]; + let models: Vec<_> = alphas + .iter() + .map(|alpha| { + LogisticRegression::default() + .alpha(*alpha) + .max_iterations(150) + }) + .collect(); + + // use cross validation to compute the validation accuracy of each model. The + // accuracy of each model will be averaged across the folds, 5 in this case + let accuracies = dataset.cross_validate(5, &models, |prediction, truth| { + Ok(prediction.confusion_matrix(truth)?.accuracy()) + })?; + + // display the accuracy of the models along with their regularization coefficient + for (alpha, accuracy) in alphas.iter().zip(accuracies.iter()) { + println!("Alpha: {}, accuracy: {} ", alpha, accuracy); + } + + Ok(()) +} diff --git a/algorithms/linfa-logistic/src/error.rs b/algorithms/linfa-logistic/src/error.rs new file mode 100644 index 000000000..54ae7c5a6 --- /dev/null +++ b/algorithms/linfa-logistic/src/error.rs @@ -0,0 +1,22 @@ +use thiserror::Error; +pub type Result = std::result::Result; + +#[derive(Error, Debug)] +pub enum Error { + #[error(transparent)] + LinfaError(#[from] linfa::Error), + #[error("Expected exactly two classes for logistic regression")] + WrongNumberOfClasses, + #[error(transparent)] + ArgMinError(#[from] argmin::core::Error), + #[error("Expected `x` and `y` to have same number of rows, got {0} != {1}")] + MismatchedShapes(usize, usize), + #[error("Values must be finite and not `Inf`, `-Inf` or `NaN`")] + InvalidValues, + #[error("gradient_tolerance must be a positive, finite number")] + InvalidGradientTolerance, + #[error("Size of initial parameter guess must be the same as the number of columns in the feature matrix `x`")] + InvalidInitialParametersGuessSize, + #[error("Initial parameter guess must be finite")] + InvalidInitialParametersGuess, +} diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index cb299bbf5..5882fa5de 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -14,7 +14,11 @@ //! ```bash //! $ cargo run --example winequality //! ``` +//! + +pub mod error; +use crate::error::{Error, Result}; use argmin::prelude::*; use argmin::solver::linesearch::MoreThuenteLineSearch; use argmin::solver::quasinewton::lbfgs::LBFGS; @@ -138,11 +142,7 @@ impl LogisticRegression { /// i.e. any values are `Inf` or `NaN`, `y` doesn't have as many items as /// `x` has rows, or if other parameters (gradient_tolerance, alpha) have /// been set to inalid values. - fn fit( - &self, - x: &ArrayBase, - y: T, - ) -> Result, String> + fn fit(&self, x: &ArrayBase, y: T) -> Result> where A: Data, T: AsTargets, @@ -159,45 +159,38 @@ impl LogisticRegression { /// Ensure that `x` and `y` have the right shape and that all data and /// configuration parameters are finite. 
- fn validate_data( - &self, - x: &ArrayBase, - y: &ArrayBase, - ) -> Result<(), String> + fn validate_data(&self, x: &ArrayBase, y: &ArrayBase) -> Result<()> where A: Data, B: Data, { if x.shape()[0] != y.len() { - return Err( - "Incompatible shapes of data, expected `x` and `y` to have same number of rows" - .to_string(), - ); + return Err(Error::MismatchedShapes(x.shape()[0], y.len())); } if x.iter().any(|x| !x.is_finite()) || y.iter().any(|y| !y.is_finite()) || !self.alpha.is_finite() { - return Err("Values must be finite and not `Inf`, `-Inf` or `NaN`".to_string()); + return Err(Error::InvalidValues); } if !self.gradient_tolerance.is_finite() || self.gradient_tolerance <= F::zero() { - return Err("gradient_tolerance must be a positive, finite number".to_string()); + return Err(Error::InvalidGradientTolerance); } self.validate_init_params(x)?; Ok(()) } - fn validate_init_params(&self, x: &ArrayBase) -> Result<(), String> + fn validate_init_params(&self, x: &ArrayBase) -> Result<()> where A: Data, { if let Some((params, intercept)) = self.initial_params.as_ref() { let (_, n_features) = x.dim(); if n_features != params.dim() { - return Err("Size of initial parameter guess must be the same as the number of columns in the feature matrix `x`".to_string()); + return Err(Error::InvalidInitialParametersGuessSize); } if params.iter().any(|p| !p.is_finite()) || !intercept.is_finite() { - return Err("Initial parameter guess must be finite".to_string()); + return Err(Error::InvalidInitialParametersGuess); } } Ok(()) @@ -254,14 +247,14 @@ impl LogisticRegression { problem: LogisticRegressionProblem<'a, F, A>, solver: LBFGSType, init_params: Array1, - ) -> Result>, String> + ) -> Result>> where A: Data, { Executor::new(problem, solver, ArgminParam(init_params)) .max_iters(self.max_iterations) .run() - .map_err(|err| format!("Error running solver: {}", err)) + .map_err(|err| err.into()) } /// Take an ArgminResult and return a FittedLogisticRegression. 
@@ -269,7 +262,7 @@ impl LogisticRegression { &self, labels: ClassLabels, result: &ArgminResult>, - ) -> Result, String> + ) -> Result> where A: Data, C: PartialOrd + Clone, @@ -285,9 +278,9 @@ impl LogisticRegression { } impl<'a, C: 'a + PartialOrd + Clone, F: Float, D: Data, T: AsTargets> - Fit<'a, ArrayBase, T> for LogisticRegression + Fit, T, Error> for LogisticRegression { - type Object = Result, String>; + type Object = FittedLogisticRegression; /// Given a 2-dimensional feature matrix array `x` with shape /// (n_samples, n_features) and an array of target classes to predict, @@ -305,10 +298,7 @@ impl<'a, C: 'a + PartialOrd + Clone, F: Float, D: Data, T: AsTargets, T>, - ) -> Result, String> { + fn fit(&self, dataset: &DatasetBase, T>) -> Result { self.fit(dataset.records(), dataset.targets()) } } @@ -319,61 +309,57 @@ impl<'a, C: 'a + PartialOrd + Clone, F: Float, D: Data, T: AsTargets(y: T) -> Result<(ClassLabels, Array1), String> +fn label_classes(y: T) -> Result<(ClassLabels, Array1)> where F: Float, T: AsTargets, C: PartialOrd + Clone, { - match y.try_single_target() { - Err(_) => Err("Expected single target dataset".to_string()), - Ok(y_single_target) => { - let mut classes: Vec<&C> = vec![]; - let mut target_vec = vec![]; - let mut use_negative_label: bool = true; - for item in y_single_target { - if let Some(last_item) = classes.last() { - if *last_item != item { - use_negative_label = !use_negative_label; - } - } - if !classes.contains(&item) { - classes.push(item); - } - target_vec.push(if use_negative_label { - F::NEGATIVE_LABEL - } else { - F::POSITIVE_LABEL - }); - } - if classes.len() != 2 { - return Err("Expected exactly two classes for logistic regression".to_string()); + let y_single_target = y.try_single_target()?; + let mut classes: Vec<&C> = vec![]; + let mut target_vec = vec![]; + let mut use_negative_label: bool = true; + for item in y_single_target { + if let Some(last_item) = classes.last() { + if *last_item != item { + use_negative_label = !use_negative_label; } - let mut target_array = Array1::from(target_vec); - let labels = if classes[0] < classes[1] { - (F::NEGATIVE_LABEL, F::POSITIVE_LABEL) - } else { - // If we found the larger class first, flip the sign in the target - // vector, so that -1.0 is always the label for the smaller class - // and 1.0 the label for the larger class - target_array *= -F::one(); - (F::POSITIVE_LABEL, F::NEGATIVE_LABEL) - }; - Ok(( - vec![ - ClassLabel { - class: classes[0].clone(), - label: labels.0, - }, - ClassLabel { - class: classes[1].clone(), - label: labels.1, - }, - ], - target_array, - )) } + if !classes.contains(&item) { + classes.push(item); + } + target_vec.push(if use_negative_label { + F::NEGATIVE_LABEL + } else { + F::POSITIVE_LABEL + }); + } + if classes.len() != 2 { + return Err(Error::WrongNumberOfClasses); } + let mut target_array = Array1::from(target_vec); + let labels = if classes[0] < classes[1] { + (F::NEGATIVE_LABEL, F::POSITIVE_LABEL) + } else { + // If we found the larger class first, flip the sign in the target + // vector, so that -1.0 is always the label for the smaller class + // and 1.0 the label for the larger class + target_array *= -F::one(); + (F::POSITIVE_LABEL, F::NEGATIVE_LABEL) + }; + Ok(( + vec![ + ClassLabel { + class: classes[0].clone(), + label: labels.0, + }, + ClassLabel { + class: classes[1].clone(), + label: labels.1, + }, + ], + target_array, + )) } /// Conditionally split the feature vector `w` into parameter vector and @@ -394,8 +380,8 @@ fn convert_params(n_features: usize, 
w: &Array1) -> (Array1, F) } /// The logistic function -fn logistic(x: F) -> F { - F::one() / (F::one() + num_traits::Float::exp(-x)) +fn logistic(x: F) -> F { + F::one() / (F::one() + (-x).exp()) } /// A numerically stable version of the log of the logistic function. @@ -405,11 +391,11 @@ fn logistic(x: F) -> F { /// /// See the blog post describing this implementation: /// http://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression/ -fn log_logistic(x: F) -> F { +fn log_logistic(x: F) -> F { if x > F::zero() { - -num_traits::Float::ln(F::one() + num_traits::Float::exp(-x)) + -(F::one() + (-x).exp()).ln() } else { - x - num_traits::Float::ln(F::one() + num_traits::Float::exp(x)) + x - (F::one() + x.exp()).ln() } } @@ -572,13 +558,13 @@ impl<'a, F: Float, A: Data> ArgminOp for LogisticRegressionProblem<'a, type Float = F; /// Apply the cost function to a parameter `p` - fn apply(&self, p: &Self::Param) -> Result { + fn apply(&self, p: &Self::Param) -> std::result::Result { let w = p.as_array(); Ok(logistic_loss(self.x, &self.target, self.alpha, w)) } /// Compute the gradient at parameter `p`. - fn gradient(&self, p: &Self::Param) -> Result { + fn gradient(&self, p: &Self::Param) -> std::result::Result { let w = p.as_array(); Ok(ArgminParam(logistic_grad( self.x, @@ -773,7 +759,10 @@ mod test { let x = array![[0.01], [1.0], [-1.0], [-0.01]]; let y = array![[0, 0], [0, 0], [0, 0], [0, 0]]; let res = log_reg.fit(&x, &y); - assert_eq!(res, Err("Expected single target dataset".to_string())); + assert_eq!( + res.unwrap_err().to_string(), + "multiple targets not supported".to_string() + ); } #[test] @@ -783,11 +772,8 @@ mod test { let y = array![0.0, 0.0, 1.0, 1.0]; let res = log_reg.fit(&x, &y); assert_eq!( - res, - Err( - "Incompatible shapes of data, expected `x` and `y` to have same number of rows" - .to_string() - ) + res.unwrap_err().to_string(), + "Expected `x` and `y` to have same number of rows, got 3 != 4".to_string() ); } @@ -798,15 +784,15 @@ mod test { let log_reg = LogisticRegression::default(); let normal_x = array![[-1.0], [1.0]]; let y = array![0.0, 1.0]; - let expected = Err("Values must be finite and not `Inf`, `-Inf` or `NaN`".to_string()); + let expected = "Values must be finite and not `Inf`, `-Inf` or `NaN`".to_string(); for inf_x in &inf_xs { let res = log_reg.fit(inf_x, &y); - assert_eq!(res, expected); + assert_eq!(res.unwrap_err().to_string(), expected); } for inf in &infs { let log_reg = LogisticRegression::default().alpha(*inf); let res = log_reg.fit(&normal_x, &y); - assert_eq!(res, expected); + assert_eq!(res.unwrap_err().to_string(), expected); } let mut non_positives = infs.clone(); non_positives.push(-1.0); @@ -815,8 +801,8 @@ mod test { let log_reg = LogisticRegression::default().gradient_tolerance(*inf); let res = log_reg.fit(&normal_x, &y); assert_eq!( - res, - Err("gradient_tolerance must be a positive, finite number".to_string()) + res.unwrap_err().to_string(), + "gradient_tolerance must be a positive, finite number" ); } } @@ -826,21 +812,21 @@ mod test { let infs = vec![std::f64::INFINITY, std::f64::NEG_INFINITY, std::f64::NAN]; let normal_x = array![[-1.0], [1.0]]; let normal_y = array![0.0, 1.0]; - let expected = Err("Initial parameter guess must be finite".to_string()); + let expected = "Initial parameter guess must be finite".to_string(); for inf in &infs { let log_reg = LogisticRegression::default().initial_params(array![*inf], 0.0); let res = log_reg.fit(&normal_x, &normal_y); - assert_eq!(res, expected); + 
assert_eq!(res.unwrap_err().to_string(), expected); } for inf in &infs { let log_reg = LogisticRegression::default().initial_params(array![0.0], *inf); let res = log_reg.fit(&normal_x, &normal_y); - assert_eq!(res, expected); + assert_eq!(res.unwrap_err().to_string(), expected); } { let log_reg = LogisticRegression::default().initial_params(array![0.0, 0.0], 0.0); let res = log_reg.fit(&normal_x, &normal_y); - assert_eq!(res, Err("Size of initial parameter guess must be the same as the number of columns in the feature matrix `x`".to_string())); + assert_eq!(res.unwrap_err().to_string(), "Size of initial parameter guess must be the same as the number of columns in the feature matrix `x`".to_string()); } } diff --git a/algorithms/linfa-pls/Cargo.toml b/algorithms/linfa-pls/Cargo.toml index af8e9cec8..1221010c1 100644 --- a/algorithms/linfa-pls/Cargo.toml +++ b/algorithms/linfa-pls/Cargo.toml @@ -31,8 +31,8 @@ ndarray-rand = "0.13" rand_isaac = "0.3" num-traits = "0.2" paste = "1.0" - -linfa = { version = "0.3.1", path = "../.." } +thiserror = "1" +linfa = { version = "0.3.1", path = "../..", features = ["ndarray-linalg"] } [dev-dependencies] linfa-datasets = { version = "0.3.1", path = "../../datasets", features = ["linnerud"] } diff --git a/algorithms/linfa-pls/src/errors.rs b/algorithms/linfa-pls/src/errors.rs index a39c568bb..6986050bf 100644 --- a/algorithms/linfa-pls/src/errors.rs +++ b/algorithms/linfa-pls/src/errors.rs @@ -1,36 +1,19 @@ use ndarray_linalg::error::LinalgError; -use std::error::Error; -use std::fmt::{self, Display}; - +use thiserror::Error; pub type Result = std::result::Result; -#[derive(Debug)] +#[derive(Error, Debug)] pub enum PlsError { + #[error("Not enough samples: {0}")] NotEnoughSamplesError(String), + #[error("Bad component number: {0}")] BadComponentNumberError(String), + #[error("Power method not converged: {0}")] PowerMethodNotConvergedError(String), - LinalgError(String), -} - -impl Display for PlsError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::NotEnoughSamplesError(message) => write!(f, "Not enough samples: {}", message), - Self::BadComponentNumberError(message) => { - write!(f, "Bad component number: {}", message) - } - Self::PowerMethodNotConvergedError(message) => { - write!(f, "Power method not converged: {}", message) - } - Self::LinalgError(message) => write!(f, "Linear algebra error: {}", message), - } - } -} - -impl Error for PlsError {} - -impl From for PlsError { - fn from(error: LinalgError) -> PlsError { - PlsError::LinalgError(error.to_string()) - } + #[error(transparent)] + LinalgError(#[from] LinalgError), + #[error(transparent)] + LinfaError(#[from] linfa::error::Error), + #[error(transparent)] + MinMaxError(#[from] ndarray_stats::errors::MinMaxError), } diff --git a/algorithms/linfa-pls/src/lib.rs b/algorithms/linfa-pls/src/lib.rs index 9e2ed11d5..93dfa8df4 100644 --- a/algorithms/linfa-pls/src/lib.rs +++ b/algorithms/linfa-pls/src/lib.rs @@ -130,14 +130,14 @@ macro_rules! 
pls_algo { ($name:ident) => { } } - impl> Fit<'_, ArrayBase, ArrayBase> + impl> Fit, ArrayBase, PlsError> for [] { - type Object = Result<[]>; + type Object = []; fn fit( &self, dataset: &DatasetBase, ArrayBase>, - ) -> Result<[]> { + ) -> Result { let pls = self.0.fit(dataset)?; Ok([](pls)) } diff --git a/algorithms/linfa-pls/src/pls_generic.rs b/algorithms/linfa-pls/src/pls_generic.rs index 177aa1746..a3f33d526 100644 --- a/algorithms/linfa-pls/src/pls_generic.rs +++ b/algorithms/linfa-pls/src/pls_generic.rs @@ -1,8 +1,12 @@ use crate::errors::{PlsError, Result}; -use crate::{utils, Float}; +use crate::utils; use linfa::{ - dataset::Records, traits::Fit, traits::PredictRef, traits::Transformer, Dataset, DatasetBase, + dataset::{Records, WithLapack, WithoutLapack}, + traits::Fit, + traits::PredictRef, + traits::Transformer, + Dataset, DatasetBase, Float, }; use ndarray::{Array1, Array2, ArrayBase, Data, Ix2}; use ndarray_linalg::svd::*; @@ -197,10 +201,15 @@ impl PlsParams { } } -impl> Fit<'_, ArrayBase, ArrayBase> for PlsParams { - type Object = Result>; +impl> Fit, ArrayBase, PlsError> + for PlsParams +{ + type Object = Pls; - fn fit(&self, dataset: &DatasetBase, ArrayBase>) -> Result> { + fn fit( + &self, + dataset: &DatasetBase, ArrayBase>, + ) -> Result { let records = dataset.records(); let targets = dataset.targets(); @@ -259,9 +268,7 @@ impl> Fit<'_, ArrayBase, ArrayBase> Algorithm::Nipals => { // Replace columns that are all close to zero with zeros for mut yj in yk.gencolumns_mut() { - if *(yj.mapv(|y| num_traits::float::Float::abs(y)).max().unwrap()) - < F::cast(10.) * eps - { + if *(yj.mapv(|y| y.abs()).max()?) < F::cast(10.) * eps { yj.assign(&Array1::zeros(yj.len())); } } @@ -278,7 +285,7 @@ impl> Fit<'_, ArrayBase, ArrayBase> // compute scores, i.e. the projections of x and Y let x_scores_k = xk.dot(&x_weights_k); let y_ss = if norm_y_weights { - F::cast(1.) + F::one() } else { y_weights_k.dot(&y_weights_k) }; @@ -316,8 +323,8 @@ impl> Fit<'_, ArrayBase, ArrayBase> // Similiarly, Y was approximated as Omega . Delta.T + Y_(R+1) // Compute transformation matrices (rotations_). See User Guide. - let x_rotations = x_weights.dot(&utils::pinv2(&x_loadings.t().dot(&x_weights), None)); - let y_rotations = y_weights.dot(&utils::pinv2(&y_loadings.t().dot(&y_weights), None)); + let x_rotations = x_weights.dot(&utils::pinv2(x_loadings.t().dot(&x_weights).view(), None)); + let y_rotations = y_weights.dot(&utils::pinv2(y_loadings.t().dot(&y_weights).view(), None)); let mut coefficients = x_rotations.dot(&y_loadings.t()); coefficients *= &y_std; @@ -354,7 +361,7 @@ impl PlsParams { let mut y_score = Array1::ones(y.ncols()); for col in y.t().genrows() { - if *col.mapv(|v| num_traits::Float::abs(v)).max().unwrap() > eps { + if *col.mapv(|v| v.abs()).max().unwrap() > eps { y_score = col.to_owned(); break; } @@ -363,8 +370,8 @@ impl PlsParams { let mut x_pinv = None; let mut y_pinv = None; if self.mode == Mode::B { - x_pinv = Some(utils::pinv2(&x, Some(F::cast(10.) * eps))); - y_pinv = Some(utils::pinv2(&y, Some(F::cast(10.) * eps))); + x_pinv = Some(utils::pinv2(x.view(), Some(F::cast(10.) * eps))); + y_pinv = Some(utils::pinv2(y.view(), Some(F::cast(10.) 
* eps))); } // init to big value for first convergence check @@ -379,7 +386,7 @@ impl PlsParams { Mode::A => x.t().dot(&y_score) / y_score.dot(&y_score), Mode::B => x_pinv.to_owned().unwrap().dot(&y_score), }; - x_weights /= num_traits::Float::sqrt(x_weights.dot(&x_weights)) + eps; + x_weights /= x_weights.dot(&x_weights).sqrt() + eps; let x_score = x.dot(&x_weights); y_weights = match self.mode { @@ -388,7 +395,7 @@ impl PlsParams { }; if norm_y_weights { - y_weights /= num_traits::Float::sqrt(y_weights.dot(&y_weights)) + eps + y_weights /= y_weights.dot(&y_weights).sqrt() + eps } let ya = y.dot(&y_weights); @@ -420,9 +427,13 @@ impl PlsParams { y: &ArrayBase, Ix2>, ) -> Result<(Array1, Array1)> { let c = x.t().dot(y); + + let c = c.with_lapack(); let (u, _, vt) = c.svd(true, true)?; - let u = u.unwrap().column(0).to_owned(); - let vt = vt.unwrap().row(0).to_owned(); + // safe unwrap because both parameters are set to true in above call + let u = u.unwrap().column(0).to_owned().without_lapack(); + let vt = vt.unwrap().row(0).to_owned().without_lapack(); + Ok((u, vt)) } } diff --git a/algorithms/linfa-pls/src/pls_svd.rs b/algorithms/linfa-pls/src/pls_svd.rs index a314456ea..05913f179 100644 --- a/algorithms/linfa-pls/src/pls_svd.rs +++ b/algorithms/linfa-pls/src/pls_svd.rs @@ -36,13 +36,15 @@ impl Default for PlsSvdParams { } #[allow(clippy::many_single_char_names)] -impl> Fit<'_, ArrayBase, ArrayBase> for PlsSvdParams { - type Object = Result>; +impl> Fit, ArrayBase, PlsError> + for PlsSvdParams +{ + type Object = PlsSvd; fn fit( &self, dataset: &DatasetBase, ArrayBase>, - ) -> Result> { + ) -> Result { if dataset.nsamples() < 2 { return Err(PlsError::NotEnoughSamplesError(format!( "should be greater than 1, got {}", @@ -68,10 +70,11 @@ impl> Fit<'_, ArrayBase, ArrayBase> // Compute SVD of cross-covariance matrix let c = x.t().dot(&y); - let (u, _, vt) = c.svd(true, true).unwrap(); - let u = u.unwrap().slice(s![.., ..self.n_components]).to_owned(); - let vt = vt.unwrap().slice(s![..self.n_components, ..]).to_owned(); - let (u, vt) = utils::svd_flip(&u, &vt); + let (u, _, vt) = c.svd(true, true)?; + // safe unwraps because both parameters are set to true in above call + let u = u.unwrap().slice_move(s![.., ..self.n_components]); + let vt = vt.unwrap().slice_move(s![..self.n_components, ..]); + let (u, vt) = utils::svd_flip(u, vt); let v = vt.reversed_axes(); let x_weights = u; diff --git a/algorithms/linfa-pls/src/utils.rs b/algorithms/linfa-pls/src/utils.rs index 71331b5fa..1b780e050 100644 --- a/algorithms/linfa-pls/src/utils.rs +++ b/algorithms/linfa-pls/src/utils.rs @@ -1,6 +1,10 @@ -use linfa::{DatasetBase, Float}; -use ndarray::{s, Array1, Array2, ArrayBase, Axis, Data, DataMut, Ix1, Ix2, Zip}; +use linfa::{ + dataset::{WithLapack, WithoutLapack}, + DatasetBase, Float, +}; +use ndarray::{s, Array1, Array2, ArrayBase, ArrayView2, Axis, Data, DataMut, Ix1, Ix2, Zip}; use ndarray_linalg::svd::*; +use ndarray_linalg::Scalar; use ndarray_stats::QuantileExt; pub fn outer( @@ -15,10 +19,8 @@ pub fn outer( } /// Calculates the pseudo inverse of a matrix -pub fn pinv2>( - x: &ArrayBase, - cond: Option, -) -> Array2 { +pub fn pinv2(x: ArrayView2, cond: Option) -> Array2 { + let x = x.with_lapack(); let (opt_u, s, opt_vh) = x.svd(true, true).unwrap(); let u = opt_u.unwrap(); let vh = opt_vh.unwrap(); @@ -33,12 +35,14 @@ pub fn pinv2>( acc }); - let mut ucut = u.slice(s![.., ..rank]).to_owned(); - ucut /= &s.slice(s![..rank]).mapv(|v| F::cast(v)); - ucut.dot(&vh.slice(s![..rank, ..])) - 
.mapv(|v| v.conj()) + let mut ucut = u.slice_move(s![.., ..rank]); + ucut /= &s.slice(s![..rank]).mapv(|v| F::Lapack::cast(v)); + + vh.slice(s![..rank, ..]) .t() - .to_owned() + .dot(&ucut.t()) + .mapv(|v| v.conj()) + .without_lapack() } #[allow(clippy::type_complexity)] @@ -85,8 +89,8 @@ pub fn svd_flip_1d( } pub fn svd_flip( - u: &ArrayBase, Ix2>, - v: &ArrayBase, Ix2>, + u: ArrayBase, Ix2>, + v: ArrayBase, Ix2>, ) -> (Array2, Array2) { // columns of u, rows of v let abs_u = u.mapv(|v| v.abs()); @@ -97,7 +101,7 @@ pub fn svd_flip( .and(&max_abs_val_indices) .and(&range) .apply(|s, &i, &j| *s = u[[i, j]].signum()); - (u * &signs, v * &signs.insert_axis(Axis(1))) + (&u * &signs, &v * &signs.insert_axis(Axis(1))) } #[cfg(test)] @@ -117,7 +121,7 @@ mod tests { #[test] fn test_pinv2() { let a = array![[1., 2., 3.], [4., 5., 6.], [7., 8., 10.]]; - let a_pinv2 = pinv2(&a, None); + let a_pinv2 = pinv2(a.view(), None); assert_abs_diff_eq!(a.dot(&a_pinv2), Array2::eye(3), epsilon = 1e-6) } } diff --git a/algorithms/linfa-preprocessing/examples/count_vectorization.rs b/algorithms/linfa-preprocessing/examples/count_vectorization.rs index 749262e9b..1164b466b 100644 --- a/algorithms/linfa-preprocessing/examples/count_vectorization.rs +++ b/algorithms/linfa-preprocessing/examples/count_vectorization.rs @@ -4,9 +4,10 @@ use encoding::DecoderTrap::Strict; use flate2::read::GzDecoder; use linfa::metrics::ToConfusionMatrix; use linfa::traits::{Fit, Predict}; +use linfa::Dataset; use linfa_bayes::GaussianNbParams; use linfa_preprocessing::count_vectorization::CountVectorizer; -use ndarray::Array1; +use ndarray::Array2; use std::collections::HashSet; use std::path::Path; use tar::Archive; @@ -34,7 +35,7 @@ fn download_20news_bydate() { fn load_set( path: &'static str, desired_targets: &[&str], -) -> Result<(Vec, Array1, usize), std::io::Error> { +) -> Result<(Vec, Array2, usize), std::io::Error> { let mut file_paths = Vec::new(); let mut targets = Vec::new(); let desired_targets: HashSet = desired_targets.iter().map(|s| s.to_string()).collect(); @@ -59,19 +60,19 @@ fn load_set( ntargets = ntargets + 1; } } - let targets = Array1::from_shape_vec(targets.len(), targets).unwrap(); + let targets = Array2::from_shape_vec((targets.len(), 1), targets).unwrap(); Ok((file_paths, targets, ntargets)) } fn load_train_set( desired_targets: &[&str], -) -> Result<(Vec, Array1, usize), std::io::Error> { +) -> Result<(Vec, Array2, usize), std::io::Error> { load_set("./20news/20news-bydate-train", desired_targets) } fn load_test_set( desired_targets: &[&str], -) -> Result<(Vec, Array1, usize), std::io::Error> { +) -> Result<(Vec, Array2, usize), std::io::Error> { load_set("./20news/20news-bydate-test", desired_targets) } @@ -165,9 +166,9 @@ fn main() { .transform_files(&test_filenames, ISO_8859_1, Strict) .to_dense(); let test_records = test_records.mapv(|c| c as f32); - let test_dataset = (test_records, test_targets).into(); + let test_dataset: Dataset = (test_records, test_targets).into(); // Let's predict the test data targets - let test_prediction: Array1 = model.predict(&test_dataset); + let test_prediction = model.predict(&test_dataset); let cm = test_prediction.confusion_matrix(&test_dataset).unwrap(); // 0.9523 let accuracy = cm.f1_score(); diff --git a/algorithms/linfa-preprocessing/examples/tfidf_vectorization.rs b/algorithms/linfa-preprocessing/examples/tfidf_vectorization.rs index 5ca00a92d..18ae10c8b 100644 --- a/algorithms/linfa-preprocessing/examples/tfidf_vectorization.rs +++ 
b/algorithms/linfa-preprocessing/examples/tfidf_vectorization.rs @@ -4,9 +4,10 @@ use encoding::DecoderTrap::Strict; use flate2::read::GzDecoder; use linfa::metrics::ToConfusionMatrix; use linfa::traits::{Fit, Predict}; +use linfa::Dataset; use linfa_bayes::GaussianNbParams; use linfa_preprocessing::tf_idf_vectorization::TfIdfVectorizer; -use ndarray::Array1; +use ndarray::Array2; use std::collections::HashSet; use std::path::Path; use tar::Archive; @@ -34,7 +35,7 @@ fn download_20news_bydate() { fn load_set( path: &'static str, desired_targets: &[&str], -) -> Result<(Vec, Array1, usize), std::io::Error> { +) -> Result<(Vec, Array2, usize), std::io::Error> { let mut file_paths = Vec::new(); let mut targets = Vec::new(); let desired_targets: HashSet = desired_targets.iter().map(|s| s.to_string()).collect(); @@ -59,19 +60,19 @@ fn load_set( ntargets = ntargets + 1; } } - let targets = Array1::from_shape_vec(targets.len(), targets).unwrap(); + let targets = Array2::from_shape_vec((targets.len(), 1), targets).unwrap(); Ok((file_paths, targets, ntargets)) } fn load_train_set( desired_targets: &[&str], -) -> Result<(Vec, Array1, usize), std::io::Error> { +) -> Result<(Vec, Array2, usize), std::io::Error> { load_set("./20news/20news-bydate-train", desired_targets) } fn load_test_set( desired_targets: &[&str], -) -> Result<(Vec, Array1, usize), std::io::Error> { +) -> Result<(Vec, Array2, usize), std::io::Error> { load_set("./20news/20news-bydate-test", desired_targets) } @@ -162,9 +163,9 @@ fn main() { let test_records = vectorizer .transform_files(&test_filenames, ISO_8859_1, Strict) .to_dense(); - let test_dataset = (test_records, test_targets).into(); + let test_dataset: Dataset = (test_records, test_targets).into(); // Let's predict the test data targets - let test_prediction: Array1 = model.predict(&test_dataset); + let test_prediction = model.predict(&test_dataset); let cm = test_prediction.confusion_matrix(&test_dataset).unwrap(); // 0.8402 let accuracy = cm.f1_score(); diff --git a/algorithms/linfa-preprocessing/src/count_vectorization.rs b/algorithms/linfa-preprocessing/src/count_vectorization.rs index 7228ef7c9..5dee08093 100644 --- a/algorithms/linfa-preprocessing/src/count_vectorization.rs +++ b/algorithms/linfa-preprocessing/src/count_vectorization.rs @@ -165,9 +165,11 @@ impl CountVectorizer { let mut document_bytes = Vec::new(); file.read_to_end(&mut document_bytes)?; let document = encoding::decode(&document_bytes, trap, encoding).0; + // encoding error contains a cow string, can't just use ?, must go through the unwrap if document.is_err() { return Err(crate::error::Error::EncodingError(document.err().unwrap())); } + // safe unwrap now that error has been handled let document = transform_string(document.unwrap(), &self.properties); self.read_document_into_vocabulary(document, ®ex, &mut vocabulary); } diff --git a/algorithms/linfa-preprocessing/src/error.rs b/algorithms/linfa-preprocessing/src/error.rs index 87a4cbb76..e6cc85bec 100644 --- a/algorithms/linfa-preprocessing/src/error.rs +++ b/algorithms/linfa-preprocessing/src/error.rs @@ -32,4 +32,6 @@ pub enum Error { LinalgError(#[from] ndarray_linalg::error::LinalgError), #[error(transparent)] NdarrayStatsEmptyError(#[from] ndarray_stats::errors::EmptyInput), + #[error(transparent)] + LinfaError(#[from] linfa::error::Error), } diff --git a/algorithms/linfa-preprocessing/src/linear_scaling.rs b/algorithms/linfa-preprocessing/src/linear_scaling.rs index 43497c498..17d435c85 100644 --- 
a/algorithms/linfa-preprocessing/src/linear_scaling.rs +++ b/algorithms/linfa-preprocessing/src/linear_scaling.rs @@ -113,14 +113,14 @@ impl LinearScaler { } } -impl<'a, F: Float, D: Data, T: AsTargets> Fit<'a, ArrayBase, T> +impl, T: AsTargets> Fit, T, Error> for LinearScaler { - type Object = Result>; + type Object = FittedLinearScaler; /// Fits the input dataset accordng to the scaler [method](enum.ScalingMethod.html). Will return an error /// if the dataset does not contain any samples or (in the case of MinMax scaling) if the specified range is not valid. - fn fit(&self, x: &DatasetBase, T>) -> Self::Object { + fn fit(&self, x: &DatasetBase, T>) -> Result { match &self.method { ScalingMethod::Standard(with_mean, with_std) => { FittedLinearScaler::standard(x.records(), *with_mean, *with_std) @@ -149,6 +149,7 @@ impl FittedLinearScaler { if records.dim().0 == 0 { return Err(Error::NotEnoughSamples); } + // safe unwrap because of above zero records check let means = records.mean_axis(Axis(0)).unwrap(); let std_devs = if with_std { records.std_axis(Axis(0), F::zero()).mapv(|s| { diff --git a/algorithms/linfa-preprocessing/src/whitening.rs b/algorithms/linfa-preprocessing/src/whitening.rs index d7b1c8b89..6c607d21b 100644 --- a/algorithms/linfa-preprocessing/src/whitening.rs +++ b/algorithms/linfa-preprocessing/src/whitening.rs @@ -55,13 +55,14 @@ impl Whitener { } } -impl<'a, F: Float, D: Data, T: AsTargets> Fit<'a, ArrayBase, T> for Whitener { - type Object = Result>; +impl, T: AsTargets> Fit, T, Error> for Whitener { + type Object = FittedWhitener; - fn fit(&self, x: &DatasetBase, T>) -> Self::Object { + fn fit(&self, x: &DatasetBase, T>) -> Result { if x.nsamples() == 0 { return Err(Error::NotEnoughSamples); } + // safe because of above zero samples check let mean = x.records().mean_axis(Axis(0)).unwrap(); let sigma = x.records() - &mean; diff --git a/algorithms/linfa-reduction/Cargo.toml b/algorithms/linfa-reduction/Cargo.toml index 7a6e8491b..16e8f0a0d 100644 --- a/algorithms/linfa-reduction/Cargo.toml +++ b/algorithms/linfa-reduction/Cargo.toml @@ -29,6 +29,7 @@ ndarray = { version = "0.14", default-features = false, features = ["approx"] } ndarray-linalg = "0.13" ndarray-rand = "0.13" num-traits = "0.2" +thiserror = "1" linfa = { version = "0.3.1", path = "../..", features = ["ndarray-linalg"] } linfa-kernel = { version = "0.3.1", path = "../linfa-kernel" } diff --git a/algorithms/linfa-reduction/examples/pca.rs b/algorithms/linfa-reduction/examples/pca.rs index 873b038cf..54f6cb8b1 100644 --- a/algorithms/linfa-reduction/examples/pca.rs +++ b/algorithms/linfa-reduction/examples/pca.rs @@ -16,7 +16,7 @@ fn main() { let n = 10; let dataset = Dataset::from(generate_blobs(n, &expected_centroids, &mut rng)); - let embedding: Pca = Pca::params(1).fit(&dataset); + let embedding: Pca = Pca::params(1).fit(&dataset).unwrap(); let embedding = embedding.predict(&dataset); dbg!(&embedding); diff --git a/algorithms/linfa-reduction/src/error.rs b/algorithms/linfa-reduction/src/error.rs new file mode 100644 index 000000000..bf3279ca5 --- /dev/null +++ b/algorithms/linfa-reduction/src/error.rs @@ -0,0 +1,12 @@ +use thiserror::Error; +pub type Result = std::result::Result; + +#[derive(Error, Debug)] +pub enum Error { + #[error("At least 1 sample needed")] + NotEnoughSamples, + #[error(transparent)] + LinalgError(#[from] ndarray_linalg::error::LinalgError), + #[error(transparent)] + LinfaError(#[from] linfa::error::Error), +} diff --git a/algorithms/linfa-reduction/src/lib.rs 
b/algorithms/linfa-reduction/src/lib.rs index d8fb94c82..d799abe28 100644 --- a/algorithms/linfa-reduction/src/lib.rs +++ b/algorithms/linfa-reduction/src/lib.rs @@ -12,6 +12,7 @@ extern crate ndarray; pub mod diffusion_map; +pub mod error; pub mod pca; pub mod utils; diff --git a/algorithms/linfa-reduction/src/pca.rs b/algorithms/linfa-reduction/src/pca.rs index bcb1fe3c4..4f9033af3 100644 --- a/algorithms/linfa-reduction/src/pca.rs +++ b/algorithms/linfa-reduction/src/pca.rs @@ -15,12 +15,13 @@ //! //! // apply PCA projection along a line which maximizes the spread of the data //! let embedding = Pca::params(1) -//! .fit(&dataset); +//! .fit(&dataset).unwrap(); //! //! // reduce dimensionality of the dataset //! let dataset = embedding.predict(dataset); //! ``` //! +use crate::error::{Error, Result}; use ndarray::{Array1, Array2, ArrayBase, Axis, Data, Ix2}; use ndarray_linalg::{TruncatedOrder, TruncatedSvd}; #[cfg(feature = "serde")] @@ -67,19 +68,22 @@ impl PcaParams { /// # Returns /// /// A fitted PCA model with origin and hyperplane -impl<'a, T, D: Data> Fit<'a, ArrayBase, T> for PcaParams { +impl> Fit, T, Error> for PcaParams { type Object = Pca; - fn fit(&self, dataset: &DatasetBase, T>) -> Pca { + fn fit(&self, dataset: &DatasetBase, T>) -> Result> { + if dataset.nsamples() == 0 { + return Err(Error::NotEnoughSamples); + } let x = dataset.records(); // calculate mean of data and subtract it + // safe because of above 0 samples check let mean = x.mean_axis(Axis(0)).unwrap(); let x = x - &mean; // estimate Singular Value Decomposition - let result = TruncatedSvd::new(x, TruncatedOrder::Largest) - .decompose(self.embedding_size) - .unwrap(); + let result = + TruncatedSvd::new(x, TruncatedOrder::Largest).decompose(self.embedding_size)?; // explained variance is the spectral distribution of the eigenvalues let (_, sigma, mut v_t) = result.values_vectors(); @@ -96,11 +100,11 @@ impl<'a, T, D: Data> Fit<'a, ArrayBase, T> for PcaParams { } } - Pca { + Ok(Pca { embedding: v_t, sigma, mean, - } + }) } } @@ -118,7 +122,7 @@ impl<'a, T, D: Data> Fit<'a, ArrayBase, T> for PcaParams { /// /// // apply PCA projection along a line which maximizes the spread of the data /// let embedding = Pca::params(1) -/// .fit(&dataset); +/// .fit(&dataset).unwrap(); /// /// // reduce dimensionality of the dataset /// let dataset = embedding.predict(dataset); @@ -216,7 +220,7 @@ mod tests { let dataset = Dataset::from(tmp.dot(&q)); - let model = Pca::params(2).whiten(true).fit(&dataset); + let model = Pca::params(2).whiten(true).fit(&dataset).unwrap(); let proj = model.predict(&dataset); // check that the covariance is unit diagonal @@ -237,7 +241,7 @@ mod tests { let data = Array2::random_using((300, 50), Uniform::new(-1.0f64, 1.), &mut rng); let dataset = Dataset::from(data); - let model = Pca::params(10).whiten(true).fit(&dataset); + let model = Pca::params(10).whiten(true).fit(&dataset).unwrap(); let proj = model.predict(&dataset); // check that the covariance is unit diagonal @@ -262,7 +266,7 @@ mod tests { let data = Array2::random_using((1000, 500), StandardNormal, &mut rng); let dataset = Dataset::from(data / 1000f64.sqrt()); - let model = Pca::params(500).fit(&dataset); + let model = Pca::params(500).fit(&dataset).unwrap(); let sv = model.singular_values().mapv(|x| x * x); // we have created a random spectrum and can apply the Marchenko-Pastur law @@ -319,7 +323,7 @@ mod tests { let dataset = Dataset::from(data); // fit PCA with 10 possible embeddings - let model = Pca::params(10).fit(&dataset); + 
let model = Pca::params(10).fit(&dataset).unwrap(); // only two eigenvalues are relevant assert_eq!(model.explained_variance_ratio().len(), 2); @@ -334,7 +338,7 @@ mod tests { #[test] fn test_explained_variance_diag() { let dataset = Dataset::from(Array2::from_diag(&array![1., 1., 1., 1.])); - let model = Pca::params(3).fit(&dataset); + let model = Pca::params(3).fit(&dataset).unwrap(); assert_abs_diff_eq!( model.explained_variance_ratio(), diff --git a/algorithms/linfa-svm/src/classification.rs b/algorithms/linfa-svm/src/classification.rs index 449dd3eed..8d3b0ba26 100644 --- a/algorithms/linfa-svm/src/classification.rs +++ b/algorithms/linfa-svm/src/classification.rs @@ -8,7 +8,7 @@ use linfa::{ use ndarray::{Array1, Array2, ArrayBase, ArrayView2, Data, Ix1, Ix2}; use std::cmp::Ordering; -use super::error::Result; +use super::error::{Result, SvmResult}; use super::permutable_kernel::{PermutableKernel, PermutableKernelOneClass}; use super::solver_smo::SolverState; use super::SolverParams; @@ -207,10 +207,10 @@ pub fn fit_one_class( /// probabilities for whether a sample belongs to the first or second class. macro_rules! impl_classification { ($records:ty, $targets:ty) => { - impl<'a, F: Float> Fit<'a, $records, $targets> for SvmParams { - type Object = Result>; + impl Fit<$records, $targets, SvmResult> for SvmParams { + type Object = Svm; - fn fit(&self, dataset: &DatasetBase<$records, $targets>) -> Self::Object { + fn fit(&self, dataset: &DatasetBase<$records, $targets>) -> Result { let kernel = self.kernel.transform(dataset.records()); let target = dataset.try_single_target()?; let target = target.as_slice().unwrap(); @@ -238,10 +238,10 @@ macro_rules! impl_classification { } } - impl<'a, F: Float> Fit<'a, $records, $targets> for SvmParams { - type Object = Result>; + impl Fit<$records, $targets, SvmResult> for SvmParams { + type Object = Svm; - fn fit(&self, dataset: &DatasetBase<$records, $targets>) -> Self::Object { + fn fit(&self, dataset: &DatasetBase<$records, $targets>) -> Result { let kernel = self.kernel.transform(dataset.records()); let target = dataset.try_single_target()?; let target = target.as_slice().unwrap(); @@ -272,10 +272,10 @@ macro_rules! impl_classification { } impl_classification!(Array2, Array2); -impl_classification!(ArrayView2<'a, F>, ArrayView2<'a, bool>); +impl_classification!(ArrayView2<'_, F>, ArrayView2<'_, bool>); impl_classification!(Array2, CountedTargets>); -impl_classification!(ArrayView2<'a, F>, CountedTargets>); -impl_classification!(ArrayView2<'a, F>, CountedTargets>); +impl_classification!(ArrayView2<'_, F>, CountedTargets>); +impl_classification!(ArrayView2<'_, F>, CountedTargets>); /// Fit one-class problem /// @@ -283,10 +283,10 @@ impl_classification!(ArrayView2<'a, F>, CountedTargets { - impl<'a, F: Float> Fit<'a, $records, $targets> for SvmParams { - type Object = Result>; + impl Fit<$records, $targets, SvmResult> for SvmParams { + type Object = Svm; - fn fit(&self, dataset: &DatasetBase<$records, $targets>) -> Self::Object { + fn fit(&self, dataset: &DatasetBase<$records, $targets>) -> Result { let kernel = self.kernel.transform(dataset.records()); let records = dataset.records().view(); @@ -302,9 +302,9 @@ macro_rules! 
impl_oneclass { } impl_oneclass!(Array2, Array2<()>); -impl_oneclass!(ArrayView2<'a, F>, ArrayView2<'a, ()>); +impl_oneclass!(ArrayView2<'_, F>, ArrayView2<'_, ()>); impl_oneclass!(Array2, CountedTargets<(), Array2<()>>); -impl_oneclass!(Array2, CountedTargets<(), ArrayView2<'a, ()>>); +impl_oneclass!(Array2, CountedTargets<(), ArrayView2<'_, ()>>); /// Predict a probability with a feature vector impl> Predict, Pr> for Svm { diff --git a/algorithms/linfa-svm/src/regression.rs b/algorithms/linfa-svm/src/regression.rs index 1c2f222cd..bd4497d21 100644 --- a/algorithms/linfa-svm/src/regression.rs +++ b/algorithms/linfa-svm/src/regression.rs @@ -8,7 +8,7 @@ use linfa::{ use linfa_kernel::Kernel; use ndarray::{Array1, Array2, ArrayBase, ArrayView1, ArrayView2, Data, Ix2}; -use super::error::Result; +use super::error::{Result, SvmResult}; use super::permutable_kernel::PermutableKernelRegression; use super::solver_smo::SolverState; use super::SolverParams; @@ -119,10 +119,10 @@ pub fn fit_nu( /// Take a number of observations and project them to optimal continuous targets. macro_rules! impl_regression { ($records:ty, $targets:ty, $f:ty) => { - impl<'a> Fit<'a, $records, $targets> for SvmParams<$f, $f> { - type Object = Result>; + impl Fit<$records, $targets, SvmResult> for SvmParams<$f, $f> { + type Object = Svm<$f, $f>; - fn fit(&self, dataset: &DatasetBase<$records, $targets>) -> Self::Object { + fn fit(&self, dataset: &DatasetBase<$records, $targets>) -> Result { let kernel = self.kernel.transform(dataset.records()); let target = dataset.try_single_target()?; let target = target.as_slice().unwrap(); @@ -155,12 +155,12 @@ macro_rules! impl_regression { impl_regression!(Array2, Array2, f32); impl_regression!(Array2, Array2, f64); -impl_regression!(ArrayView2<'a, f32>, ArrayView2<'a, f32>, f32); -impl_regression!(ArrayView2<'a, f64>, ArrayView2<'a, f64>, f64); +impl_regression!(ArrayView2<'_, f32>, ArrayView2<'_, f32>, f32); +impl_regression!(ArrayView2<'_, f64>, ArrayView2<'_, f64>, f64); impl_regression!(Array2, Array1, f32); impl_regression!(Array2, Array1, f64); -impl_regression!(ArrayView2<'a, f32>, ArrayView1<'a, f32>, f32); -impl_regression!(ArrayView2<'a, f64>, ArrayView1<'a, f64>, f64); +impl_regression!(ArrayView2<'_, f32>, ArrayView1<'_, f32>, f32); +impl_regression!(ArrayView2<'_, f64>, ArrayView1<'_, f64>, f64); macro_rules! impl_predict { ( $($t:ty),* ) => { diff --git a/algorithms/linfa-trees/src/decision_trees/algorithm.rs b/algorithms/linfa-trees/src/decision_trees/algorithm.rs index f60152ed7..52b5d6f41 100644 --- a/algorithms/linfa-trees/src/decision_trees/algorithm.rs +++ b/algorithms/linfa-trees/src/decision_trees/algorithm.rs @@ -11,6 +11,7 @@ use super::NodeIter; use super::Tikz; use linfa::{ dataset::{AsTargets, Labels, Records}, + error::Error, error::Result, traits::*, DatasetBase, Float, Label, @@ -128,10 +129,7 @@ pub struct TreeNode { impl Hash for TreeNode { fn hash(&self, state: &mut H) { - let mut data: Vec = vec![]; - data.push(self.feature_idx as u64); - //data.push(self.prediction); - data.push(self.leaf_node as u64); + let data: Vec = vec![self.feature_idx as u64, self.leaf_node as u64]; data.hash(state); } } @@ -495,7 +493,7 @@ impl> PredictRef, Array1 for DecisionTree { /// Make predictions for each row of a matrix of features `x`. 
- fn predict_ref<'a>(&'a self, x: &ArrayBase) -> Array1 { + fn predict_ref(&self, x: &ArrayBase) -> Array1 { x.genrows() .into_iter() .map(|row| make_prediction(&row, &self.root_node)) @@ -503,18 +501,18 @@ } } -impl<'a, F: Float, L: Label + 'a + std::fmt::Debug, D, T> Fit<'a, ArrayBase, T> +impl<'a, F: Float, L: Label + 'a + std::fmt::Debug, D, T> Fit, T, Error> for DecisionTreeParams where D: Data, T: AsTargets + Labels, { - type Object = Result>; + type Object = DecisionTree; /// Fit a decision tree using `hyperparameters` on the dataset consisting of /// a matrix of features `x` and an array of labels `y`. - fn fit(&self, dataset: &DatasetBase, T>) -> Self::Object { - self.validate().unwrap(); + fn fit(&self, dataset: &DatasetBase, T>) -> Result { + self.validate()?; let x = dataset.records(); let feature_names = dataset.feature_names(); diff --git a/algorithms/linfa-trees/src/decision_trees/tikz.rs b/algorithms/linfa-trees/src/decision_trees/tikz.rs index 466579a31..bbe518034 100644 --- a/algorithms/linfa-trees/src/decision_trees/tikz.rs +++ b/algorithms/linfa-trees/src/decision_trees/tikz.rs @@ -95,7 +95,8 @@ impl<'a, F: Float, L: Debug + Label> Tikz<'a, F, L> { let var = format!( "Var({})&:&{}\\\\", node.split().0, - node.feature_name().unwrap() + // TODO: why use legend if there are no feature names? Should it be allowed? + node.feature_name().unwrap_or(&"".to_string()) ); out.push_str(&var); map.insert(node.split().0); diff --git a/algorithms/linfa-tsne/examples/tsne.rs b/algorithms/linfa-tsne/examples/tsne.rs index f8b05556e..76c692ba4 100644 --- a/algorithms/linfa-tsne/examples/tsne.rs +++ b/algorithms/linfa-tsne/examples/tsne.rs @@ -5,7 +5,7 @@ use std::{io::Write, process::Command}; fn main() -> Result<()> { let ds = linfa_datasets::iris(); - let ds = Pca::params(3).whiten(true).fit(&ds).transform(ds); + let ds = Pca::params(3).whiten(true).fit(&ds).unwrap().transform(ds); let ds = TSne::embedding_size(2) .perplexity(10.0) diff --git a/build.rs b/build.rs new file mode 100644 index 000000000..428b39280 --- /dev/null +++ b/build.rs @@ -0,0 +1,9 @@ +#[cfg(any(feature = "openblas-system", feature = "netlib-system"))] +fn main() { + println!("cargo:rustc-link-lib=lapacke"); + println!("cargo:rustc-link-lib=lapack"); + println!("cargo:rustc-link-lib=cblas"); +} + +#[cfg(not(any(feature = "openblas-system", feature = "netlib-system")))] +fn main() {} diff --git a/docs/website/content/snippets/cross-validation.md b/docs/website/content/snippets/cross-validation.md index 2d48e3c56..bfd7e42b9 100644 --- a/docs/website/content/snippets/cross-validation.md +++ b/docs/website/content/snippets/cross-validation.md @@ -2,22 +2,21 @@ title = "Cross Validation" +++ ```rust -// perform cross-validation with the F1 score -let f1_runs = dataset - .iter_fold(8, |v| params.fit(&v).unwrap()) - .map(|(model, valid)| { - let cm = model - .predict(&valid) - .mapv(|x| x > Pr::even()) - .confusion_matrix(&valid).unwrap(); - - cm.f1_score() - }) - .collect::>(); - -// calculate mean and standard deviation -println!("F1 score: {}±{}", - f1_runs.mean().unwrap(), - f1_runs.std_axis(Axis(0), 0.0), -); -``` +// parameters to compare +let ratios = vec![0.1, 0.2, 0.5, 0.7, 1.0]; + +// create a model for each parameter +let models = ratios + .iter() + .map(|ratio| ElasticNet::params().penalty(0.3).l1_ratio(*ratio)) + .collect::>(); + +// get the mean r2 validation score across 5 folds for each model +let r2_values = + dataset.cross_validate(5, &models, |prediction, truth| 
prediction.r2(&truth))?; + +// show the mean r2 score for each parameter choice +for (ratio, r2) in ratios.iter().zip(r2_values.iter()) { + println!("L1 ratio: {}, r2 score: {}", ratio, r2); +} +``` \ No newline at end of file diff --git a/docs/website/content/snippets/k-folding.md b/docs/website/content/snippets/k-folding.md new file mode 100644 index 000000000..52f10707d --- /dev/null +++ b/docs/website/content/snippets/k-folding.md @@ -0,0 +1,23 @@ ++++ +title = "K folding" ++++ +```rust +// perform cross-validation with the F1 score +let f1_runs = dataset + .iter_fold(8, |v| params.fit(&v).unwrap()) + .map(|(model, valid)| { + let cm = model + .predict(&valid) + .mapv(|x| x > Pr::even()) + .confusion_matrix(&valid).unwrap(); + + cm.f1_score() + }) + .collect::>(); + +// calculate mean and standard deviation +println!("F1 score: {}±{}", + f1_runs.mean().unwrap(), + f1_runs.std_axis(Axis(0), 0.0), +); +``` diff --git a/src/dataset/impl_dataset.rs b/src/dataset/impl_dataset.rs index f17f18ae9..f4aab8a53 100644 --- a/src/dataset/impl_dataset.rs +++ b/src/dataset/impl_dataset.rs @@ -1,16 +1,17 @@ -use ndarray::{ - concatenate, s, Array1, Array2, ArrayBase, ArrayView2, ArrayViewMut2, Axis, Data, DataMut, - Dimension, Ix1, Ix2, -}; -use rand::{seq::SliceRandom, Rng}; -use std::collections::HashMap; - use super::{ super::traits::{Predict, PredictRef}, iter::{ChunksIter, DatasetIter, Iter}, AsTargets, AsTargetsMut, CountedTargets, Dataset, DatasetBase, DatasetView, Float, FromTargetArray, Label, Labels, Records, Result, }; +use crate::traits::Fit; +use ndarray::{ + concatenate, s, Array, Array1, Array2, ArrayBase, ArrayView1, ArrayView2, ArrayViewMut2, Axis, + Data, DataMut, Dimension, Ix1, Ix2, OwnedRepr, +}; +use rand::{seq::SliceRandom, Rng}; +use std::collections::HashMap; +use std::ops::AddAssign; /// Implementation without constraints on records and targets /// @@ -654,6 +655,18 @@ where } } +macro_rules! 
assist_swap_array2 { + ($slice: expr, $index: expr, $fold_size: expr, $features: expr) => { + if $index != 0 { + let adj_fold_size = $fold_size * $features; + let start = adj_fold_size * $index; + let (first_s, second_s) = $slice.split_at_mut(start); + let (mut fold, _) = second_s.split_at_mut(adj_fold_size); + first_s[..$fold_size * $features].swap_with_slice(&mut fold); + } + }; +} + impl<'a, F: Float, E: Copy + 'a, D, S> DatasetBase, ArrayBase> where D: DataMut, @@ -691,20 +704,24 @@ where /// ## Example /// ```rust /// use linfa::traits::Fit; - /// use linfa::dataset::{Dataset, DatasetView}; + /// use linfa::dataset::{Dataset, DatasetView, Records}; /// use ndarray::{array, ArrayView1, ArrayView2}; + /// use linfa::Error; /// /// struct MockFittable {} /// /// struct MockFittableResult { - /// mock_var: usize, + /// mock_var: usize, /// } /// - /// impl<'a> Fit<'a, ArrayView2<'a, f64>, ArrayView2<'a, f64>> for MockFittable { + /// + /// impl<'a> Fit, ArrayView2<'a, f64>, linfa::error::Error> for MockFittable { /// type Object = MockFittableResult; /// - /// fn fit(&self, training_data: &DatasetView) -> Self::Object { - /// MockFittableResult { mock_var: training_data.ntargets()} + /// fn fit(&self, training_data: &DatasetView) -> Result { + /// Ok(MockFittableResult { + /// mock_var: training_data.nsamples(), + /// }) /// } /// } /// @@ -713,17 +730,16 @@ where /// let mut dataset: Dataset = (records, targets).into(); /// let params = MockFittable {}; /// - ///for (model,validation_set) in dataset.iter_fold(5, |v| params.fit(&v)){ + ///for (model,validation_set) in dataset.iter_fold(5, |v| params.fit(&v).unwrap()){ /// // Here you can use `model` and `validation_set` to /// // assert the performance of the chosen algorithm /// } /// ``` - pub fn iter_fold) -> O>( + pub fn iter_fold) -> O>( &'a mut self, k: usize, fit_closure: C, ) -> impl Iterator, ArrayView2>)> { - //)-> impl Iterator + 'a { assert!(k > 0); assert!(k <= self.nsamples()); let samples_count = self.nsamples(); @@ -732,50 +748,243 @@ where let features = self.nfeatures(); let targets = self.ntargets(); - let mut records_sl = self.records.as_slice_mut().unwrap(); - let mut targets_sl2 = self.targets.as_multi_targets_mut(); - let mut targets_sl = targets_sl2.as_slice_mut().unwrap(); - let mut objs: Vec = Vec::new(); - for i in 0..k { - assist_swap_array2(&mut records_sl, i, fold_size, features); - assist_swap_array2(&mut targets_sl, i, fold_size, targets); - - let train = DatasetBase::new( - ArrayView2::from_shape( - (samples_count - fold_size, features), - records_sl.split_at(fold_size * features).1, - ) - .unwrap(), - ArrayView2::from_shape( - (samples_count - fold_size, targets), - targets_sl.split_at(fold_size * targets).1, - ) - .unwrap(), - ); - - let obj = fit_closure(train); - objs.push(obj); + { + let records_sl = self.records.as_slice_mut().unwrap(); + let mut targets_sl2 = self.targets.as_multi_targets_mut(); + let targets_sl = targets_sl2.as_slice_mut().unwrap(); + + for i in 0..k { + assist_swap_array2!(records_sl, i, fold_size, features); + assist_swap_array2!(targets_sl, i, fold_size, targets); + + { + let train = DatasetBase::new( + ArrayView2::from_shape( + (samples_count - fold_size, features), + records_sl.split_at(fold_size * features).1, + ) + .unwrap(), + ArrayView2::from_shape( + (samples_count - fold_size, targets), + targets_sl.split_at(fold_size * targets).1, + ) + .unwrap(), + ); + + let obj = fit_closure(&train); + objs.push(obj); + } - assist_swap_array2(&mut records_sl, i, fold_size, features); 
- assist_swap_array2(&mut targets_sl, i, fold_size, targets); + assist_swap_array2!(records_sl, i, fold_size, features); + assist_swap_array2!(targets_sl, i, fold_size, targets); + } } objs.into_iter().zip(self.sample_chunks(fold_size)) - // } -} -fn assist_swap_array2(slice: &mut [F], index: usize, fold_size: usize, features: usize) { - if index == 0 { - return; + /// Cross validation for multi-target algorithms + /// + /// Given a list of fittable models, cross validation + /// is used to compare their performance according to some + /// performance metric. To do so, k-folding is applied to the + /// dataset and, for each fold, each model is trained on the training set + /// and its performance is evaluated on the validation set. The performances + /// collected for each model are then averaged over the number of folds. + /// + /// ### Parameters: + /// + /// - `k`: the number of folds to apply + /// - `parameters`: a list of models to compare + /// - `eval`: closure used to evaluate the performance of each trained model + /// + /// ### Returns + /// + /// An array of model performances, in the same order as the models in input, if no errors occur. + /// The performance of each model is given as an array of performances, one for each target. + /// Otherwise, it might return an Error in one of the following cases: + /// + /// - An error occurred during the fitting of one model + /// - An error occurred inside the evaluation closure + /// + /// ### Example + /// + /// ```rust, ignore + /// + /// use linfa::prelude::*; + /// + /// // mutability needed for fast cross validation + /// let mut dataset = linfa_datasets::diabetes(); + /// + /// let models = vec![model1, model2, ... ]; + /// + /// let r2_scores = dataset.cross_validate_multi(5,&models, |prediction, truth| prediction.r2(truth))?; + /// + /// ``` + pub fn cross_validate_multi( + &'a mut self, + k: usize, + parameters: &[M], + eval: C, + ) -> std::result::Result, ER> + where + ER: std::error::Error + std::convert::From, + M: for<'c> Fit, ArrayView2<'c, E>, ER, Object = O>, + O: for<'d> PredictRef, Array2>, + FACC: Float, + C: Fn(&Array2, &ArrayView2) -> std::result::Result, crate::error::Error>, + { + let mut evaluations = Array2::from_elem((parameters.len(), self.ntargets()), FACC::zero()); + let folds_evaluations: std::result::Result, ER> = self + .iter_fold(k, |train| { + let fit_result: std::result::Result, ER> = + parameters.iter().map(|p| p.fit(&train)).collect(); + fit_result + }) + .map(|(models, valid)| { + let targets = valid.targets(); + let models = models?; + let mut eval_predictions = + Array2::from_elem((models.len(), targets.len()), FACC::zero()); + for (i, model) in models.iter().enumerate() { + let predicted = model.predict(valid.records()); + let eval_pred = match eval(&predicted, &targets) { + Err(e) => Err(ER::from(e)), + Ok(res) => Ok(res), + }?; + eval_predictions.row_mut(i).add_assign(&eval_pred); + } + Ok(eval_predictions) + }) + .collect(); + + for fold_evaluation in folds_evaluations? { + evaluations.add_assign(&fold_evaluation) + } + Ok(evaluations / FACC::from(k).unwrap()) + } + + /// Cross validation for single target algorithms + /// + /// Given a list of fittable models, cross validation + /// is used to compare their performance according to some + /// performance metric. To do so, k-folding is applied to the + /// dataset and, for each fold, each model is trained on the training set + /// and its performance is evaluated on the validation set. 
The performances + /// collected for each model are then averaged over the number of folds. + /// + /// ### Parameters: + /// + /// - `k`: the number of folds to apply + /// - `parameters`: a list of models to compare + /// - `eval`: closure used to evaluate the performance of each trained model. For single-target + /// datasets, this closure is called once for each fold. + /// For multi-target datasets the closure is called, in each fold, once for every different target. + /// If you need to use a different evaluation for each target, take a look at the + /// [`cross_validate_multi`](struct.DatasetBase.html#method.cross_validate_multi) method. + /// + /// ### Returns + /// + /// On successful evaluation it returns an array of model performances, in the same order as the input models. + /// + /// It returns an Error in one of the following cases: + /// + /// - An error occurred during the fitting of one model + /// - An error occurred inside the evaluation closure + /// + /// ### Example + /// + /// ```rust, ignore + /// + /// use linfa::prelude::*; + /// + /// // mutability needed for fast cross validation + /// let mut dataset = linfa_datasets::diabetes(); + /// + /// let models = vec![model1, model2, ... ]; + /// + /// let r2_scores = dataset.cross_validate(5,&models, |prediction, truth| prediction.r2(truth))?; + /// + /// ``` + pub fn cross_validate( + &'a mut self, + k: usize, + parameters: &[M], + eval: C, + ) -> std::result::Result, I>, ER> + where + ER: std::error::Error + std::convert::From, + M: for<'c> Fit, ArrayView2<'c, E>, ER, Object = O>, + O: for<'d> PredictRef, ArrayBase, I>>, + FACC: Float, + C: Fn(&ArrayView1, &ArrayView1) -> std::result::Result, + I: Dimension, + { + // construct shape as either vector or matrix + let mut shape = match I::NDIM { + Some(1) | Some(2) => Ok(I::zeros(I::NDIM.unwrap())), + _ => Err(crate::Error::NdShape(ndarray::ShapeError::from_kind( + ndarray::ErrorKind::IncompatibleShape, + ))), + }?; + + // assign shape form of output + let mut tmp = shape.as_array_view_mut(); + tmp[0] = parameters.len(); + if tmp.len() == 2 { + tmp[1] = self.ntargets(); + } + + let folds_evaluations = self + .iter_fold(k, |train| { + let fit_result: std::result::Result, ER> = + parameters.iter().map(|p| p.fit(&train)).collect(); + fit_result + }) + .map(|(models, valid)| { + let targets = valid.as_multi_targets(); + let models = models?; + + let eval_predictions = models + .iter() + .map(|m| { + let nsamples = valid.nsamples(); + let predicted = m.predict(valid.records()); + + // reshape to ensure that matrix has two dimensions + let ntargets = if predicted.ndim() == 1 { + 1 + } else { + predicted.len_of(Axis(1)) + }; + + let predicted: Array2<_> = + predicted.into_shape((nsamples, ntargets)).unwrap(); + + predicted + .gencolumns() + .into_iter() + .zip(targets.gencolumns().into_iter()) + .map(|(p, t)| eval(&p.view(), &t).map_err(ER::from)) + .collect() + }) + .collect::>, ER>>()? + .into_iter() + .flatten() + .collect(); + + Ok(Array::from_shape_vec(shape.clone(), eval_predictions).unwrap()) + }) + .collect::, ER>>(); + + let res = folds_evaluations? 
+ .into_iter() + .fold(Array::::zeros(shape.clone()), std::ops::Add::add); + + Ok(res / FACC::cast(k)) } - let adj_fold_size = fold_size * features; - let start = adj_fold_size * index; - let (first_s, second_s) = slice.split_at_mut(start); - let (mut fold, _) = second_s.split_at_mut(adj_fold_size); - first_s[..fold_size * features].swap_with_slice(&mut fold); } impl Dataset { diff --git a/src/dataset/impl_targets.rs b/src/dataset/impl_targets.rs index 86d392955..45872f727 100644 --- a/src/dataset/impl_targets.rs +++ b/src/dataset/impl_targets.rs @@ -5,8 +5,8 @@ use super::{ Label, Labels, Pr, Records, }; use ndarray::{ - concatenate, Array1, Array2, ArrayBase, ArrayView2, ArrayViewMut2, Axis, CowArray, Data, - DataMut, Dimension, Ix1, Ix2, Ix3, OwnedRepr, ViewRepr, + Array1, Array2, ArrayBase, ArrayView2, ArrayViewMut2, Axis, CowArray, Data, DataMut, Dimension, + Ix1, Ix2, Ix3, OwnedRepr, ViewRepr, }; impl<'a, L, S: Data> AsTargets for ArrayBase { @@ -151,12 +151,12 @@ impl, I: Dimension> Labels for ArrayBase { } } -/// A NdArray with discrete labels can act as labels -impl> Labels for DatasetBase> { +/// Counted labels can act as labels +impl> Labels for CountedTargets { type Elem = L; fn label_count(&self) -> Vec> { - self.targets.labels.clone() + self.labels.clone() } } @@ -165,9 +165,14 @@ where D: Data, T: AsTargets, { + /// Transforms the input dataset by keeping only those samples whose label appears in `labels`. + /// + /// In the multi-target case a sample is kept if *any* of its targets appears in `labels`. + /// + /// Sample weights and feature names are preserved by this transformation. pub fn with_labels( &self, - labels: &[&[L]], + labels: &[L], ) -> DatasetBase, CountedTargets>> { let targets = self.targets.as_multi_targets(); let old_weights = self.weights(); @@ -185,7 +190,7 @@ where .zip(targets.genrows().into_iter()) .enumerate() { - let any_exists = t.iter().zip(labels.iter()).any(|(a, b)| b.contains(&a)); + let any_exists = t.iter().any(|a| labels.contains(&a)); if any_exists { for (map, val) in map.iter_mut().zip(t.iter()) { @@ -201,8 +206,15 @@ where } } - let records: Array2 = concatenate(Axis(0), &records_arr).unwrap(); - let targets = concatenate(Axis(0), &targets_arr).unwrap(); + let nsamples = records_arr.len(); + let nfeatures = self.nfeatures(); + let ntargets = self.ntargets(); + + let records_arr = records_arr.into_iter().flatten().copied().collect(); + let targets_arr = targets_arr.into_iter().flatten().copied().collect(); + + let records = Array2::from_shape_vec((nsamples, nfeatures), records_arr).unwrap(); + let targets = Array2::from_shape_vec((nsamples, ntargets), targets_arr).unwrap(); let targets = CountedTargets { targets, diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index 5100c12b1..102bb60a2 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -284,6 +284,7 @@ pub trait Labels { #[cfg(test)] mod tests { use super::*; + use approx::assert_abs_diff_eq; use ndarray::{array, Array1, Array2}; use rand::{rngs::SmallRng, SeedableRng}; @@ -523,34 +524,67 @@ mod tests { ); } - struct MockFittable {} + use crate::traits::{Fit, PredictRef}; + use ndarray::ArrayView2; + use thiserror::Error; + + struct MockFittable { + mock_var: usize, + } struct MockFittableResult { mock_var: usize, } - use crate::traits::Fit; - use ndarray::ArrayView2; + #[derive(Error, Debug)] + enum MockError { + #[error(transparent)] + LinfaError(#[from] crate::error::Error), + } + + type MockResult = std::result::Result; - impl<'a> Fit<'a, ArrayView2<'a, f64>, 
ArrayView2<'a, f64>> for MockFittable { + impl<'a> Fit, ArrayView2<'a, f64>, MockError> for MockFittable { type Object = MockFittableResult; - fn fit(&self, training_data: &DatasetView) -> Self::Object { - MockFittableResult { - mock_var: training_data.nsamples(), + fn fit( + &self, + training_data: &DatasetView, + ) -> std::result::Result { + if self.mock_var == 0 { + Err(MockError::LinfaError(Error::Parameters("0".to_string()))) + } else { + Ok(MockFittableResult { + mock_var: training_data.nsamples(), + }) } } } + impl<'b> PredictRef, Array1> for MockFittableResult { + fn predict_ref<'a>(&'a self, _x: &'a ArrayView2<'b, f64>) -> Array1 { + array![0.] + } + } + + impl<'b> PredictRef, Array2> for MockFittableResult { + fn predict_ref<'a>(&'a self, _x: &'a ArrayView2<'b, f64>) -> Array2 { + array![[0., 0.]] + } + } + #[test] fn test_iter_fold() { let records = Array2::from_shape_vec((5, 2), vec![1., 1., 2., 2., 3., 3., 4., 4., 5., 5.]).unwrap(); let targets = Array1::from_shape_vec(5, vec![1., 2., 3., 4., 5.]).unwrap(); let mut dataset: Dataset = (records, targets).into(); - let params = MockFittable {}; + let params = MockFittable { mock_var: 1 }; - for (i, (model, validation_set)) in dataset.iter_fold(5, |v| params.fit(&v)).enumerate() { + for (i, (model, validation_set)) in dataset + .iter_fold(5, |v| params.fit(&v).unwrap()) + .enumerate() + { assert_eq!(model.mock_var, 4); assert_eq!(validation_set.records().row(0)[0] as usize, i + 1); assert_eq!(validation_set.records().row(0)[1] as usize, i + 1); @@ -566,12 +600,15 @@ mod tests { Array2::from_shape_vec((5, 2), vec![1., 1., 2., 2., 3., 3., 4., 4., 5., 5.]).unwrap(); let targets = Array1::from_shape_vec(5, vec![1., 2., 3., 4., 5.]).unwrap(); let mut dataset: Dataset = (records, targets).into(); - let params = MockFittable {}; + let params = MockFittable { mock_var: 1 }; // If we request three folds from a dataset with 5 samples it will cut the // last two samples from the folds and always add them as a tail of the training // data - for (i, (model, validation_set)) in dataset.iter_fold(3, |v| params.fit(&v)).enumerate() { + for (i, (model, validation_set)) in dataset + .iter_fold(3, |v| params.fit(&v).unwrap()) + .enumerate() + { assert_eq!(model.mock_var, 4); assert_eq!(validation_set.records().row(0)[0] as usize, i + 1); assert_eq!(validation_set.records().row(0)[1] as usize, i + 1); @@ -582,7 +619,10 @@ mod tests { } // the same goes for the last sample if we choose 4 folds - for (i, (model, validation_set)) in dataset.iter_fold(4, |v| params.fit(&v)).enumerate() { + for (i, (model, validation_set)) in dataset + .iter_fold(4, |v| params.fit(&v).unwrap()) + .enumerate() + { assert_eq!(model.mock_var, 4); assert_eq!(validation_set.records().row(0)[0] as usize, i + 1); assert_eq!(validation_set.records().row(0)[1] as usize, i + 1); @@ -594,7 +634,10 @@ mod tests { // if we choose 2 folds then again the last sample will be only // used for trainig - for (i, (model, validation_set)) in dataset.iter_fold(2, |v| params.fit(&v)).enumerate() { + for (i, (model, validation_set)) in dataset + .iter_fold(2, |v| params.fit(&v).unwrap()) + .enumerate() + { assert_eq!(model.mock_var, 3); assert_eq!(validation_set.targets().dim(), (2, 1)); assert!(i < 2); @@ -608,7 +651,7 @@ mod tests { Array2::from_shape_vec((5, 2), vec![1., 1., 2., 2., 3., 3., 4., 4., 5., 5.]).unwrap(); let targets = Array1::from_shape_vec(5, vec![1., 2., 3., 4., 5.]).unwrap(); let mut dataset: Dataset = (records, targets).into(); - let params = MockFittable {}; + let params = 
MockFittable { mock_var: 1 }; let _ = dataset.iter_fold(0, |v| params.fit(&v)).enumerate(); } @@ -619,7 +662,229 @@ mod tests { Array2::from_shape_vec((5, 2), vec![1., 1., 2., 2., 3., 3., 4., 4., 5., 5.]).unwrap(); let targets = Array1::from_shape_vec(5, vec![1., 2., 3., 4., 5.]).unwrap(); let mut dataset: Dataset = (records, targets).into(); - let params = MockFittable {}; + let params = MockFittable { mock_var: 1 }; let _ = dataset.iter_fold(6, |v| params.fit(&v)).enumerate(); } + + #[test] + fn test_st_cv_all_correct() { + let records = + Array2::from_shape_vec((5, 2), vec![1., 1., 2., 2., 3., 3., 4., 4., 5., 5.]).unwrap(); + let targets = Array1::from_shape_vec(5, vec![1., 2., 3., 4., 5.]).unwrap(); + let mut dataset: Dataset = (records, targets).into(); + let params = vec![MockFittable { mock_var: 1 }, MockFittable { mock_var: 2 }]; + let acc = dataset + .cross_validate(5, ¶ms, |_pred, _truth| Ok(3.)) + .unwrap(); + assert_eq!(acc, array![3., 3.]); + + let mut dataset: Dataset = + (array![[1., 1.], [2., 2.]], array![[1., 2.], [3., 4.]]).into(); + + let params = vec![MockFittable { mock_var: 1 }, MockFittable { mock_var: 2 }]; + let acc = dataset + .cross_validate(2, ¶ms, |_pred, _truth| Ok(3.)) + .unwrap(); + assert_eq!(acc, array![[3., 3.], [3., 3.]]); + } + #[test] + #[should_panic( + expected = "called `Result::unwrap()` on an `Err` value: LinfaError(Parameters(\"0\"))" + )] + fn test_st_cv_one_incorrect() { + let records = + Array2::from_shape_vec((5, 2), vec![1., 1., 2., 2., 3., 3., 4., 4., 5., 5.]).unwrap(); + let targets = Array1::from_shape_vec(5, vec![1., 2., 3., 4., 5.]).unwrap(); + let mut dataset: Dataset = (records, targets).into(); + // second one should throw an error + let params = vec![MockFittable { mock_var: 1 }, MockFittable { mock_var: 0 }]; + let acc: MockResult> = dataset.cross_validate(5, ¶ms, |_pred, _truth| Ok(0.)); + + acc.unwrap(); + } + + #[test] + #[should_panic( + expected = "called `Result::unwrap()` on an `Err` value: LinfaError(Parameters(\"eval\"))" + )] + fn test_st_cv_incorrect_eval() { + let records = + Array2::from_shape_vec((5, 2), vec![1., 1., 2., 2., 3., 3., 4., 4., 5., 5.]).unwrap(); + let targets = Array1::from_shape_vec(5, vec![1., 2., 3., 4., 5.]).unwrap(); + let mut dataset: Dataset = (records, targets).into(); + // second one should throw an error + let params = vec![MockFittable { mock_var: 1 }, MockFittable { mock_var: 1 }]; + let err: MockResult> = dataset.cross_validate(5, ¶ms, |_pred, _truth| { + if false { + Ok(0f32) + } else { + Err(Error::Parameters("eval".to_string())) + } + }); + + err.unwrap(); + } + + #[test] + fn test_st_cv_mt_all_correct() { + let records = + Array2::from_shape_vec((5, 2), vec![1., 1., 2., 2., 3., 3., 4., 4., 5., 5.]).unwrap(); + let targets = array![[1., 1.], [2., 2.], [3., 3.], [4., 4.], [5., 5.]]; + let mut dataset: Dataset = (records, targets).into(); + let params = vec![MockFittable { mock_var: 1 }, MockFittable { mock_var: 2 }]; + let acc = dataset + .cross_validate_multi(5, ¶ms, |_pred, _truth| Ok(array![5., 6.])) + .unwrap(); + assert_eq!(acc.dim(), (params.len(), dataset.ntargets())); + assert_eq!(acc, array![[5., 6.], [5., 6.]]) + } + #[test] + fn test_st_cv_mt_one_incorrect() { + let records = + Array2::from_shape_vec((5, 2), vec![1., 1., 2., 2., 3., 3., 4., 4., 5., 5.]).unwrap(); + let targets = Array1::from_shape_vec(5, vec![1., 2., 3., 4., 5.]).unwrap(); + let mut dataset: Dataset = (records, targets).into(); + // second one should throw an error + let params = vec![MockFittable { mock_var: 1 
}, MockFittable { mock_var: 0 }]; + let err = dataset + .cross_validate_multi(5, ¶ms, |_pred, _truth| Ok(array![5.])) + .unwrap_err(); + assert_eq!(err.to_string(), "invalid parameter 0".to_string()); + } + + #[test] + fn test_st_cv_mt_incorrect_eval() { + let records = + Array2::from_shape_vec((5, 2), vec![1., 1., 2., 2., 3., 3., 4., 4., 5., 5.]).unwrap(); + let targets = Array1::from_shape_vec(5, vec![1., 2., 3., 4., 5.]).unwrap(); + let mut dataset: Dataset = (records, targets).into(); + // second one should throw an error + let params = vec![MockFittable { mock_var: 1 }, MockFittable { mock_var: 1 }]; + let err = dataset + .cross_validate_multi(5, ¶ms, |_pred, _truth| { + if false { + Ok(array![0f32]) + } else { + Err(Error::Parameters("eval".to_string())) + } + }) + .unwrap_err(); + assert_eq!(err.to_string(), "invalid parameter eval".to_string()); + } + + #[test] + fn test_with_labels_st() { + let records = array![ + [0., 1.], + [1., 2.], + [2., 3.], + [0., 4.], + [1., 5.], + [2., 6.], + [0., 7.], + [1., 8.], + [2., 9.], + [0., 10.] + ]; + let targets = array![0, 1, 2, 0, 1, 2, 0, 1, 2, 0].insert_axis(Axis(1)); + let dataset = DatasetBase::from((records, targets)); + assert_eq!(dataset.nsamples(), 10); + assert_eq!(dataset.ntargets(), 1); + let dataset_no_0 = dataset.with_labels(&[1, 2]); + assert_eq!(dataset_no_0.nsamples(), 6); + assert_eq!(dataset_no_0.ntargets(), 1); + assert_abs_diff_eq!( + dataset_no_0.records, + array![[1., 2.], [2., 3.], [1., 5.], [2., 6.], [1., 8.], [2., 9.]] + ); + assert_abs_diff_eq!( + dataset_no_0.try_single_target().unwrap(), + array![1, 2, 1, 2, 1, 2] + ); + let dataset_no_1 = dataset.with_labels(&[0, 2]); + assert_eq!(dataset_no_1.nsamples(), 7); + assert_eq!(dataset_no_1.ntargets(), 1); + assert_abs_diff_eq!( + dataset_no_1.records, + array![ + [0., 1.], + [2., 3.], + [0., 4.], + [2., 6.], + [0., 7.], + [2., 9.], + [0., 10.] + ] + ); + assert_abs_diff_eq!( + dataset_no_1.try_single_target().unwrap(), + array![0, 2, 0, 2, 0, 2, 0] + ); + let dataset_no_2 = dataset.with_labels(&[0, 1]); + assert_eq!(dataset_no_2.nsamples(), 7); + assert_eq!(dataset_no_2.ntargets(), 1); + assert_abs_diff_eq!( + dataset_no_2.records, + array![ + [0., 1.], + [1., 2.], + [0., 4.], + [1., 5.], + [0., 7.], + [1., 8.], + [0., 10.] + ] + ); + assert_abs_diff_eq!( + dataset_no_2.try_single_target().unwrap(), + array![0, 1, 0, 1, 0, 1, 0] + ); + } + + #[test] + fn test_with_labels_mt() { + let records = array![ + [0., 1.], + [1., 2.], + [2., 3.], + [0., 4.], + [1., 5.], + [2., 6.], + [0., 7.], + [1., 8.], + [2., 9.], + [0., 10.] 
+ ]; + let targets = array![ + [0, 7], + [1, 8], + [2, 9], + [0, 7], + [1, 8], + [2, 9], + [0, 7], + [1, 8], + [2, 9], + [0, 7] + ]; + let dataset = DatasetBase::from((records, targets)); + assert_eq!(dataset.nsamples(), 10); + assert_eq!(dataset.ntargets(), 2); + // remove 0 from target 1 and 7 from target 2 + let dataset_no_07 = dataset.with_labels(&[1, 2, 8, 9]); + assert_eq!(dataset_no_07.nsamples(), 6); + assert_eq!(dataset_no_07.ntargets(), 2); + assert_abs_diff_eq!( + dataset_no_07.records, + array![[1., 2.], [2., 3.], [1., 5.], [2., 6.], [1., 8.], [2., 9.]] + ); + assert_abs_diff_eq!( + dataset_no_07.as_multi_targets(), + array![[1, 8], [2, 9], [1, 8], [2, 9], [1, 8], [2, 9]] + ); + // remove label 1 from target 1 and label 7 from target 2: with_labels is an "any" filter, so all samples should be kept + let dataset_no_17 = dataset.with_labels(&[0, 2, 8, 9]); + assert_eq!(dataset_no_17.nsamples(), 10); + assert_eq!(dataset_no_17.ntargets(), 2); + } } diff --git a/src/error.rs b/src/error.rs index c86a81a20..f5ef0f14d 100644 --- a/src/error.rs +++ b/src/error.rs @@ -24,4 +24,6 @@ pub enum Error { MultipleTargets, #[error("platt scaling failed")] Platt(PlattNewtonResult), + #[error("The numbers of samples do not match: {0} - {1}")] + MismatchedShapes(usize, usize), } diff --git a/src/lib.rs b/src/lib.rs index 02cc08c57..b678bdb55 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,6 +32,9 @@ //! | [hierarchical](https://docs.rs/linfa-hierarchical/) | Agglomerative hierarchical clustering | Tested | Unsupervised learning | Cluster and build hierarchy of clusters | //! | [bayes](https://docs.rs/linfa-bayes/) | Naive Bayes | Tested | Supervised learning | Contains Gaussian Naive Bayes | //! | [ica](https://docs.rs/linfa-ica/) | Independent component analysis | Tested | Unsupervised learning | Contains FastICA implementation | +//! | [pls](algorithms/linfa-pls/) | Partial Least Squares | Tested | Supervised learning | Contains PLS estimators for dimensionality reduction and regression | +//! | [tsne](algorithms/linfa-tsne/) | Dimensionality reduction | Tested | Unsupervised learning | Contains exact solution and Barnes-Hut approximation t-SNE | +//! | [preprocessing](algorithms/linfa-preprocessing/) | Normalization & Vectorization | Tested | Pre-processing | Contains data normalization/whitening and count vectorization/tf-idf | //! //! We believe that only a significant community effort can nurture, build, and sustain a machine learning ecosystem in Rust - there is no other way forward. //! 
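The reworked `with_labels` above now takes a flat `&[L]` slice and keeps a sample whenever *any* of its target labels appears in the list. As a compact illustration, here is a sketch distilled from the `test_with_labels_st` case above (not part of the patch itself; it only relies on `linfa::DatasetBase` and the `Records` trait):

```rust
use linfa::dataset::Records;
use linfa::DatasetBase;
use ndarray::{array, Axis};

// three samples with a single target column carrying labels 0, 1 and 2
let records = array![[0., 1.], [1., 2.], [2., 3.]];
let targets = array![0, 1, 2].insert_axis(Axis(1));
let dataset = DatasetBase::from((records, targets));

// keep only the samples whose label appears in the list: label 0 is dropped
let filtered = dataset.with_labels(&[1, 2]);
assert_eq!(filtered.nsamples(), 2);
```

In the multi-target case the same call acts as an "any" filter, which is why `test_with_labels_mt` above keeps all ten samples as long as every row still has one matching label.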
diff --git a/src/metrics_classification.rs b/src/metrics_classification.rs index 835bcc54d..37d6a345d 100644 --- a/src/metrics_classification.rs +++ b/src/metrics_classification.rs @@ -10,7 +10,7 @@ use ndarray::prelude::*; use ndarray::Data; use crate::dataset::{AsTargets, DatasetBase, Label, Labels, Pr, Records}; -use crate::error::Result; +use crate::error::{Error, Result}; /// Return tuple of class index for each element of prediction and ground_truth fn map_prediction_to_idx( @@ -267,10 +267,25 @@ where T: AsTargets + Labels, { fn confusion_matrix(&self, ground_truth: ArrayBase) -> Result> { + self.confusion_matrix(&ground_truth) + } +} + +impl ToConfusionMatrix> for T +where + S: Data, + T: AsTargets + Labels, +{ + fn confusion_matrix(&self, ground_truth: &ArrayBase) -> Result> { + let targets = self.try_single_target()?; + if targets.len() != ground_truth.len() { + return Err(Error::MismatchedShapes(targets.len(), ground_truth.len())); + } + let classes = self.labels(); let indices = map_prediction_to_idx( - &self.try_single_target()?.as_slice().unwrap(), + targets.as_slice().unwrap(), &ground_truth.as_slice().unwrap(), &classes, ); @@ -475,107 +490,109 @@ impl, T2: AsTargets, - D: Dimension, - >( - a: ArrayBase, - b: &[A], - ) { - assert_eq_iter(a.iter(), b); + use std::collections::HashMap; + + fn get_labels_map(cm: &ConfusionMatrix) -> HashMap { + cm.members + .iter() + .enumerate() + .map(|(index, label)| (label.clone(), index)) + .collect() } - fn assert_eq_iter<'a, A, B>(a: impl IntoIterator, b: impl IntoIterator) - where - A: 'a + std::fmt::Debug + PartialEq + AbsDiffEq, - B: Borrow, - { - let mut a_iter = a.into_iter(); - let mut b_iter = b.into_iter(); - loop { - match (a_iter.next(), b_iter.next()) { - (None, None) => break, - (Some(a_item), Some(b_item)) => { - abs_diff_eq!(a_item.borrow(), b_item); - } - _ => { - panic!("assert_eq_iters: iterators had different lengths"); - } - } + // confusion matrices use hash sets for the labels to pair so + // the order of the rows of the matrices is not constant. 
+ // we can transform the index->member mapping in `cm.members` + // into a member->index mapping to check each element independently + fn assert_cm_eq(cm: &ConfusionMatrix, expected: &Array2, labels: &Array1) { + let map = get_labels_map(cm); + for ((row, column), value) in expected.indexed_iter().map(|((r, c), v)| { + ( + (*map.get(&labels[r]).unwrap(), *map.get(&labels[c]).unwrap()), + v, + ) + }) { + let cm_value = *cm.matrix.get((row, column)).unwrap(); + assert_abs_diff_eq!(cm_value, value); + } + } + + fn assert_split_eq) -> f32>( + cm: &ConfusionMatrix, + eval: C, + expected: &Array1, + labels: &Array1, + ) { + let map = get_labels_map(cm); + let evals = cm + .split_one_vs_all() + .into_iter() + .map(|x| eval(&x)) + .collect::>(); + for (index, value) in expected + .indexed_iter() + .map(|(i, v)| (*map.get(&labels[i]).unwrap(), v)) + { + let evals_value = *evals.get(index).unwrap(); + assert_abs_diff_eq!(evals_value, value); } } #[test] fn test_confusion_matrix() { - let predicted = ArrayView1::from(&[0, 1, 0, 1, 0, 1]); let ground_truth = ArrayView1::from(&[1, 1, 0, 1, 0, 1]); + let predicted = ArrayView1::from(&[0, 1, 0, 1, 0, 1]); - let cm = predicted.confusion_matrix(ground_truth); + let cm = predicted.confusion_matrix(ground_truth).unwrap(); - assert_eq_slice(cm.matrix, &[2., 1., 0., 3.]); + let labels = array![0, 1]; + let expected = array![[2., 1.], [0., 3.]]; + + assert_cm_eq(&cm, &expected, &labels); } #[test] fn test_cm_metrices() { - let predicted = Array1::from(vec![0, 1, 0, 1, 0, 1]); let ground_truth = Array1::from(vec![1, 1, 0, 1, 0, 1]); + let predicted = Array1::from(vec![0, 1, 0, 1, 0, 1]); - let x = predicted.confusion_matrix(ground_truth); + let x = predicted.confusion_matrix(ground_truth).unwrap(); - abs_diff_eq!(x.accuracy(), 5.0 / 6.0_f32); - abs_diff_eq!( + let labels = array![0, 1]; + + assert_abs_diff_eq!(x.accuracy(), 5.0 / 6.0_f32); + assert_abs_diff_eq!( x.mcc(), (2. * 3. - 1. * 0.) / (2.0f32 * 3. * 3. * 4.).sqrt() as f32 ); - assert_eq_iter( - x.split_one_vs_all().into_iter().map(|x| x.precision()), - &[1.0, 3. / 4.], + assert_split_eq( + &x, + |cm| ConfusionMatrix::precision(cm), + &array![1.0, 3. 
/ 4.], + &labels, ); - assert_eq_iter( - x.split_one_vs_all().into_iter().map(|x| x.recall()), - &[2.0 / 3.0, 1.0], + assert_split_eq( + &x, + |cm| ConfusionMatrix::recall(cm), + &array![2.0 / 3.0, 1.0], + &labels, ); - assert_eq_iter( - x.split_one_vs_all().into_iter().map(|x| x.f1_score()), - &[4.0 / 5.0, 6.0 / 7.0], + assert_split_eq( + &x, + |cm| ConfusionMatrix::f1_score(cm), + &array![4.0 / 5.0, 6.0 / 7.0], + &labels, ); } - #[test] - fn test_modification() { - let predicted = array![0, 3, 2, 0, 1, 1, 1, 3, 2, 3]; - - let ground_truth = - DatasetBase::new((), array![0, 2, 3, 0, 1, 2, 1, 2, 3, 2]).with_labels(&[0, 1, 2]); - - // exclude class 3 from evaluation - let cm = predicted.confusion_matrix(&ground_truth); - - assert_eq_slice(cm.matrix, &[2., 0., 0., 0., 2., 1., 0., 0., 0.]); - - // weight errors in class 2 more severe and exclude class 1 - let ground_truth = ground_truth - .with_weights(vec![1., 2., 1., 1., 1., 2., 1., 2., 1., 2.]) - .with_labels(&[0, 2, 3]); - - let cm = predicted.confusion_matrix(&ground_truth); - - // the false-positive error for label=2 is twice severe here - assert_eq_slice(cm.matrix, &[2., 0., 0., 0., 0., 4., 0., 3., 0.]); - } - #[test] fn test_roc_curve() { let predicted = ArrayView1::from(&[0.1, 0.3, 0.5, 0.7, 0.8, 0.9]).mapv(Pr); @@ -592,7 +609,7 @@ mod tests { (1., 1.), ]; - let roc = predicted.roc(&groundtruth); + let roc = predicted.roc(&groundtruth).unwrap(); assert_eq!(roc.get_curve(), result); } @@ -609,32 +626,38 @@ mod tests { .collect::>(); // ROC Area-Under-Curve should be approximately 0.5 - let roc = predicted.roc(&ground_truth); + let roc = predicted.roc(&ground_truth).unwrap(); assert!((roc.area_under_curve() - 0.5) < 0.04); } #[test] fn split_one_vs_all() { - let predicted = array![0, 3, 2, 0, 1, 1, 1, 3, 2, 3]; let ground_truth = array![0, 2, 3, 0, 1, 2, 1, 2, 3, 2]; + let predicted = array![0, 3, 2, 0, 1, 1, 1, 3, 2, 3]; // create a confusion matrix - let cm = predicted.confusion_matrix(ground_truth); + let cm = predicted.confusion_matrix(ground_truth).unwrap(); + + let labels = array![0, 1, 2, 3]; + let bin_labels = array![true, false]; + let map = get_labels_map(&cm); // split four class confusion matrix into 4 binary confusion matrix let n_cm = cm.split_one_vs_all(); - let result: &[&[f32]] = &[ - &[2., 0., 0., 8.], // no misclassification for label=0 - &[2., 1., 0., 7.], // one false-positive for label=1 - &[0., 2., 4., 4.], // two false-positive and four false-negative for label=2 - &[0., 3., 2., 5.], // three false-positive and two false-negative for label=3 + let result = &[ + array![[2., 0.], [0., 8.]], // no misclassification for label=0 + array![[2., 1.], [0., 7.]], // one false-positive for label=1 + array![[0., 2.], [4., 4.]], // two false-positive and four false-negative for label=2 + array![[0., 3.], [2., 5.]], // three false-positive and two false-negative for label=3 ]; - // compare to result - n_cm.into_iter() - .zip(result.iter()) - .for_each(|(x, r)| assert_eq_slice(x.matrix, r)) + for (r, x) in result + .iter() + .zip(labels.iter()) + .map(|(r, l)| (r, n_cm.get(*map.get(l).unwrap()).unwrap())) + { + assert_cm_eq(x, r, &bin_labels); + } } } -*/ diff --git a/src/traits.rs b/src/traits.rs index b0c4e30d7..c7bbca2cb 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -2,6 +2,7 @@ //! use crate::dataset::{DatasetBase, Records}; +use std::convert::From; /// Transformation algorithms /// @@ -20,10 +21,10 @@ pub trait Transformer { /// A fittable algorithm takes a dataset and creates a concept of some kind about it. 
For example /// in *KMeans* this would be the mean values for each class, or in *SVM* the separating /// hyperplane. It returns a model, which can be used to predict targets for new data. -pub trait Fit<'a, R: Records, T> { - type Object: 'a; +pub trait Fit> { + type Object; - fn fit(&self, dataset: &DatasetBase) -> Self::Object; + fn fit(&self, dataset: &DatasetBase) -> Result; } /// Incremental algorithms