diff --git a/.gitignore b/.gitignore index 15eba83e1..226933088 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ __pycache__ build/ dist/ docs/src/examples +sg_execution_times.rst diff --git a/examples/reconstruction/PlotGFRE.py b/examples/reconstruction/PlotGFRE.py index 9111c83e5..d1f45106e 100644 --- a/examples/reconstruction/PlotGFRE.py +++ b/examples/reconstruction/PlotGFRE.py @@ -4,20 +4,19 @@ """ Global Feature Reconstruction Error (GFRE) and Distortion (GFRD) ================================================================ - -Example for the usage of the :class:`skmatter.metrics.global_reconstruction_error` -as global feature reconstruction error (GFRE) and +Example for the usage of the :class:`skmatter.metrics.global_reconstruction_error` as +global feature reconstruction error (GFRE) and :class:`skmatter.metrics.global_reconstruction_distortion` global feature reconstruction -distortion (GFRD). We apply the global reconstruction measures on the degenerate -CH4 manifold dataset. This dataset was specifically constructed to be -representable by a 4-body features (bispectrum) but not by a 3-body features -(power spectrum). In other words the dataset contains environments which are -different, but have the same 3-body features. For more details about the dataset -please refer to `Pozdnyakov 2020 `_. +distortion (GFRD). We apply the global reconstruction measures on the degenerate CH4 +manifold dataset. This dataset was specifically constructed to be representable by +4-body features (bispectrum) but not by 3-body features (power spectrum). In other +words, the dataset contains environments which are different, but have the same 3-body +features. For more details about the dataset please refer to `Pozdnyakov 2020 +`_. The ``skmatter`` dataset already contains the 3 and 4-body features computed with -`librascal `_ so we can load it and -compare it with the GFRE/GFRD. +`librascal `_ so we can load it and compare it +with the GFRE/GFRD. """ # %% # diff --git a/examples/reconstruction/PlotLFRE.py b/examples/reconstruction/PlotLFRE.py index de4be77a5..ead5d131f 100644 --- a/examples/reconstruction/PlotLFRE.py +++ b/examples/reconstruction/PlotLFRE.py @@ -3,7 +3,6 @@ """ Pointwise Local Reconstruction Error ==================================== - Example for the usage of the :class:`skmatter.metrics.pointwise_local_reconstruction_error` as pointwise local reconstruction error (LFRE) on the degenerate CH4 manifold. We apply the local @@ -14,9 +13,9 @@ dataset please refer to `Pozdnyakov 2020 `_. -The skmatter dataset already contains the 3 and 4-body features computed with -`librascal `_ so we can load it and compare it -with the LFRE. +The skmatter dataset already contains the 3 and 4-body features computed with `librascal +`_ so we can load it and compare it with the +LFRE. """ # %% # diff --git a/examples/reconstruction/PlotPointwiseGFRE.py b/examples/reconstruction/PlotPointwiseGFRE.py index df7662bc4..256b6011c 100644 --- a/examples/reconstruction/PlotPointwiseGFRE.py +++ b/examples/reconstruction/PlotPointwiseGFRE.py @@ -3,8 +3,7 @@ """ Pointwise GFRE applied on RKHS features -================================================================ - +======================================= Example for the usage of the :class:`skmatter.metrics.pointwise_global_reconstruction_error` as the pointwise global feature reconstruction error (pointwise GFRE).
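+
+As a minimal sketch of the metric call itself (``X`` and ``Y`` here are hypothetical
+feature matrices for the same samples, standing in for the librascal features used
+below; the metric's default settings are assumed):
+
+>>> import numpy as np
+>>> from skmatter.metrics import pointwise_global_reconstruction_error
+>>> X = np.random.RandomState(0).rand(20, 4)
+>>> Y = np.random.RandomState(1).rand(20, 4)
+>>> errors = pointwise_global_reconstruction_error(X, Y)  # array of pointwise errors
+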
We apply the pointwise global feature diff --git a/examples/regression/OrthogonalRegressionNonAnalytic.py b/examples/regression/OrthogonalRegressionNonAnalytic.py index c5089df01..586178d6d 100644 --- a/examples/regression/OrthogonalRegressionNonAnalytic.py +++ b/examples/regression/OrthogonalRegressionNonAnalytic.py @@ -3,7 +3,6 @@ r""" Regression with orthogonal projector/matrices ============================================= - In this example, we explain how when using :class:`skmatter.linear_model.OrthogonalRegression` the option ``use_orthogonal_projector`` can result in non-analytic behavior. In diff --git a/examples/regression/Ridge2FoldCVRegularization.py b/examples/regression/Ridge2FoldCVRegularization.py index 83ad6d9f0..b4c78cf63 100644 --- a/examples/regression/Ridge2FoldCVRegularization.py +++ b/examples/regression/Ridge2FoldCVRegularization.py @@ -1,16 +1,14 @@ # %% r""" - Ridge2FoldCV for data with low effective rank - ======================================================= - In this notebook we explain in more detail how - :class:`skmatter.linear_model.Ridge2FoldCV` speeds up the - cross-validation optimizing the regularitzation parameter :param alpha: and - compare it with existing solution for that in scikit-learn - :class:`slearn.linear_model.RidgeCV`. - :class:`skmatter.linear_model.Ridge2FoldCV` was designed to predict - efficiently feature matrices, but it can be also useful for the prediction - single targets. +Ridge2FoldCV for data with low effective rank +============================================= +In this notebook we explain in more detail how +:class:`skmatter.linear_model.Ridge2FoldCV` speeds up the cross-validation optimizing +the regularization parameter ``alpha`` and compare it with the existing solution in +scikit-learn, :class:`sklearn.linear_model.RidgeCV`. +:class:`skmatter.linear_model.Ridge2FoldCV` was designed to predict efficiently feature +matrices, but it can also be useful for the prediction of single targets. """ # %% # @@ -128,6 +126,7 @@ def micro_bench(ridge): + """A small benchmark function.""" global N_REPEAT_MICRO_BENCH, X, y timings = [] train_mse = [] @@ -177,6 +176,7 @@ def micro_bench(ridge): def get_train_test_error(estimator): + """The train and test error based on the estimator.""" global X_train, y_train, X_test, y_test estimator = estimator.fit(X_train, y_train) return ( diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py index 95e43ed15..69aad73c3 100644 --- a/src/skmatter/_selection.py +++ b/src/skmatter/_selection.py @@ -1,13 +1,13 @@ """ -This module contains data sub-selection modules primarily corresponding to methods -derived from CUR matrix decomposition and Farthest Point Sampling. In their classical -form, CUR and FPS determine a data subset that maximizes the variance (CUR) or -distribution (FPS) of the features or samples. These methods can be modified to combine -supervised target information denoted by the methods `PCov-CUR` and `PCov-FPS`. For -further reading, refer to [Imbalzano2018]_ and [Cersonsky2021]_. These selectors can be -used for both feature and sample selection, with similar instantiations. All -sub-selection methods scores each feature or sample (without an estimator) and chooses -that with the maximum score. A simple example of usage: +Data sub-selection modules primarily corresponding to methods derived from CUR matrix +decomposition and Farthest Point Sampling.
In their classical form, CUR and FPS +determine a data subset that maximizes the variance (CUR) or distribution (FPS) of the +features or samples. These methods can be modified to combine supervised target +information denoted by the methods `PCov-CUR` and `PCov-FPS`. For further reading, refer +to [Imbalzano2018]_ and [Cersonsky2021]_. These selectors can be used for both feature +and sample selection, with similar instantiations. All sub-selection methods score +each feature or sample (without an estimator) and choose the one with the maximum score. A +simple example of usage: .. doctest:: @@ -98,60 +98,49 @@ class GreedySelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator): - """ - - Transformer that adds, via greedy forward selection, + """Transformer that adds, via greedy forward selection, features or samples to form a subset. At each stage, the model scores each feature or sample (without an estimator) and chooses that with the maximum score. Parameters ---------- - selection_type : str, {'feature', 'sample'} whether to choose a subset of columns ('feature') or rows ('sample'). Stored in :py:attr:`self._axis_name` (as text) and :py:attr:`self._axis` (as 0 or 1 for 'sample' or 'feature', respectively). - n_to_select : int or float, default=None The number of selections to make. If `None`, half of the features or samples are selected. If integer, the parameter is the absolute number of selections to make. If float between 0 and 1, it is the fraction of the total dataset to select. Stored in :py:attr:`self.n_to_select`. - score_threshold : float, default=None Threshold for the score. If `None` selection will continue until the n_to_select is chosen. Otherwise will stop when the score falls below the threshold. Stored in :py:attr:`self.score_threshold`. - score_threshold_type : str, default="absolute" How to interpret the ``score_threshold``. When "absolute", the score used by the selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the score of the first selection, i.e. the selector quits when ``current_score / first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. - progress_bar: bool, default=False option to use `tqdm `_ progress bar to monitor selections. Stored in :py:attr:`self.report_progress_`. - full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining features. Stored in :py:attr:`self.full`.
- -    random_state: int or RandomState instance, default=0 +    random_state: int or :class:`numpy.random.RandomState` instance, default=0 Attributes ---------- n_selected_ : int -        Counter tracking the number of selections that have been made -    X_selected_ : ndarray, -        Matrix containing the selected samples or features, for use in fitting -    y_selected_ : ndarray, -        In sample selection, the matrix containing the selected targets, for -        use in fitting - - +        Counter tracking the number of selections that have been made +    X_selected_ : numpy.ndarray, +        Matrix containing the selected samples or features, for use in fitting +    y_selected_ : numpy.ndarray, +        In sample selection, the matrix containing the selected targets, for use in +        fitting """ def __init__( @@ -195,7 +184,6 @@ def fit(self, X, y=None, warm_start=False): ------- self : object """ - if self.selection_type == "feature": self._axis = 1 elif self.selection_type == "sample": @@ -311,7 +299,6 @@ def transform(self, X, y=None): X_r : ndarray The selected subset of the input. """ - check_is_fitted(self, ["_axis", "selected_idx_", "n_selected_"]) if self._axis == 0: @@ -396,7 +383,6 @@ def get_support(self, indices=False, ordered=False): def _init_greedy_search(self, X, y, n_to_select): """Initializes the search. Prepares an array to store the selected features.""" - self.n_selected_ = 0 self.first_score_ = None @@ -413,7 +399,6 @@ def _init_greedy_search(self, X, y, n_to_select): def _continue_greedy_search(self, X, y, n_to_select): """Continues the search. Prepares an array to store the selected features.""" - n_pad = [(0, 0), (0, 0)] n_pad[self._axis] = (0, n_to_select - self.n_selected_) @@ -455,10 +440,9 @@ def _get_best_new_selection(self, scorer, X, y): return max_score_idx def _update_post_selection(self, X, y, last_selected): + """Saves the most recently selected feature and increments the feature + counter. """ - Saves the most recently selected feature and increments the feature counter - """ - if self._axis == 1: self.X_selected_[:, self.n_selected_] = np.take( X, last_selected, axis=self._axis ) @@ -508,30 +492,26 @@ class _CUR(GreedySelector): which maximize the magnitude of the right or left singular vectors, consistent with classic CUR matrix decomposition. - **WARNING**: This base class should never be directly instantiated. - Instead, use :py:class:`skmatter.feature_selection.CUR` and - :py:class:`skmatter.sample_selection.CUR`, - which have the same constructor signature. + .. warning:: + This base class should never be directly instantiated. Instead, use + :py:class:`skmatter.feature_selection.CUR` and + :py:class:`skmatter.sample_selection.CUR`, which have the same constructor + signature. Parameters ---------- recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - + number of steps after which to recompute the pi score; + defaults to 1, if 0 no re-computation is done k : int number of eigenvectors to compute the importance score with, defaults to 1 - tolerance: float - threshold below which scores will be considered 0, defaults to 1E-12 - + threshold below which scores will be considered 0, defaults to 1E-12 Attributes ---------- - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - + The original matrix orthogonalized by previous selections """ def __init__( @@ -562,47 +542,40 @@ def __init__( ) def score(self, X, y=None): - r""" - Returns the importance score of the given samples or features.
+        r"""Returns the importance score of the given samples or features. - NOTE: This function does not compute the importance score each time it - is called, in order to avoid unnecessary computations. This is done - by :py:func:`self._compute_pi`. + .. note:: + This function does not compute the importance score each time it is called, + in order to avoid unnecessary computations. This is done by + :py:func:`self._compute_pi`. Parameters ---------- - X : ndarray of shape [n_samples, n_features] + X : numpy.ndarray of shape [n_samples, n_features] The input samples. - y : ignored Returns ------- - score : ndarray of (n_to_select_from_) + score : numpy.ndarray of (n_to_select_from_) :math:`\pi` importance for the given samples or features - """ - return self.pi_ def _init_greedy_search(self, X, y, n_to_select): - """ - Initializes the search. Prepares an array to store the selected + """Initializes the search. Prepares an array to store the selected features and computes their initial importance score. """ - self.X_current_ = as_float_array(X.copy()) self.pi_ = self._compute_pi(self.X_current_) super()._init_greedy_search(X, y, n_to_select) def _continue_greedy_search(self, X, y, n_to_select): + """Continues the search. Prepares an array to store the selected features, + orthogonalizes the features by those already selected, and computes their + initial importance. """ - Continues the search. Prepares an array to store the selected - features, orthogonalizes the features by those already selected, - and computes their initial importance. - """ - for c in self.selected_idx_: if self.recompute_every != 0 and ( np.linalg.norm(np.take(self.X_current_, [c], axis=self._axis)) @@ -615,22 +588,19 @@ def _continue_greedy_search(self, X, y, n_to_select): def _compute_pi(self, X, y=None): - """ - For feature selection, the importance score :math:`\\pi` is the sum over + r"""For feature selection, the importance score :math:`\pi` is the sum over the squares of the first :math:`k` components of the right singular vectors .. math:: - \\pi_j = \\sum_i^k \\left(\\mathbf{U}_\\mathbf{C}\\right)_{ij}^2. where :math:`\\mathbf{C} = \\mathbf{X}^T\\mathbf{X}`. - For sample selection, the importance score :math:`\\pi` is the sum over - the squares of the first :math:`k` components of the right singular vectors + For sample selection, the importance score :math:`\pi` is the sum over the + squares of the first :math:`k` components of the right singular vectors .. math:: - \\pi_j = \\sum_i^k \\left(\\mathbf{U}_\\mathbf{K}\\right)_{ij}^2. @@ -638,17 +608,15 @@ def _compute_pi(self, X, y=None): Parameters ---------- - X : ndarray of shape [n_samples, n_features] + X : numpy.ndarray of shape [n_samples, n_features] The input samples. - y : ignored Returns ------- - pi : ndarray of (n_to_select_from_) + pi : numpy.ndarray of (n_to_select_from_) :math:`\\pi` importance for the given samples or features """ - svd_kwargs = dict(k=self.k, random_state=self.random_state) if self._axis == 0: svd_kwargs["return_singular_vectors"] = "u" @@ -690,7 +658,7 @@ def _orthogonalize(self, last_selected): class _PCovCUR(GreedySelector): - """Transformer that performs Greedy Selection by choosing features + r"""Transformer that performs Greedy Selection by choosing features which maximize the magnitude of the right or left augmented singular vectors.
This is done by employing the augmented kernel and covariance matrices, @@ -702,29 +670,22 @@ class _PCovCUR(GreedySelector): Parameters ---------- recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - + number of steps after which to recompute the pi score; defaults to 1, if 0 no + re-computation is done k : int number of eigenvectors to compute the importance score with, defaults to 1 - tolerance: float - threshold below which scores will be considered 0, defaults to 1E-12 - + threshold below which scores will be considered 0, defaults to 1E-12 mixing: float, default=0.5 - The PCovR mixing parameter, as described in PCovR as - :math:`{\\alpha}`. Stored in :py:attr:`self.mixing`. + The PCovR mixing parameter, as described in PCovR as + :math:`{\alpha}`. Stored in :py:attr:`self.mixing`. Attributes ---------- - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - + The original matrix orthogonalized by previous selections y_current_ : ndarray (n_samples, n_properties) - The targets orthogonalized by a regression on - the previous selections. - + The targets orthogonalized by a regression on the previous selections. """ def __init__( @@ -758,34 +719,29 @@ def __init__( ) def score(self, X, y=None): - """ - Returns the importance score of the given samples or features. + r"""Returns the importance score of the given samples or features. - NOTE: This function does not compute the importance score each time it - is called, in order to avoid unnecessary computations. This is done - by :py:func:`self._compute_pi`. + .. note:: + This function does not compute the importance score each time it is called, + in order to avoid unnecessary computations. This is done by + :py:func:`self._compute_pi`. Parameters ---------- X : ignored - y : ignored Returns ------- score : ndarray of (n_to_select_from_) - :math:`\\pi` importance for the given samples or features - + :math:`\pi` importance for the given samples or features """ - return self.pi_ def _init_greedy_search(self, X, y, n_to_select): - """ - Initializes the search. Prepares an array to store the selected + """Initializes the search. Prepares an array to store the selected features and computes their initial importance score. """ - self.X_ref_ = X self.y_ref_ = y self.X_current_ = X.copy() @@ -798,12 +754,10 @@ def _init_greedy_search(self, X, y, n_to_select): super()._init_greedy_search(X, y, n_to_select) def _continue_greedy_search(self, X, y, n_to_select): + """Continues the search. Prepares an array to store the selected + features, orthogonalizes the features by those already selected, and computes + their initial importance. """ - Continues the search. Prepares an array to store the selected - features, orthogonalizes the features by those already selected, - and computes their initial importance. - """ - for c in self.selected_idx_: if self.recompute_every != 0 and ( np.linalg.norm(np.take(self.X_current_, [c], axis=self._axis)) @@ -832,12 +786,10 @@ def _update_post_selection(self, X, y, last_selected): self.pi_[last_selected] = 0.0 def _compute_pi(self, X, y=None): - r""" - For feature selection, the importance score :math:`\pi` is the sum over - the squares of the first :math:`k` components of the right singular vectors + r"""For feature selection, the importance score :math:`\pi` is the sum over + the squares of the first :math:`k` components of the right singular vectors. ..
math:: - \pi_j = \sum_i^k \left(\mathbf{U}_\mathbf{\tilde{C}}\right)_{ij}^2. @@ -852,7 +804,6 @@ def _compute_pi(self, X, y=None): the squares of the first :math:`k` components of the right singular vectors .. math:: - \pi_j = \sum_i^k \left(\mathbf{U}_\mathbf{\tilde{K}}\right)_{ij}^2. @@ -863,17 +814,15 @@ def _compute_pi(self, X, y=None): Parameters ---------- - X : ndarray of shape [n_samples, n_features] + X : numpy.ndarray of shape [n_samples, n_features] The input samples. - y : ignored Returns ------- - pi : ndarray of (n_to_select_from_) + pi : numpy.ndarray of (n_to_select_from_) :math:`\pi` importance for the given samples or features """ - if self._axis == 0: pcovr_distance = pcovr_kernel( self.mixing, @@ -923,22 +872,19 @@ def _orthogonalize(self, last_selected): class _FPS(GreedySelector): - """ - Transformer that performs Greedy Selection using Farthest Point Sampling. + """Transformer that performs Greedy Selection using Farthest Point Sampling. - **WARNING**: This base class should never be directly instantiated. - Instead, use :py:class:`skmatter.feature_selection.FPS` and - :py:class:`skmatter.sample_selection.FPS`, - which have the same constructor signature. + .. warning:: + This base class should never be directly instantiated. Instead, use + :py:class:`skmatter.feature_selection.FPS` and + :py:class:`skmatter.sample_selection.FPS`, which have the same constructor + signature. Parameters ---------- - initialize: int, list of int, or 'random', default=0 - Index of the first selection(s). If 'random', picks a random - value when fit starts. Stored in :py:attr:`self.initialize`. - - + Index of the first selection(s). If 'random', picks a random value when fit + starts. Stored in :py:attr:`self.initialize`. """ def __init__( @@ -984,54 +930,45 @@ def score(self, X, y=None): return self.hausdorff_ def get_distance(self): - """ - - Traditional FPS employs a column-wise Euclidean + r"""Traditional FPS employs a column-wise Euclidean distance for feature selection, which can be expressed using the covariance matrix :math:`\\mathbf{C} = \\mathbf{X} ^ T \\mathbf{X}` .. math:: - \\operatorname{d}_c(i, j) = C_{ii} - 2 C_{ij} + C_{jj}. + \operatorname{d}_c(i, j) = C_{ii} - 2 C_{ij} + C_{jj}. For sample selection, this is a row-wise Euclidean distance, which can be expressed in terms of the Gram matrix - :math:`\\mathbf{K} = \\mathbf{X} \\mathbf{X} ^ T` + :math:`\mathbf{K} = \mathbf{X} \mathbf{X} ^ T` .. math:: - \\operatorname{d}_r(i, j) = K_{ii} - 2 K_{ij} + K_{jj}. + \operatorname{d}_r(i, j) = K_{ii} - 2 K_{ij} + K_{jj}. Returns ------- - hausdorff : ndarray of shape (`n_to_select_from_`) - the minimum distance from each point to the set of selected - points. once a point is selected, the distance is not updated; - the final list will reflect the distances when selected. - + the minimum distance from each point to the set of selected points. Once a + point is selected, the distance is not updated; the final list will reflect + the distances when selected. """ return self.hausdorff_ def get_select_distance(self): """ - Returns ------- - hausdorff_at_select : ndarray of shape (`n_to_select`) at the time of selection, the minimum distance from each selected point to the set of previously selected points. - """ mask = self.get_support(indices=True, ordered=True) return self.hausdorff_at_select_[mask] def _init_greedy_search(self, X, y, n_to_select): + """Initializes the search.
Prepares an array to store the selections, + makes the initial selection (unless provided), and computes the starting + hausdorff distances. """ - Initializes the search. Prepares an array to store the selections, - makes the initial selection (unless provided), and - computes the starting hausdorff distances. - """ - super()._init_greedy_search(X, y, n_to_select) self.norms_ = (X**2).sum(axis=abs(self._axis - 1)) @@ -1082,25 +1019,20 @@ def _update_post_selection(self, X, y, last_selected): class _PCovFPS(GreedySelector): - """ - Transformer that performs Greedy Selection using PCovR-weighted - Farthest Point Sampling. - In PCov-FPS, a modified covariance or Gram matrix - is used to express the distances. + r"""Transformer that performs Greedy Selection using PCovR-weighted + Farthest Point Sampling. In PCov-FPS, a modified covariance or Gram matrix is used + to express the distances. For sample selection, this is a modified kernel matrix. Parameters ---------- - mixing: float, default=0.5 The PCovR mixing parameter, as described in PCovR as - :math:`{\\alpha}` - + :math:`{\alpha}` initialize: int or 'random', default=0 Index of the first selection. If 'random', picks a random value when fit starts. - """ def __init__( @@ -1135,8 +1067,7 @@ def __init__( ) def score(self, X, y=None): - """ - Returns the Hausdorff distances of all samples to previous selections + """Returns the Hausdorff distances of all samples to previous selections. NOTE: This function does not compute the importance score each time it is called, in order to avoid unnecessary computations. The hausdorff @@ -1155,39 +1086,31 @@ def score(self, X, y=None): def get_distance(self): """ - Returns ------- - hausdorff : ndarray of shape (`n_to_select_from_`) - the minimum distance from each point to the set of selected - points. once a point is selected, the distance is not updated; - the final list will reflect the distances when selected. - + the minimum distance from each point to the set of selected points. Once a + point is selected, the distance is not updated; the final list will reflect + the distances when selected. """ return self.hausdorff_ def get_select_distance(self): """ - Returns ------- - hausdorff_at_select : ndarray of shape (`n_to_select`) - at the time of selection, the minimum distance from each - selected point to the set of previously selected points. - + at the time of selection, the minimum distance from each selected point to + the set of previously selected points. """ mask = self.get_support(indices=True, ordered=True) return self.hausdorff_at_select_[mask] def _init_greedy_search(self, X, y, n_to_select): + """Initializes the search. Prepares an array to store the selections, + makes the initial selection (unless provided), and computes the starting + hausdorff distances. """ - Initializes the search. Prepares an array to store the selections, - makes the initial selection (unless provided), and - computes the starting hausdorff distances. - """ - super()._init_greedy_search(X, y, n_to_select) if self._axis == 1: @@ -1224,17 +1147,14 @@ def _update_hausdorff(self, X, y, last_selected): np.minimum(self.hausdorff_, new_dist, self.hausdorff_) def _update_post_selection(self, X, y, last_selected): - """ - Saves the most recent selections, increments the counter, - and, recomputes hausdorff distances. + """Saves the most recent selections, increments the counter, and recomputes + hausdorff distances.
""" self._update_hausdorff(X, y, last_selected) super()._update_post_selection(X, y, last_selected) def _more_tags(self): - """ - Pass that this method requires a target vector - """ + """Pass that this method requires a target vector""" return { "requires_y": True, } diff --git a/src/skmatter/datasets/__init__.py b/src/skmatter/datasets/__init__.py index c10e90e24..c72113195 100644 --- a/src/skmatter/datasets/__init__.py +++ b/src/skmatter/datasets/__init__.py @@ -1,3 +1,5 @@ +"""Datasets used for example and testing.""" + from ._base import ( load_csd_1000r, load_degenerate_CH4_manifold, diff --git a/src/skmatter/datasets/_base.py b/src/skmatter/datasets/_base.py index e20e8887d..b3ff8b9f5 100644 --- a/src/skmatter/datasets/_base.py +++ b/src/skmatter/datasets/_base.py @@ -6,6 +6,7 @@ def load_nice_dataset(): """Load and returns NICE dataset. + Returns ------- nice_data : sklearn.utils.Bunch @@ -16,7 +17,6 @@ def load_nice_dataset(): DESCR: `str` -- The full description of the dataset. """ - module_path = dirname(__file__) target_filename = join(module_path, "data", "nice_dataset.npz") raw_data = np.load(target_filename) @@ -92,6 +92,7 @@ def load_csd_1000r(return_X_y=False): def load_who_dataset(): """Load and returns WHO dataset. + Returns ------- who_dataset : sklearn.utils.Bunch @@ -100,7 +101,6 @@ def load_who_dataset(): as a Pandas dataframe. DESCR: `str` -- The full description of the dataset. """ - module_path = dirname(__file__) target_filename = join(module_path, "data", "who_dataset.csv") pd = check_pandas_support("load_who_dataset") @@ -112,8 +112,8 @@ def load_who_dataset(): def load_roy_dataset(): """Load and returns the ROY dataset, which contains structures, - energies and SOAP-derived descriptors for 264 polymorphs of ROY, - from [Beran et Al, Chemical Science (2022)](https://doi.org/10.1039/D1SC06074K) + energies and SOAP-derived descriptors for 264 polymorphs of ROY, from [Beran et Al, + Chemical Science (2022)](https://doi.org/10.1039/D1SC06074K) Returns ------- @@ -123,7 +123,6 @@ def load_roy_dataset(): features: `np.array` -- SOAP-derived descriptors for the structures energies: `np.array` -- energies of the structures """ - module_path = dirname(__file__) target_structures = join(module_path, "data", "beran_roy_structures.xyz.bz2") diff --git a/src/skmatter/decomposition/_kernel_pcovr.py b/src/skmatter/decomposition/_kernel_pcovr.py index eba8c1ccc..a76c63e28 100644 --- a/src/skmatter/decomposition/_kernel_pcovr.py +++ b/src/skmatter/decomposition/_kernel_pcovr.py @@ -19,17 +19,15 @@ class KernelPCovR(_BasePCA, LinearModel): - r""" - Kernel Principal Covariates Regression, as described in [Helfrecht2020]_ - determines a latent-space projection :math:`\mathbf{T}` which - minimizes a combined loss in supervised and unsupervised tasks in the - reproducing kernel Hilbert space (RKHS). + r"""Kernel Principal Covariates Regression, as described in [Helfrecht2020]_ + determines a latent-space projection :math:`\mathbf{T}` which minimizes a combined + loss in supervised and unsupervised tasks in the reproducing kernel Hilbert space + (RKHS). - This projection is determined by the eigendecomposition of a modified gram - matrix :math:`\mathbf{\tilde{K}}` + This projection is determined by the eigendecomposition of a modified gram matrix + :math:`\mathbf{\tilde{K}}` .. 
math:: - \mathbf{\tilde{K}} = \alpha \mathbf{K} + (1 - \alpha) \mathbf{\hat{Y}}\mathbf{\hat{Y}}^T @@ -40,15 +38,13 @@ class KernelPCovR(_BasePCA, LinearModel): Parameters ---------- - mixing: float, default=0.5 + mixing : float, default=0.5 mixing parameter, as described in PCovR as :math:`{\\alpha}` - - n_components: int, float or str, default=None + n_components : int, float or str, default=None Number of components to keep. if n_components is not set all components are kept:: n_components == n_samples - svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto' If auto : The solver is selected by a default policy based on `X.shape` and @@ -66,7 +62,6 @@ class KernelPCovR(_BasePCA, LinearModel): 0 < n_components < min(X.shape) If randomized : run randomized SVD by the method of Halko et al. - regressor : {instance of `sklearn.kernel_ridge.KernelRidge`, `precomputed`, None}, default=None The regressor to use for computing the property predictions :math:`\\hat{\\mathbf{Y}}`. @@ -77,76 +72,58 @@ class KernelPCovR(_BasePCA, LinearModel): If `precomputed`, we assume that the `y` passed to the `fit` function is the regressed form of the targets :math:`{\mathbf{\hat{Y}}}`. - - - kernel: "linear" | "poly" | "rbf" | "sigmoid" | "cosine" | "precomputed" + kernel : "linear" | "poly" | "rbf" | "sigmoid" | "cosine" | "precomputed" Kernel. Default="linear". - - gamma: float, default=None + gamma : float, default=None Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other kernels. - - degree: int, default=3 + degree : int, default=3 Degree for poly kernels. Ignored by other kernels. - - coef0: float, default=1 + coef0 : float, default=1 Independent term in poly and sigmoid kernels. Ignored by other kernels. - - kernel_params: mapping of str to any, default=None + kernel_params : mapping of str to any, default=None Parameters (keyword arguments) and values for kernel passed as callable object. Ignored by other kernels. - - center: bool, default=False + center : bool, default=False Whether to center any computed kernels - - fit_inverse_transform: bool, default=False + fit_inverse_transform : bool, default=False Learn the inverse transform for non-precomputed kernels. (i.e. learn to find the pre-image of a point) - - tol: float, default=1e-12 + tol : float, default=1e-12 Tolerance for singular values computed by svd_solver == 'arpack' and for matrix inversions. Must be of range [0.0, infinity). - - n_jobs: int, default=None + n_jobs : int, default=None The number of parallel jobs to run. :obj:`None` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. - iterated_power : int or 'auto', default='auto' Number of iterations for the power method computed by svd_solver == 'randomized'. Must be of range [0, infinity). - - random_state : int, RandomState instance or None, default=None + random_state : int, :class:`numpy.random.RandomState` instance or None, default=None Used when the 'arpack' or 'randomized' solvers are used. Pass an int for reproducible results across multiple function calls. 
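+
+    A minimal construction sketch (illustrative values only; giving the regressor
+    the same kernel settings as the model is an assumption of this sketch):
+
+    >>> from sklearn.kernel_ridge import KernelRidge
+    >>> from skmatter.decomposition import KernelPCovR
+    >>> kpcovr = KernelPCovR(
+    ...     mixing=0.5,
+    ...     n_components=2,
+    ...     regressor=KernelRidge(kernel="rbf", gamma=1.0),
+    ...     kernel="rbf",
+    ...     gamma=1.0,
+    ... )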
Attributes ---------- - - pt__: ndarray of size :math:`({n_{components}, n_{components}})` + pt__: numpy.ndarray of size :math:`({n_{components}, n_{components}})` pseudo-inverse of the latent-space projection, which can be used to construct projectors from latent-space - - pkt_: ndarray of size :math:`({n_{samples}, n_{components}})` + pkt_: numpy.ndarray of size :math:`({n_{samples}, n_{components}})` the projector, or weights, from the input kernel :math:`\\mathbf{K}` to the latent-space projection :math:`\\mathbf{T}` - - pky_: ndarray of size :math:`({n_{samples}, n_{properties}})` + pky_: numpy.ndarray of size :math:`({n_{samples}, n_{properties}})` the projector, or weights, from the input kernel :math:`\\mathbf{K}` to the properties :math:`\\mathbf{Y}` - - pty_: ndarray of size :math:`({n_{components}, n_{properties}})` + pty_: numpy.ndarray of size :math:`({n_{components}, n_{properties}})` the projector, or weights, from the latent-space projection :math:`\\mathbf{T}` to the properties :math:`\\mathbf{Y}` - - ptx_: ndarray of size :math:`({n_{components}, n_{features}})` + ptx_: numpy.ndarray of size :math:`({n_{components}, n_{features}})` the projector, or weights, from the latent-space projection :math:`\\mathbf{T}` to the feature matrix :math:`\\mathbf{X}` - - X_fit_: ndarray of shape (n_samples, n_features) + X_fit_: numpy.ndarray of shape (n_samples, n_features) The data used to fit the model. This attribute is used to build kernels from new data. @@ -235,10 +212,7 @@ def _get_kernel(self, X, Y=None): ) def _fit(self, K, Yhat, W): - """ - Fit the model with the computed kernel and approximated properties. - """ - + """Fit the model with the computed kernel and approximated properties.""" K_tilde = pcovr_kernel(mixing=self.mixing, X=K, Y=Yhat, kernel="precomputed") if self._fit_svd_solver == "full": @@ -262,22 +236,19 @@ def _fit(self, K, Yhat, W): self.pt__ = np.linalg.lstsq(T, np.eye(T.shape[0]), rcond=self.tol)[0] def fit(self, X, Y, W=None): - """ - - Fit the model with X and Y. + r"""Fit the model with X and Y. Parameters ---------- - X: ndarray, shape (n_samples, n_features) + X : numpy.ndarray, shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. - It is suggested that :math:`\\mathbf{X}` be centered by its column- + It is suggested that :math:`\mathbf{X}` be centered by its + column-means and scaled. If features are related, the matrix should be scaled to have unit variance, otherwise :math:`\\mathbf{X}` should be scaled so that each feature has a variance of 1 / n_features. - - Y: ndarray, shape (n_samples, n_properties) + Y : numpy.ndarray, shape (n_samples, n_properties) Training data, where n_samples is the number of samples and n_properties is the number of properties It is suggested that :math:`\\mathbf{X}` be centered by its column- means and scaled. If features are related, the matrix should be scaled to have unit variance, otherwise :math:`\\mathbf{Y}` should be scaled so that each feature has a variance of 1 / n_features. - - W : ndarray, shape (n_samples, n_properties) + W : numpy.ndarray, shape (n_samples, n_properties) Regression weights, optional when regressor=`precomputed`. If not passed, it is assumed that `W = np.linalg.lstsq(K, Y, self.tol)[0]` Returns ------- self: object Returns the instance itself.
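+
+        A minimal sketch of a fit (hypothetical random data, reusing the ``kpcovr``
+        instance constructed in the class-level sketch above):
+
+        >>> import numpy as np
+        >>> X = np.random.RandomState(0).rand(10, 4)
+        >>> Y = np.random.RandomState(1).rand(10, 2)
+        >>> T = kpcovr.fit(X, Y).transform(X)  # doctest: +SKIP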
- """ - if self.regressor not in ["precomputed", None] and not isinstance( self.regressor, KernelRidge ): @@ -417,7 +385,6 @@ def fit(self, X, Y, W=None): def predict(self, X=None): """Predicts the property values""" - check_is_fitted(self, ["pky_", "pty_"]) X = check_array(X) @@ -428,20 +395,17 @@ def predict(self, X=None): return K @ self.pky_ def transform(self, X): - """ - Apply dimensionality reduction to X. + """Apply dimensionality reduction to X. - X is projected on the first principal components as determined by the + ``X`` is projected on the first principal components as determined by the modified Kernel PCovR distances. Parameters ---------- - X: ndarray, shape (n_samples, n_features) + X : numpy.ndarray, shape (n_samples, n_features) New data, where n_samples is the number of samples and n_features is the number of features. - """ - check_is_fitted(self, ["pkt_", "X_fit_"]) X = check_array(X) @@ -453,13 +417,11 @@ def transform(self, X): return K @ self.pkt_ def inverse_transform(self, T): - """Transform input data back to its original space. + r"""Transform input data back to its original space. .. math:: - - \\mathbf{\\hat{X}} = \\mathbf{T} \\mathbf{P}_{TX} - = \\mathbf{K} \\mathbf{P}_{KT} \\mathbf{P}_{TX} - + \mathbf{\hat{X}} = \mathbf{T} \mathbf{P}_{TX} + = \mathbf{K} \mathbf{P}_{KT} \mathbf{P}_{TX} Similar to KPCA, the original features are not always recoverable, as the projection is computed from the kernel features, not the original @@ -468,29 +430,25 @@ Parameters ---------- - T: ndarray, shape (n_samples, n_components) - Projected data, where n_samples is the number of samples - and n_components is the number of components. + T : numpy.ndarray, shape (n_samples, n_components) + Projected data, where n_samples is the number of samples and n_components is + the number of components. Returns ------- - X_original ndarray, shape (n_samples, n_features) + X_original : numpy.ndarray, shape (n_samples, n_features) """ - return T @ self.ptx_ def score(self, X, Y): - r""" - Computes the (negative) loss values for KernelPCovR on the given predictor and - response variables. The loss in :math:`\mathbf{K}`, as explained in + r"""Computes the (negative) loss values for KernelPCovR on the given predictor + and response variables. The loss in :math:`\mathbf{K}`, as explained in [Helfrecht2020]_ does not correspond to a traditional Gram loss - :math:`\mathbf{K} - \mathbf{TT}^T`. Indicating the kernel between set - A and B as :math:`\mathbf{K}_{AB}`, - the projection of set A as :math:`\mathbf{T}_A`, and with N and V as the - train and validation/test set, one obtains + :math:`\mathbf{K} - \mathbf{TT}^T`. Indicating the kernel between set A and B as + :math:`\mathbf{K}_{AB}`, the projection of set A as :math:`\mathbf{T}_A`, and + with N and V as the train and validation/test set, one obtains .. math:: - \ell=\frac{\operatorname{Tr}\left[\mathbf{K}_{VV} - 2 \mathbf{K}_{VN} \mathbf{T}_N (\mathbf{T}_N^T \mathbf{T}_N)^{-1} \mathbf{T}_V^T + \mathbf{T}_V(\mathbf{T}_N^T \mathbf{T}_N)^{-1} \mathbf{T}_N^T \mathbf{K}_{NN} \mathbf{T}_N (\mathbf{T}_N^T \mathbf{T}_N)^{-1} \mathbf{T}_V^T\right]}{\operatorname{Tr}(\mathbf{K}_{VV})} - The negative loss is returned for easier use in sklearn pipelines, e.g., a - grid search, where methods named 'score' are meant to be maximized.
- - Arguments - --------- - X: independent (predictor) variable - Y: dependent (response) variable + The negative loss is returned for easier use in sklearn pipelines, e.g., a grid + search, where methods named 'score' are meant to be maximized. + Parameters + ---------- + X : numpy.ndarray + independent (predictor) variable + Y : numpy.ndarray + dependent (response) variable Returns ------- - L: Negative sum of the KPCA and KRR losses, with the KPCA loss - determined by the reconstruction of the kernel - + L : float + Negative sum of the KPCA and KRR losses, with the KPCA loss determined by + the reconstruction of the kernel """ - check_is_fitted(self, ["pkt_", "X_fit_"]) X = check_array(X) diff --git a/src/skmatter/decomposition/_pcovr.py b/src/skmatter/decomposition/_pcovr.py index 01a385e43..929f137e2 100644 --- a/src/skmatter/decomposition/_pcovr.py +++ b/src/skmatter/decomposition/_pcovr.py @@ -19,9 +19,7 @@ class PCovR(_BasePCA, LinearModel): - r""" - - Principal Covariates Regression, as described in [deJong1992]_ + r"""Principal Covariates Regression, as described in [deJong1992]_ determines a latent-space projection :math:`\mathbf{T}` which minimizes a combined loss in supervised and unsupervised tasks. @@ -29,7 +27,6 @@ class PCovR(_BasePCA, LinearModel): matrix :math:`\mathbf{\tilde{K}}` .. math:: - \mathbf{\tilde{K}} = \alpha \mathbf{X} \mathbf{X}^T + (1 - \alpha) \mathbf{\hat{Y}}\mathbf{\hat{Y}}^T @@ -42,7 +39,6 @@ class PCovR(_BasePCA, LinearModel): :math:`\mathbf{\tilde{C}}` .. math:: - \mathbf{\tilde{C}} = \alpha \mathbf{X}^T \mathbf{X} + (1 - \alpha) \left(\left(\mathbf{X}^T \mathbf{X}\right)^{-\frac{1}{2}} \mathbf{X}^T @@ -69,108 +65,85 @@ class PCovR(_BasePCA, LinearModel): Parameters ---------- mixing: float, default=0.5 - mixing parameter, as described in PCovR as :math:`{\alpha}`, here named - to avoid confusion with regularization parameter `alpha` - + mixing parameter, as described in PCovR as :math:`{\alpha}`, here named to avoid + confusion with regularization parameter `alpha` n_components : int, float or str, default=None Number of components to keep. if n_components is not set all components are kept:: n_components == min(n_samples, n_features) - svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto' If auto : The solver is selected by a default policy based on `X.shape` and - `n_components`: if the input data is larger than 500x500 and the - number of components to extract is lower than 80% of the smallest - dimension of the data, then the more efficient 'randomized' - method is enabled. Otherwise the exact full SVD is computed and - optionally truncated afterwards. + `n_components`: if the input data is larger than 500x500 and the number of + components to extract is lower than 80% of the smallest dimension of the + data, then the more efficient 'randomized' method is enabled. Otherwise the + exact full SVD is computed and optionally truncated afterwards. If full : - run exact full SVD calling the standard LAPACK solver via - `scipy.linalg.svd` and select the components by postprocessing + run exact full SVD calling the standard LAPACK solver via `scipy.linalg.svd` + and select the components by postprocessing If arpack : run SVD truncated to n_components calling ARPACK solver via - `scipy.sparse.linalg.svds`. It requires strictly - 0 < n_components < min(X.shape) + `scipy.sparse.linalg.svds`. It requires strictly 0 < n_components < + min(X.shape) If randomized : run randomized SVD by the method of Halko et al. 
- tol : float, default=1e-12 - Tolerance for singular values computed by svd_solver == 'arpack'. - Must be of range [0.0, infinity). - + Tolerance for singular values computed by svd_solver == 'arpack'. Must be of + range [0.0, infinity). space: {'feature', 'sample', 'auto'}, default='auto' - whether to compute the PCovR in `sample` or `feature` space - default=`sample` when :math:`{n_{samples} < n_{features}}` and - `feature` when :math:`{n_{features} < n_{samples}}` - + whether to compute the PCovR in `sample` or `feature` space; default=`sample` + when :math:`{n_{samples} < n_{features}}` and `feature` when + :math:`{n_{features} < n_{samples}}` regressor: {`Ridge`, `RidgeCV`, `LinearRegression`, `precomputed`}, default=None - regressor for computing approximated :math:`{\mathbf{\hat{Y}}}`. - The regressor should be one `sklearn.linear_model.Ridge`, - `sklearn.linear_model.RidgeCV`, or `sklearn.linear_model.LinearRegression`. - If a pre-fitted regressor is provided, it is used to compute - :math:`{\mathbf{\hat{Y}}}`. - Note that any pre-fitting of the regressor will be lost if `PCovR` is - within a composite estimator that enforces cloning, e.g., - `sklearn.compose.TransformedTargetRegressor` or - `sklearn.pipeline.Pipeline` with model caching. - In such cases, the regressor will be re-fitted on the same - training data as the composite estimator. - If `precomputed`, we assume that the `y` passed to the `fit` function - is the regressed form of the targets :math:`{\mathbf{\hat{Y}}}`. - If None, ``sklearn.linear_model.Ridge('alpha':1e-6, 'fit_intercept':False, 'tol':1e-12)`` - is used as the regressor. - + regressor for computing approximated :math:`{\mathbf{\hat{Y}}}`. The regressor + should be one of `sklearn.linear_model.Ridge`, `sklearn.linear_model.RidgeCV`, or + `sklearn.linear_model.LinearRegression`. If a pre-fitted regressor is provided, + it is used to compute :math:`{\mathbf{\hat{Y}}}`. Note that any pre-fitting of + the regressor will be lost if `PCovR` is within a composite estimator that + enforces cloning, e.g., `sklearn.compose.TransformedTargetRegressor` or + `sklearn.pipeline.Pipeline` with model caching. In such cases, the regressor + will be re-fitted on the same training data as the composite estimator. If + `precomputed`, we assume that the `y` passed to the `fit` function is the + regressed form of the targets :math:`{\mathbf{\hat{Y}}}`. If None, + ``sklearn.linear_model.Ridge('alpha':1e-6, 'fit_intercept':False, 'tol':1e-12)`` + is used as the regressor. iterated_power : int or 'auto', default='auto' - Number of iterations for the power method computed by - svd_solver == 'randomized'. - Must be of range [0, infinity). - - random_state : int, RandomState instance or None, default=None - Used when the 'arpack' or 'randomized' solvers are used. Pass an int - for reproducible results across multiple function calls. - - whiten : boolean, deprecated + Number of iterations for the power method computed by svd_solver == + 'randomized'. Must be of range [0, infinity). + random_state : int, :class:`numpy.random.RandomState` instance or None, default=None + Used when the 'arpack' or 'randomized' solvers are used. Pass an int for + reproducible results across multiple function calls. + whiten : bool, deprecated Attributes ---------- - mixing: float, default=0.5 mixing parameter, as described in PCovR as :math:`{\alpha}` - tol: float, default=1e-12 Tolerance for singular values computed by svd_solver == 'arpack'. Must be of range [0.0, infinity).
- space: {'feature', 'sample', 'auto'}, default='auto' - whether to compute the PCovR in `sample` or `feature` space - default=`sample` when :math:`{n_{samples} < n_{features}}` and - `feature` when :math:`{n_{features} < n_{samples}}` - + whether to compute the PCovR in `sample` or `feature` space; default=`sample` + when :math:`{n_{samples} < n_{features}}` and `feature` when + :math:`{n_{features} < n_{samples}}` n_components_ : int - The estimated number of components, which equals the parameter - n_components, or the lesser value of n_features and n_samples - if n_components is None. - - pxt_ : ndarray of size :math:`({n_{samples}, n_{components}})` - the projector, or weights, from the input space :math:`\mathbf{X}` - to the latent-space projection :math:`\mathbf{T}` - + The estimated number of components, which equals the parameter n_components, or + the lesser value of n_features and n_samples if n_components is None. + pxt_ : numpy.ndarray of size :math:`({n_{samples}, n_{components}})` + the projector, or weights, from the input space :math:`\mathbf{X}` to the + latent-space projection :math:`\mathbf{T}` pty_ : ndarray of size :math:`({n_{components}, n_{properties}})` - the projector, or weights, from the latent-space projection - :math:`\mathbf{T}` to the properties :math:`\mathbf{Y}` - + the projector, or weights, from the latent-space projection :math:`\mathbf{T}` + to the properties :math:`\mathbf{Y}` pxy_ : ndarray of size :math:`({n_{samples}, n_{properties}})` - the projector, or weights, from the input space :math:`\mathbf{X}` - to the properties :math:`\mathbf{Y}` - + the projector, or weights, from the input space :math:`\mathbf{X}` to the + properties :math:`\mathbf{Y}` explained_variance_ : ndarray of shape (n_components,) The amount of variance explained by each of the selected components. Equal to n_components largest eigenvalues of the PCovR-modified covariance matrix of :math:`\mathbf{X}`. - singular_values_ : ndarray of shape (n_components,) The singular values corresponding to each of the selected components. @@ -193,7 +166,7 @@ class PCovR(_BasePCA, LinearModel): [-1.02805338, 1.06736871], [ 0.98166504, -4.98307078], [-2.9963189 , 1.98238856]]) - """ # NoQa: E501 + """ def __init__( self, @@ -220,40 +193,34 @@ def __init__( self.regressor = regressor def fit(self, X, Y, W=None): - r""" - - Fit the model with X and Y. Depending on the dimensions of X, - calls either `_fit_feature_space` or `_fit_sample_space` + r"""Fit the model with X and Y. Depending on the dimensions of X, calls either + `_fit_feature_space` or `_fit_sample_space` Parameters ---------- - X : ndarray, shape (n_samples, n_features) - Training data, where n_samples is the number of samples and - n_features is the number of features. + X : numpy.ndarray, shape (n_samples, n_features) + Training data, where n_samples is the number of samples and n_features is + the number of features. It is suggested that :math:`\mathbf{X}` be centered by its column- means and scaled. If features are related, the matrix should be scaled to have unit variance, otherwise :math:`\mathbf{X}` should be scaled so that each feature has a variance of 1 / n_features.
+ Y : numpy.ndarray, shape (n_samples, n_properties) + Training data, where n_samples is the number of samples and n_properties is + the number of properties - Y : ndarray, shape (n_samples, n_properties) - Training data, where n_samples is the number of samples and - n_properties is the number of properties - - It is suggested that :math:`\mathbf{X}` be centered by its column- - means and scaled. If features are related, the matrix should be scaled - to have unit variance, otherwise :math:`\mathbf{Y}` should be - scaled so that each feature has a variance of 1 / n_features. + It is suggested that :math:`\mathbf{Y}` be centered by its column-means and + scaled. If features are related, the matrix should be scaled to have unit + variance, otherwise :math:`\mathbf{Y}` should be scaled so that each feature + has a variance of 1 / n_features. If the passed regressor = `precomputed`, it is assumed that Y is the regressed form of the properties, :math:`{\mathbf{\hat{Y}}}`. - W : ndarray, shape (n_features, n_properties) + W : numpy.ndarray, shape (n_features, n_properties) Regression weights, optional when regressor=`precomputed`. If not passed, it is assumed that `W = np.linalg.lstsq(X, Y, self.tol)[0]` - """ - X, Y = check_X_y(X, Y, y_numeric=True, multi_output=True) # saved for inverse transformations from the latent space, @@ -355,11 +322,9 @@ def fit(self, X, Y, W=None): return self def _fit_feature_space(self, X, Y, Yhat): - r""" - In feature-space PCovR, the projectors are determined by: + r"""In feature-space PCovR, the projectors are determined by: .. math:: - \mathbf{\tilde{C}} = \alpha \mathbf{X}^T \mathbf{X} + (1 - \alpha) \left(\left(\mathbf{X}^T \mathbf{X}\right)^{-\frac{1}{2}} \mathbf{X}^T \mathbf{\hat{Y}}\mathbf{\hat{Y}}^T \mathbf{X} \left(\mathbf{X}^T \mathbf{X}\right)^{-\frac{1}{2}}\right) where .. math:: - \mathbf{P}_{XT} = (\mathbf{X}^T \mathbf{X})^{-\frac{1}{2}} \mathbf{U}_\mathbf{\tilde{C}}^T \mathbf{\Lambda}_\mathbf{\tilde{C}}^{\frac{1}{2}} .. math:: - \mathbf{P}_{TX} = \mathbf{\Lambda}_\mathbf{\tilde{C}}^{-\frac{1}{2}} \mathbf{U}_\mathbf{\tilde{C}}^T (\mathbf{X}^T \mathbf{X})^{\frac{1}{2}} .. math:: - \mathbf{P}_{TY} = \mathbf{\Lambda}_\mathbf{\tilde{C}}^{-\frac{1}{2}} \mathbf{U}_\mathbf{\tilde{C}}^T (\mathbf{X}^T \mathbf{X})^{-\frac{1}{2}} \mathbf{X}^T \mathbf{Y} - """ - Ct, iCsqrt = pcovr_covariance( mixing=self.mixing, X=X, @@ -426,35 +386,28 @@ self.pty_ = np.linalg.multi_dot([S_sqrt_inv, Vt, iCsqrt, X.T, Y]) def _fit_sample_space(self, X, Y, Yhat, W): - r""" - In sample-space PCovR, the projectors are determined by: + r"""In sample-space PCovR, the projectors are determined by: .. math:: - \mathbf{\tilde{K}} = \alpha \mathbf{X} \mathbf{X}^T + (1 - \alpha) \mathbf{\hat{Y}}\mathbf{\hat{Y}}^T where .. math:: - \mathbf{P}_{XT} = \left(\alpha \mathbf{X}^T + (1 - \alpha) \mathbf{W} \mathbf{\hat{Y}}^T\right) \mathbf{U}_\mathbf{\tilde{K}} \mathbf{\Lambda}_\mathbf{\tilde{K}}^{-\frac{1}{2}} .. math:: - \mathbf{P}_{TX} = \mathbf{\Lambda}_\mathbf{\tilde{K}}^{-\frac{1}{2}} \mathbf{U}_\mathbf{\tilde{K}}^T \mathbf{X} .. math:: - \mathbf{P}_{TY} = \mathbf{\Lambda}_\mathbf{\tilde{K}}^{-\frac{1}{2}} \mathbf{U}_\mathbf{\tilde{K}}^T \mathbf{Y} - """ - Kt = pcovr_kernel(mixing=self.mixing, X=X, Y=Yhat) if self.fit_svd_solver_ == "full": @@ -598,11 +551,9 @@ def inverse_transform(self, T): r"""Transform data back to its original space. ..
math:: - \mathbf{\hat{X}} = \mathbf{T} \mathbf{P}_{TX} = \mathbf{X} \mathbf{P}_{XT} \mathbf{P}_{TX} - Parameters ---------- T : ndarray, shape (n_samples, n_components) @@ -613,7 +564,6 @@ def inverse_transform(self, T): ------- X_original ndarray, shape (n_samples, n_features) """ - if np.max(np.abs(self.mean_)) > self.tol: warnings.warn( "This class does not automatically un-center data, and your data mean " @@ -625,8 +575,7 @@ def inverse_transform(self, T): return T @ self.ptx_ def predict(self, X=None, T=None): - """Predicts the property values using regression on X or T""" - + """Predicts the property values using regression on X or T.""" check_is_fitted(self, ["pxy_", "pty_"]) if X is None and T is None: @@ -640,20 +589,17 @@ def predict(self, X=None, T=None): return T @ self.pty_ def transform(self, X=None): - """ - Apply dimensionality reduction to X. + """Apply dimensionality reduction to X. - X is projected on the first principal components as determined by the + ``X`` is projected on the first principal components as determined by the modified PCovR distances. Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : numpy.ndarray, shape (n_samples, n_features) New data, where n_samples is the number of samples and n_features is the number of features. - """ - check_is_fitted(self, ["pxt_", "mean_"]) return super().transform(X) @@ -663,14 +609,12 @@ def score(self, X, Y, T=None): defined as: .. math:: - \ell_{X} = \frac{\lVert \mathbf{X} - \mathbf{T}\mathbf{P}_{TX} \rVert ^ 2} {\lVert \mathbf{X}\rVert ^ 2} and .. math:: - \ell_{Y} = \frac{\lVert \mathbf{Y} - \mathbf{T}\mathbf{P}_{TY} \rVert ^ 2} {\lVert \mathbf{Y}\rVert ^ 2} @@ -678,23 +622,19 @@ def score(self, X, Y, T=None): use in sklearn pipelines, e.g., a grid search, where methods named 'score' are meant to be maximized. - Parameters ---------- - X : ndarray of shape (n_samples, n_features) + X : numpy.ndarray of shape (n_samples, n_features) The data. - - Y : ndarray of shape (n_samples, n_properties) + Y : numpy.ndarray of shape (n_samples, n_properties) The target. Returns ------- loss : float - Negative sum of the loss in reconstructing X from the latent-space - projection T and the loss in predicting Y from the latent-space - projection T + Negative sum of the loss in reconstructing X from the latent-space + projection T and the loss in predicting Y from the latent-space projection T """ - if T is None: T = self.transform(X) diff --git a/src/skmatter/feature_selection/_base.py b/src/skmatter/feature_selection/_base.py index 4971f853d..0394faae7 100644 --- a/src/skmatter/feature_selection/_base.py +++ b/src/skmatter/feature_selection/_base.py @@ -1,61 +1,47 @@ -""" -Sequential feature selection -""" +"""Sequential feature selection.""" from .._selection import _CUR, _FPS, _PCovCUR, _PCovFPS class FPS(_FPS): - """ - Transformer that performs Greedy Feature Selection using Farthest Point Sampling. + """Transformer performing Greedy Feature Selection using Farthest Point Sampling. Parameters ---------- - initialize: int, list of int, or 'random', default=0 - Index of the first selection(s). If 'random', picks a random - value when fit starts. Stored in :py:attr:`self.initialize`. - + Index of the first selection(s). If 'random', picks a random value when fit + starts. Stored in :py:attr:`self.initialize`. n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the features are - selected. If integer, the parameter is the absolute number of selections - to make. 
If float between 0 and 1, it is the fraction of the total dataset to
- select. Stored in :py:attr:`self.n_to_select`.
-
+ The number of selections to make. If `None`, half of the features are selected.
+ If integer, the parameter is the absolute number of selections to make. If float
+ between 0 and 1, it is the fraction of the total dataset to select. Stored in
+ :py:attr:`self.n_to_select`.
score_threshold : float, default=None
Threshold for the score. If `None` selection will continue until the
n_to_select is chosen. Otherwise will stop when the score falls below the
threshold. Stored in :py:attr:`self.score_threshold`.
-
score_threshold_type : str, default="absolute"
- How to interpret the ``score_threshold``. When "absolute", the score used by
- the selector is compared to the threshold directly. When "relative", at each
+ How to interpret the ``score_threshold``. When "absolute", the score used by the
+ selector is compared to the threshold directly. When "relative", at each
iteration, the score used by the selector is compared proportionally to the
- score of the first selection, i.e. the selector quits when
- ``current_score / first_score < threshold``. Stored in
- :py:attr:`self.score_threshold_type`.
-
+ score of the first selection, i.e. the selector quits when ``current_score /
+ first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`.
progress_bar: bool, default=False
- option to use `tqdm `_ progress bar to monitor
- selections. Stored in :py:attr:`self.report_progress`.
-
+ option to use `tqdm `_ progress bar to monitor
+ selections. Stored in :py:attr:`self.report_progress`.
full : bool, default=False
In the case that all non-redundant selections are exhausted, choose randomly
from the remaining features. Stored in :py:attr:`self.full`.
-
- random_state: int or RandomState instance, default=0
+ random_state: int or :class:`numpy.random.RandomState` instance, default=0

Attributes
----------
-
n_selected_ : int
- Counter tracking the number of selections that have been made
-
+ Counter tracking the number of selections that have been made
X_selected_ : ndarray,
- Matrix containing the selected features, for use in fitting
-
+ Matrix containing the selected features, for use in fitting
selected_idx_ : ndarray
- indices of selected samples
+ indices of selected features

Examples
--------
@@ -65,7 +51,7 @@ class FPS(_FPS):
... n_to_select=2,
... # int or 'random', default=0
... # Index of the first selection.
- ... # If ‘random’, picks a random value when fit starts.
+ ... # If "random", picks a random value when fit starts.
... initialize=0,
... )
>>> X = np.array(
@@ -105,56 +91,44 @@ def __init__(

class PCovFPS(_PCovFPS):
- """Transformer that performs Greedy Feature Selection using PCovR-weighted
+ r"""Transformer that performs Greedy Feature Selection using PCovR-weighted
Farthest Point Sampling.

Parameters
----------
-
mixing: float, default=0.5
- The PCovR mixing parameter, as described in PCovR as
- :math:`{\\alpha}`
-
+ The PCovR mixing parameter, as described in PCovR as :math:`{\alpha}`
initialize: int or 'random', default=0
- Index of the first selection. If 'random', picks a random
- value when fit starts.
-
+ Index of the first selection. If 'random', picks a random value when fit starts.
n_to_select : int or float, default=None
- The number of selections to make. If `None`, half of the features are
- selected. If integer, the parameter is the absolute number of selections
- to make.
If float between 0 and 1, it is the fraction of the total dataset to - select. Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the features are selected. + If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None - Threshold for the score. If `None` selection will continue until the - n_to_select is chosen. Otherwise will stop when the score falls below the - threshold. Stored in :py:attr:`self.score_threshold`. - + Threshold for the score. If `None` selection will continue until the n_to_select + is chosen. Otherwise will stop when the score falls below the threshold. Stored + in :py:attr:`self.score_threshold`. score_threshold_type : str, default="absolute" - How to interpret the ``score_threshold``. When "absolute", the score used by - the selector is compared to the threshold directly. When "relative", at each + How to interpret the ``score_threshold``. When "absolute", the score used by the + selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the - score of the first selection, i.e. the selector quits when - ``current_score / first_score < threshold``. Stored in - :py:attr:`self.score_threshold_type`. - + score of the first selection, i.e. the selector quits when ``current_score / + first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. progress_bar: bool, default=False - option to use `tqdm `_ progress bar to monitor - selections. Stored in :py:attr:`self.report_progress`. - + option to use `tqdm `_ progress bar to monitor + selections. Stored in :py:attr:`self.report_progress`. full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining features. Stored in :py:attr:`self.full`. - - random_state: int or RandomState instance, default=0 + random_state: int or :class:`numpy.random.RandomState` instance, default=0 Attributes ---------- - n_selected_ : int - Counter tracking the number of selections that have been made - X_selected_ : ndarray, - Matrix containing the selected features, for use in fitting + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray, + Matrix containing the selected features, for use in fitting Examples -------- @@ -214,26 +188,21 @@ class CUR(_CUR): Parameters ---------- recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - + number of steps after which to recompute the pi score + defaults to 1, if 0 no re-computation is done k : int - number of eigenvectors to compute the importance score with, defaults to 1 - + number of eigenvectors to compute the importance score with, defaults to ``1`` tolerance: float - threshold below which scores will be considered 0, defaults to 1E-12 - + threshold below which scores will be considered 0, defaults to ``1e-12`` n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the features are - selected. If integer, the parameter is the absolute number of selections - to make. If float between 0 and 1, it is the fraction of the total dataset to - select. Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the features are selected. 
+ If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None Threshold for the score. If `None` selection will continue until the n_to_select is chosen. Otherwise will stop when the score falls below the threshold. Stored in :py:attr:`self.score_threshold`. - score_threshold_type : str, default="absolute" How to interpret the ``score_threshold``. When "absolute", the score used by the selector is compared to the threshold directly. When "relative", at each @@ -241,35 +210,26 @@ class CUR(_CUR): score of the first selection, i.e. the selector quits when ``current_score / first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. - progress_bar: bool, default=False option to use `tqdm `_ progress bar to monitor selections. Stored in :py:attr:`self.report_progress`. - full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining features. Stored in :py:attr:`self.full`. - random_state: int or RandomState instance, default=0 - Attributes ---------- - - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - + X_current_ : numpy.ndarray (n_samples, n_features) + The original matrix orthogonalized by previous selections n_selected_ : int - Counter tracking the number of selections that have been made - - X_selected_ : ndarray, - Matrix containing the selected features, for use in fitting - - pi_ : ndarray (n_features), - the importance score see :func:`_compute_pi` - - selected_idx_ : ndarray - indices of selected features + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray + Matrix containing the selected features, for use in fitting + pi_ : numpy.ndarray (n_features), + the importance score see :func:`_compute_pi` + selected_idx_ : numpy.ndarray + indices of selected features Examples -------- @@ -321,71 +281,56 @@ def __init__( class PCovCUR(_PCovCUR): - """Transformer that performs Greedy Feature Selection by choosing features - which maximize the importance score :math:`\\pi`, which is the sum over + r"""Transformer that performs Greedy Feature Selection by choosing features + which maximize the importance score :math:`\pi`, which is the sum over the squares of the first :math:`k` components of the PCovR-modified right singular vectors. Parameters ---------- recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - + number of steps after which to recompute the pi score defaults to 1, if 0 no + re-computation is done k : int number of eigenvectors to compute the importance score with, defaults to 1 - tolerance: float - threshold below which scores will be considered 0, defaults to 1E-12 - + threshold below which scores will be considered 0, defaults to ``1e-12`` mixing: float, default=0.5 - The PCovR mixing parameter, as described in PCovR as - :math:`{\\alpha}`. Stored in :py:attr:`self.mixing`. - + The PCovR mixing parameter, as described in PCovR as + :math:`{\alpha}`. Stored in :py:attr:`self.mixing`. n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the features are - selected. If integer, the parameter is the absolute number of selections - to make. If float between 0 and 1, it is the fraction of the total dataset to - select. 
Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the features are selected. + If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None Threshold for the score. If `None` selection will continue until the n_to_select is chosen. Otherwise will stop when the score falls below the threshold. Stored in :py:attr:`self.score_threshold`. - score_threshold_type : str, default="absolute" How to interpret the ``score_threshold``. When "absolute", the score used by the selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the - score of the first selection, i.e. the selector quits when - ``current_score / first_score < threshold``. Stored in - :py:attr:`self.score_threshold_type`. - + score of the first selection, i.e. the selector quits when ``current_score / + first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. progress_bar: bool, default=False - option to use `tqdm `_ progress bar to monitor - selections. Stored in :py:attr:`self.report_progress`. - + option to use `tqdm `_ progress bar to monitor + selections. Stored in :py:attr:`self.report_progress`. full : bool, default=False - In the case that all non-redundant selections are exhausted, choose - randomly from the remaining features. Stored in :py:attr:`self.full`. - - random_state: int or RandomState instance, default=0 + In the case that all non-redundant selections are exhausted, choose randomly + from the remaining features. Stored in :py:attr:`self.full`. + random_state: int or :class:`numpy.random.RandomState` instance, default=0 Attributes ---------- - - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - - y_current_ : ndarray (n_samples, n_properties) - The targets orthogonalized by a regression on - the previous selections. - + X_current_ : numpy.ndarray (n_samples, n_features) + The original matrix orthogonalized by previous selections + y_current_ : numpy.ndarray (n_samples, n_properties) + The targets orthogonalized by a regression on the previous selections. n_selected_ : int - Counter tracking the number of selections that have been made - - X_selected_ : ndarray, - Matrix containing the selected features, for use in fitting + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray, + Matrix containing the selected features, for use in fitting Examples -------- diff --git a/src/skmatter/linear_model/__init__.py b/src/skmatter/linear_model/__init__.py index 9fb0613a1..dc0117a26 100644 --- a/src/skmatter/linear_model/__init__.py +++ b/src/skmatter/linear_model/__init__.py @@ -1,3 +1,5 @@ +"""Classes for building linear models.""" + from ._base import OrthogonalRegression from ._ridge import Ridge2FoldCV diff --git a/src/skmatter/linear_model/_base.py b/src/skmatter/linear_model/_base.py index 800cf67f4..6d57f795f 100644 --- a/src/skmatter/linear_model/_base.py +++ b/src/skmatter/linear_model/_base.py @@ -54,15 +54,13 @@ def fit(self, X, y): """ Parameters ---------- - X : ndarray of shape (n_samples, n_features) - Training data, where n_samples is the number of samples - and n_features is the number of features. 
- - y : ndarray of shape (n_samples, n_targets) - Training data, where n_samples is the number of samples - and n_targets is the number of target properties. + X : numpy.ndarray of shape (n_samples, n_features) + Training data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + y : numpy.ndarray of shape (n_samples, n_targets) + Training data, where ``n_samples`` is the number of samples and + ``n_targets`` is the number of target properties. """ - X, y = check_X_y( X, y, diff --git a/src/skmatter/linear_model/_ridge.py b/src/skmatter/linear_model/_ridge.py index b50356f73..eadc9a914 100644 --- a/src/skmatter/linear_model/_ridge.py +++ b/src/skmatter/linear_model/_ridge.py @@ -192,11 +192,10 @@ def predict(self, X): """ Parameters ---------- - X : ndarray of shape (n_samples, n_features) + X : numpy.ndarray of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. """ - X = check_array(X) check_is_fitted(self, ["coef_"]) diff --git a/src/skmatter/metrics/__init__.py b/src/skmatter/metrics/__init__.py index 70dff5c42..16cfe8f04 100644 --- a/src/skmatter/metrics/__init__.py +++ b/src/skmatter/metrics/__init__.py @@ -1,6 +1,5 @@ -""" -This module contains a set of metrics that can be used for an enhanced -understanding of your machine learning model. +"""Set of metrics that can be used for an enhanced understanding of your machine +learning model. First are the easily-interpretable error measures of the relative information capacity of feature space `F` with respect to feature space `F'`. The methods diff --git a/src/skmatter/metrics/_prediction_rigidities.py b/src/skmatter/metrics/_prediction_rigidities.py index efed6e233..19446ccc3 100644 --- a/src/skmatter/metrics/_prediction_rigidities.py +++ b/src/skmatter/metrics/_prediction_rigidities.py @@ -3,52 +3,47 @@ def local_prediction_rigidity(X_train, X_test, alpha): r"""Computes the local prediction rigidity (LPR) of a linear or kernel model - trained on a training dataset provided as input, on the local environments - in the test set provided as a separate input. LPR is defined as follows: + trained on a training dataset provided as input, on the local environments in the + test set provided as a separate input. LPR is defined as follows: .. math:: LPR_{i} = \frac{1}{X_i (X^{T} X + \lambda I)^{-1} X_i^{T}} - The function assumes that the model training is undertaken in a manner where - the global prediction targets are averaged over the number of atoms - appearing in each training structure, and the average feature vector of each - structure is hence used in the regression. This ensures that (1) - regularization strength across structures with different number of atoms is - kept constant per structure during model training, and (2) range of - resulting LPR values are loosely kept between 0 and 1 for the ease of - interpretation. This requires the user to provide the regularizer value that - results from such training procedure. To guarantee valid comparison in the - LPR across different models, feature vectors are scaled by a global factor - based on standard deviation across atomic envs. - - If the model is a kernel model, K_train and K_test can be provided in lieu - of X_train and X_test, alnog with the appropriate regularizer for the - trained model. 
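The LPR defined above can be pictured directly in numpy. The following is a minimal
sketch of the formula only, with made-up data, and it deliberately omits the
per-structure averaging and the global feature scaling described here:

>>> import numpy as np
>>> X = np.random.RandomState(0).normal(size=(20, 5))  # stacked training features
>>> x_i = np.random.RandomState(1).normal(size=5)  # one test environment
>>> alpha = 1e-8  # regularizer the model was trained with
>>> cov_inv = np.linalg.inv(X.T @ X + alpha * np.eye(5))
>>> lpr_i = 1.0 / (x_i @ cov_inv @ x_i)  # LPR of this environment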
+ The function assumes that the model training is undertaken in a manner where the
+ global prediction targets are averaged over the number of atoms appearing in each
+ training structure, and the average feature vector of each structure is hence used
+ in the regression. This ensures that (1) regularization strength across structures
+ with different numbers of atoms is kept constant per structure during model training,
+ and (2) the range of resulting LPR values is loosely kept between 0 and 1 for the
+ ease of interpretation. This requires the user to provide the regularizer value that
+ results from such a training procedure. To guarantee valid comparison in the LPR
+ across different models, feature vectors are scaled by a global factor based on
+ standard deviation across atomic envs.
+
+ If the model is a kernel model, K_train and K_test can be provided in lieu of
+ ``X_train`` and ``X_test``, along with the appropriate regularizer for the trained
+ model.

Parameters
----------
- X_train : list of ndarray of shape (n_atoms, n_features)
+ X_train : list of numpy.ndarray of shape (n_atoms, n_features)
Training dataset where each training set structure is stored as a separate
ndarray.
-
- X_test : list of ndarray of shape (n_atoms, n_features)
+ X_test : list of numpy.ndarray of shape (n_atoms, n_features)
Test dataset where each training set structure is stored as a separate
ndarray.
-
alpha : float
Regularizer value that the linear/kernel model has been optimized to.

Returns
-------
- LPR : list of array of shape (n_atoms)
+ LPR : list of numpy.ndarray of shape (n_atoms)
Local prediction rigidity (LPR) of the test set structures. LPR is
separately stored for each test structure, and hence list length =
n_test_strucs.
rank_diff : int
integer value of the difference between cov matrix dimension and rank
-
"""
-
# initialize a StandardFlexibleScaler and fit to train set atom envs
X_atom = np.vstack(X_train)
sfactor = np.sqrt(np.mean(X_atom**2, axis=0).sum())
@@ -91,9 +86,9 @@ def local_prediction_rigidity(X_train, X_test, alpha):

def componentwise_prediction_rigidity(X_train, X_test, alpha, comp_dims):
r"""Computes the component-wise prediction rigidity (CPR) and the local CPR
- (LCPR) of a linear or kernel model trained on a training dataset provided as
- input, on the local environments in the test set provided as a separate
- input. CPR and LCPR are defined as follows:
+ (LCPR) of a linear or kernel model trained on a training dataset provided as input,
+ on the local environments in the test set provided as a separate input. CPR and LCPR
+ are defined as follows:

.. math::
CPR_{A,c} = \frac{1}{X_{A,c} (X^{T} X + \lambda I)^{-1} X_{A,c}^{T}}

@@ -102,53 +97,44 @@
LCPR_{i,c} = \frac{1}{X_{i,c} (X^{T} X + \lambda I)^{-1} X_{i,c}^{T}}

The function assumes that the feature vectors for the local environments and
- structures are built by concatenating the descriptors of different
- prediction components together. It also assumes, like the case of LPR, that
- model training is undertaken in a manner where the global prediction targets
- are averaged over the number of atoms appearing in each training structure,
- and the average feature vector of each structure is hence used in the
- regression. Likewise, to guarantee valid comparison in the (L)CPR across
- different models, feature vectors are scaled by a global factor based on
- standard deviation across atomic envs.
-
- If the model is a kernel model, K_train and K_test can be provided in lieu
- of X_train and X_test, alnog with the appropriate regularizer for the
- trained model. However, in computing the kernels, one must strictly keep the
- different components separate, and compute separate kernel blocks for
- different prediction components.
+ structures are built by concatenating the descriptors of different prediction
+ components together. It also assumes, like the case of LPR, that model training is
+ undertaken in a manner where the global prediction targets are averaged over the
+ number of atoms appearing in each training structure, and the average feature vector
+ of each structure is hence used in the regression. Likewise, to guarantee valid
+ comparison in the (L)CPR across different models, feature vectors are scaled by a
+ global factor based on standard deviation across atomic envs.
+
+ If the model is a kernel model, K_train and K_test can be provided in lieu of
+ X_train and X_test, along with the appropriate regularizer for the trained model.
+ However, in computing the kernels, one must strictly keep the different components
+ separate, and compute separate kernel blocks for different prediction components.

Parameters
----------
- X_train : list of ndarray of shape (n_atoms, n_features)
+ X_train : list of numpy.ndarray of shape (n_atoms, n_features)
Training dataset where each training set structure is stored as a separate
ndarray.
-
- X_test : list of ndarray of shape (n_atoms, n_features)
+ X_test : list of numpy.ndarray of shape (n_atoms, n_features)
Test dataset where each training set structure is stored as a separate
ndarray.
-
alpha : float
Regularizer value that the linear/kernel model has been optimized to.
-
- comp_dims : array of int values
+ comp_dims : numpy.ndarray of int values
Dimensions of the feature vectors pertaining to each prediction component.
-
Returns
-------
- CPR : ndarray of shape (n_test_strucs, n_comps)
- Component-wise prediction rigidity computed for each prediction
- component, pertaining to the entire test structure.
+ CPR : numpy.ndarray of shape (n_test_strucs, n_comps)
+ Component-wise prediction rigidity computed for each prediction component,
+ pertaining to the entire test structure.
LCPR : list of ndarrays of shape (n_atoms, n_comps)
- Local component-wise prediction rigidity of the test set structures.
Values are + separately stored for each test structure, and hence list length = n_test_strucs rank_diff : int value of the difference between cov matrix dimension and rank - """ - # initialize a StandardFlexibleScaler and fit to train set atom envs X_atom = np.vstack(X_train) sfactor = np.sqrt(np.mean(X_atom**2, axis=0).sum()) diff --git a/src/skmatter/model_selection/__init__.py b/src/skmatter/model_selection/__init__.py index 1f152ba00..242dd8151 100644 --- a/src/skmatter/model_selection/__init__.py +++ b/src/skmatter/model_selection/__init__.py @@ -1,3 +1,5 @@ +"""Functions for model selection.""" + from ._split import train_test_split __all__ = ["train_test_split"] diff --git a/src/skmatter/model_selection/_split.py b/src/skmatter/model_selection/_split.py index 6e1f2cbcf..36fabe7f3 100644 --- a/src/skmatter/model_selection/_split.py +++ b/src/skmatter/model_selection/_split.py @@ -4,41 +4,41 @@ def train_test_split(*arrays, **options): - """This is an extended version of the sklearn train test split supporting - overlapping train and test sets. + """Extended version of the sklearn train test split supporting overlapping train and + test sets. + See `sklearn.model_selection.train_test_split (external link) `_ . Parameters ---------- *arrays : sequence of indexables with same length / shape[0] - Allowed inputs are lists, numpy arrays, scipy-sparse - matrices or pandas dataframes. + Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas + dataframes. test_size : float or int, default=None - If float, should be between 0.0 and 1.0 and represent the proportion - of the dataset to include in the test split. If int, represents the - absolute number of test samples. If None, the value is set to the - complement of the train size. If ``train_size`` is also None, it will - be set to 0.25. + If float, should be between 0.0 and 1.0 and represent the proportion of the + dataset to include in the test split. If int, represents the absolute number of + test samples. If :obj:`None`, the value is set to the complement of the train + size. If ``train_size`` is also None, it will be set to 0.25. train_size : float or int, default=None - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - random_state : int or RandomState instance, default=None - Controls the shuffling applied to the data before applying the split. - Pass an int for reproducible output across multiple function calls. - See - `random state glossary from sklearn (external link) `_ + If float, should be between 0.0 and 1.0 and represent the proportion of the + dataset to include in the train split. If int, represents the absolute number of + train samples. If :obj:`None`, the value is automatically set to the complement + of the test size. + random_state : int or numpy.random.RandomState instance, default=None + Controls the shuffling applied to the data before applying the split. Pass an + int for reproducible output across multiple function calls. See `random state + glossary from sklearn (external link) + `_ shuffle : bool, default=True - Whether or not to shuffle the data before splitting. If shuffle=False - then stratify must be None. + Whether or not to shuffle the data before splitting. If shuffle=False then + stratify must be :obj:`None`. 
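A small hypothetical call combining these options (the remaining parameters are
described below; data values here are arbitrary). With ``train_test_overlap=True``
the two splits may share samples, which is why a train and test fraction summing to
more than one is accepted:

>>> import numpy as np
>>> from skmatter.model_selection import train_test_split
>>> X = np.arange(20).reshape(10, 2)
>>> X_train, X_test = train_test_split(
...     X, train_size=0.8, test_size=0.5, shuffle=True, train_test_overlap=True
... )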
stratify : array-like, default=None - If not None, data is split in a stratified fashion, using this as - the class labels. + If not :obj:`None`, data is split in a stratified fashion, using this as the + class labels. train_test_overlap : bool, default=False - If True, and train and test set are both not None, the train and test - set may overlap. + If :obj:`True`, and train and test set are both not :obj:`None`, the train and + test set may overlap. Returns ------- diff --git a/src/skmatter/preprocessing/__init__.py b/src/skmatter/preprocessing/__init__.py index b81735a39..46fda830b 100644 --- a/src/skmatter/preprocessing/__init__.py +++ b/src/skmatter/preprocessing/__init__.py @@ -1,4 +1,4 @@ -"""This module includes scaling, centering and normalization methods.""" +"""Scaling, centering and normalization methods.""" from ._data import ( KernelNormalizer, diff --git a/src/skmatter/preprocessing/_data.py b/src/skmatter/preprocessing/_data.py index 07160dea4..3ff563fac 100644 --- a/src/skmatter/preprocessing/_data.py +++ b/src/skmatter/preprocessing/_data.py @@ -117,13 +117,11 @@ def fit(self, X, y=None, sample_weight=None): Parameters ---------- - X : ndarray of shape (n_samples, n_features) + X : numpy.ndarray of shape (n_samples, n_features) The data used to compute the mean and standard deviation used for later scaling along the features axis. - y: None Ignored. - sample_weight: ndarray of shape (n_samples,) Weights for each sample. Sample weighting can be used to center (and scale) data using a weighted mean. Weights are internally @@ -134,7 +132,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted scaler. """ - X = self._validate_data( X, copy=self.copy, @@ -177,10 +174,8 @@ def transform(self, X, y=None, copy=None): ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The data used to scale along the features axis. - y: None Ignored. - copy : bool, default=None Copy the input X or not. @@ -189,7 +184,6 @@ def transform(self, X, y=None, copy=None): X : {array-like, sparse matrix} of shape (n_samples, n_features) Transformed array. """ - copy = copy if copy is not None else self.copy X = self._validate_data( X, @@ -207,18 +201,17 @@ def transform(self, X, y=None, copy=None): return (X - self.mean_) / self.scale_ def inverse_transform(self, X_tr): - """Scale back the data to the original representation + """Scale back the data to the original representation. Parameters ---------- - X_tr : ndarray of shape (n_samples, n_features) + X_tr : numpy.ndarray of shape (n_samples, n_features) Transformed matrix Returns ------- X : original matrix """ - check_is_fitted( self, attributes=["n_samples_in_", "n_features_in_", "scale_", "mean_"] ) @@ -229,37 +222,33 @@ def inverse_transform(self, X_tr): class KernelNormalizer(KernelCenterer): - """Kernel centering method, similar to KernelCenterer, + r"""Kernel centering method, similar to KernelCenterer, but with additional scaling and ability to pass a set of sample weights. - Let :math:`K(x, z)` be a kernel defined by :math:`\\phi(x)^T \\phi(z)`, - where :math:`\\phi` is a function mapping x to a Hilbert space. + Let :math:`K(x, z)` be a kernel defined by :math:`\phi(x)^T \phi(z)`, + where :math:`\phi` is a function mapping x to a Hilbert space. KernelNormalizer centers (i.e., normalize to have zero mean) the data without - explicitly computing :math:`\\phi(x)`. + explicitly computing :math:`\phi(x)`. 
It is equivalent to centering and scaling :math:`\\phi(x)` with sklearn.preprocessing.StandardScaler(with_std=False). Parameters - --------- + ---------- with_center: bool, default=True If True, center the kernel matrix before scaling. If False, do not center the kernel - with_trace: bool, default=True If True, scale the kernel so that the trace is equal to the number of samples. If False, do not scale the kernel Attributes ---------- - K_fit_rows_ : ndarray of shape (n_samples,) + K_fit_rows_ : numpy.ndarray of shape (n_samples,) Average of each column of kernel matrix. - K_fit_all_ : float Average of kernel matrix. - sample_weight_ : float Sample weights (if provided during the fit) - scale_ : float Scaling parameter used when 'with_trace'=True Calculated as np.trace(K) / K.shape[0] @@ -299,23 +288,20 @@ def fit(self, K, y=None, sample_weight=None): Parameters ---------- - K : ndarray of shape (n_samples, n_samples) + K : numpy.ndarray of shape (n_samples, n_samples) Kernel matrix. - y : None Ignored. - - sample_weight: ndarray of shape (n_samples,), default=None - Weights for each sample. Sample weighting can be used to center (and - scale) data using a weighted mean. Weights are internally normalized - before preprocessing. + sample_weight: numpy.ndarray of shape (n_samples,), default=None + Weights for each sample. Sample weighting can be used to center (and scale) + data using a weighted mean. Weights are internally normalized before + preprocessing. Returns ------- self : object Fitted transformer. """ - K = self._validate_data(K, copy=True, dtype=FLOAT_DTYPES, reset=False) if sample_weight is not None: @@ -357,18 +343,16 @@ def transform(self, K, copy=True): Parameters ---------- - K : ndarray of shape (n_samples1, n_samples2) + K : numpy.ndarray of shape (n_samples1, n_samples2) Kernel matrix. - copy : bool, default=True Set to False to perform inplace computation. Returns ------- - K_new : ndarray of shape (n_samples1, n_samples2) + K_new : numpy.ndarray of shape (n_samples1, n_samples2) Transformed array """ - check_is_fitted(self) K = self._validate_data(K, copy=copy, dtype=FLOAT_DTYPES, reset=False) @@ -416,7 +400,7 @@ def fit_transform(self, K, y=None, sample_weight=None, copy=True, **fit_params): class SparseKernelCenterer(TransformerMixin): r"""Kernel centering method for sparse kernels, similar to - KernelFlexibleCenterer. + :class:`KernelFlexibleCenterer`. The main disadvantage of kernel methods, which is widely used in machine learning it is that they quickly grow in time and space complexity with the @@ -437,16 +421,14 @@ class SparseKernelCenterer(TransformerMixin): is possible to get a $N/M$ times improvement in the asymptotic by memory. Parameters - --------- + ---------- with_center: bool, default=True If True, center the kernel matrix before scaling. If False, do not center the kernel - - with_trace: bool, default=True + with_trace : bool, default=True If True, scale the kernel so that the trace is equal to the number of samples. If False, do not scale the kernel - - rcond: float, default 1E-12 + rcond : float, default 1E-12 conditioning parameter to use when computing the Nystrom-approximated kernel for scaling @@ -454,43 +436,34 @@ class SparseKernelCenterer(TransformerMixin): ---------- K_fit_rows_ : ndarray of shape (n_samples,) Average of each column of kernel matrix. - K_fit_all_ : float Average of kernel matrix. 
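A hypothetical usage sketch of this sparse centering, with made-up kernels built
from random features; the import path is an assumption, matching the other
preprocessing classes in this module:

>>> import numpy as np
>>> from skmatter.preprocessing import SparseKernelCenterer
>>> Phi = np.random.RandomState(0).normal(size=(8, 3))  # features, full set
>>> Phi_m = Phi[:4]  # active set of M = 4 points
>>> Knm, Kmm = Phi @ Phi_m.T, Phi_m @ Phi_m.T
>>> centerer = SparseKernelCenterer().fit(Knm, Kmm)
>>> Knm_centered = centerer.transform(Knm)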
-
sample_weight_ : float
Sample weights (if provided during the fit)
-
scale_ : float
Scaling parameter used when 'with_trace'=True
Calculated as np.trace(K) / K.shape[0]
-
n_active_: int
size of active set
"""

def __init__(self, with_center=True, with_trace=True, rcond=1e-12):
- """Initialize SparseKernelCenterer."""
-
self.with_center = with_center
self.with_trace = with_trace
self.rcond = rcond

def fit(self, Knm, Kmm, y=None, sample_weight=None):
- """Fit KernelFlexibleCenterer
+ """Fit ``SparseKernelCenterer``.

Parameters
- ---------
- Knm: ndarray of shape (n_samples, n_active)
+ ----------
+ Knm : numpy.ndarray of shape (n_samples, n_active)
Kernel matrix between the reference data set and the active set
-
- Kmm: ndarray of shape (n_active, n_active)
+ Kmm : numpy.ndarray of shape (n_active, n_active)
Kernel matrix between the active set and itself
-
y : None
Ignored.
-
- sample_weight: ndarray of shape (n_samples,), default=None
+ sample_weight: numpy.ndarray of shape (n_samples,), default=None
Weights for each sample. Sample weighting can be used to center (and scale)
data using a weighted mean. Weights are internally normalized before
preprocessing.
@@ -500,7 +473,6 @@ def fit(self, Knm, Kmm, y=None, sample_weight=None):
self : object
Fitted transformer.
"""
-
if Knm.shape[1] != Kmm.shape[0]:
raise ValueError(
"The reference kernel is not commensurate shape with the "
@@ -536,16 +508,15 @@ def fit(self, Knm, Kmm, y=None, sample_weight=None):

def transform(self, Knm, y=None):
"""Centering our Kernel. Previously you should fit data.

Parameters
- ---------
- Knm: ndarray of shape (n_samples, n_active)
+ ----------
+ Knm: numpy.ndarray of shape (n_samples, n_active)
Kernel matrix between the reference data set and the active set
-
y : None
Ignored.

Returns
-------
- K_new : ndarray of shape (n_samples, n_active)
+ K_new : numpy.ndarray of shape (n_samples, n_active)
Transformed array
"""
check_is_fitted(self, attributes=["scale_", "K_fit_rows_", "n_active_"])
@@ -563,7 +534,7 @@ def transform(self, Knm, y=None):

def fit_transform(self, Knm, Kmm, y=None, sample_weight=None, **fit_params):
r"""Fit to data, then transform it.

Parameters
- ---------
+ ----------
Knm: ndarray of shape (n_samples, n_active)
Kernel matrix between the reference data set and the active set

diff --git a/src/skmatter/sample_selection/_base.py b/src/skmatter/sample_selection/_base.py
index 6026bce7b..f93dcfad0 100644
--- a/src/skmatter/sample_selection/_base.py
+++ b/src/skmatter/sample_selection/_base.py
@@ -1,6 +1,4 @@
-"""
-Sequential sample selection
-"""
+"""Sequential sample selection."""

import warnings

@@ -24,13 +22,11 @@ def _linear_interpolator(points, values):
values : ndarray of float or complex, shape (n,)
Data values.
-
Reference:
---------
The code is an adapted excerpt from
https://github.com/scipy/scipy/blob/dde50595862a4f9cede24b5d1c86935c30f1f88a/scipy/interpolate/_ndgriddata.py#L119-L273
""" # NoQa: E501
-
points = _ndim_coords_from_arrays(points)

if points.ndim < 2:
@@ -52,60 +48,47 @@

class FPS(_FPS):
- """
- Transformer that performs Greedy Sample Selection using Farthest Point Sampling.
+ """Transformer performing Greedy Sample Selection using Farthest Point Sampling.

Parameters
----------
-
initialize: int, list of int, or 'random', default=0
- Index of the first selection(s). If 'random', picks a random
- value when fit starts. Stored in :py:attr:`self.initialize`.
-
+ Index of the first selection(s). If 'random', picks a random value when fit
+ starts. Stored in :py:attr:`self.initialize`.
n_to_select : int or float, default=None
- The number of selections to make. If `None`, half of the samples are
- selected. If integer, the parameter is the absolute number of selections
- to make. If float between 0 and 1, it is the fraction of the total dataset to
- select. Stored in :py:attr:`self.n_to_select`.
-
+ The number of selections to make. If `None`, half of the samples are selected.
+ If integer, the parameter is the absolute number of selections to make. If float
+ between 0 and 1, it is the fraction of the total dataset to select. Stored in
+ :py:attr:`self.n_to_select`.
score_threshold : float, default=None
- Threshold for the score. If `None` selection will continue until the
- n_to_select is chosen. Otherwise will stop when the score falls below the
- threshold. Stored in :py:attr:`self.score_threshold`.
-
+ Threshold for the score. If `None` selection will continue until the n_to_select
+ is chosen. Otherwise will stop when the score falls below the threshold. Stored
+ in :py:attr:`self.score_threshold`.
score_threshold_type : str, default="absolute"
How to interpret the ``score_threshold``. When "absolute", the score used by
the selector is compared to the threshold directly. When "relative", at each
iteration, the score used by the selector is compared proportionally to the
- score of the first selection, i.e. the selector quits when
- ``current_score / first_score < threshold``. Stored in
- :py:attr:`self.score_threshold_type`.
-
+ score of the first selection, i.e. the selector quits when ``current_score /
+ first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`.
progress_bar: bool, default=False
- option to use `tqdm `_ progress bar to monitor
- selections. Stored in :py:attr:`self.report_progress`.
-
+ option to use `tqdm `_ progress bar to monitor
+ selections. Stored in :py:attr:`self.report_progress`.
full : bool, default=False
- In the case that all non-redundant selections are exhausted, choose
- randomly from the remaining samples. Stored in :py:attr:`self.full`.
-
- random_state: int or RandomState instance, default=0
+ In the case that all non-redundant selections are exhausted, choose randomly
+ from the remaining samples. Stored in :py:attr:`self.full`.
+ random_state: int or numpy.random.RandomState instance, default=0

Attributes
----------
-
n_selected_ : int
- Counter tracking the number of selections that have been made
-
- X_selected_ : ndarray,
- Matrix containing the selected samples, for use in fitting
-
- y_selected_ : ndarray,
- In sample selection, the matrix containing the selected targets, for
- use in fitting
-
- selected_idx_ : ndarray
- indices of selected samples
+ Counter tracking the number of selections that have been made
+ X_selected_ : numpy.ndarray,
+ Matrix containing the selected samples, for use in fitting
+ y_selected_ : numpy.ndarray,
+ In sample selection, the matrix containing the selected targets, for
+ use in fitting.
+ selected_idx_ : numpy.ndarray
+ indices of selected samples

Examples
--------
@@ -115,7 +98,7 @@ class FPS(_FPS):
... n_to_select=2,
... # int or 'random', default=0
... # Index of the first selection.
- ... # If ‘random’, picks a random value when fit starts.
+ ... # If "random", picks a random value when fit starts.
... initialize=0,
... )
>>> X = np.array(
@@ -154,64 +137,51 @@ def __init__(

class PCovFPS(_PCovFPS):
- """Transformer that performs Greedy Sample Selection using PCovR-weighted
- Farthest Point Sampling.
+ r"""Transformer performing Greedy Sample Selection using PCovR-weighted Farthest + Point Sampling. Parameters ---------- - mixing: float, default=0.5 - The PCovR mixing parameter, as described in PCovR as - :math:`{\\alpha}` - + The PCovR mixing parameter, as described in PCovR as + :math:`{\\alpha}` initialize: int or 'random', default=0 - Index of the first selection. If 'random', picks a random - value when fit starts. - + Index of the first selection. If 'random', picks a random value when fit starts. n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the samples are - selected. If integer, the parameter is the absolute number of selections - to make. If float between 0 and 1, it is the fraction of the total dataset to - select. Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the samples are selected. + If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None Threshold for the score. If `None` selection will continue until the n_to_select is chosen. Otherwise will stop when the score falls below the threshold. Stored in :py:attr:`self.score_threshold`. - score_threshold_type : str, default="absolute" How to interpret the ``score_threshold``. When "absolute", the score used by the selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the - score of the first selection, i.e. the selector quits when - ``current_score / first_score < threshold``. Stored in - :py:attr:`self.score_threshold_type`. - + score of the first selection, i.e. the selector quits when ``current_score / + first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. progress_bar: bool, default=False - option to use `tqdm `_ progress bar to monitor - selections. Stored in :py:attr:`self.report_progress`. - + option to use `tqdm `_ progress bar to monitor + selections. Stored in :py:attr:`self.report_progress`. full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining samples. Stored in :py:attr:`self.full`. - - random_state: int or RandomState instance, default=0 + random_state: int or numpy.random.RandomState instance, default=0 Attributes ---------- - n_selected_ : int - Counter tracking the number of selections that have been made - - X_selected_ : ndarray, - Matrix containing the selected samples, for use in fitting + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray, + Matrix containing the selected samples, for use in fitting - y_selected_ : ndarray, - In sample selection, the matrix containing the selected targets, for - use in fitting - - selected_idx_ : ndarray - indices of selected samples + y_selected_ : numpy.ndarray, + In sample selection, the matrix containing the selected targets, for use in + fitting + selected_idx_ : numpy.ndarray + indices of selected samples Examples -------- @@ -221,7 +191,7 @@ class PCovFPS(_PCovFPS): ... n_to_select=2, ... # int or 'random', default=0 ... # Index of the first selection. - ... # If ‘random’, picks a random value when fit starts. + ... # If "random", picks a random value when fit starts. ... initialize=0, ... 
) >>> X = np.array( @@ -264,71 +234,56 @@ def __init__( class CUR(_CUR): """Transformer that performs Greedy Sample Selection by choosing samples - which maximize the magnitude of the left singular vectors, consistent with - classic CUR matrix decomposition. + which maximize the magnitude of the left singular vectors, consistent with classic + CUR matrix decomposition. Parameters ---------- recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - + number of steps after which to recompute the pi score defaults to 1, if 0 no + re-computation is done k : int number of eigenvectors to compute the importance score with, defaults to 1 - tolerance: float threshold below which scores will be considered 0, defaults to 1E-12 - n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the samples are - selected. If integer, the parameter is the absolute number of selections - to make. If float between 0 and 1, it is the fraction of the total dataset to - select. Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the samples are selected. + If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None - Threshold for the score. If `None` selection will continue until the - n_to_select is chosen. Otherwise will stop when the score falls below the - threshold. Stored in :py:attr:`self.score_threshold`. - + Threshold for the score. If `None` selection will continue until the n_to_select + is chosen. Otherwise will stop when the score falls below the threshold. Stored + in :py:attr:`self.score_threshold`. score_threshold_type : str, default="absolute" - How to interpret the ``score_threshold``. When "absolute", the score used by - the selector is compared to the threshold directly. When "relative", at each + How to interpret the ``score_threshold``. When "absolute", the score used by the + selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the - score of the first selection, i.e. the selector quits when - ``current_score / first_score < threshold``. Stored in - :py:attr:`self.score_threshold_type`. - + score of the first selection, i.e. the selector quits when ``current_score / + first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. progress_bar: bool, default=False option to use `tqdm `_ progress bar to monitor selections. Stored in :py:attr:`self.report_progress`. - full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining samples. Stored in :py:attr:`self.full`. 
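The selection rule described above can be sketched in a few lines: each sample is
scored by the squared weight it carries in the first ``k`` left singular vectors.
This is a sketch of the classic CUR leverage score only, not of the library's
``_compute_pi``, which also re-orthogonalizes the data between selections:

>>> import numpy as np
>>> X = np.random.RandomState(0).normal(size=(10, 4))
>>> k = 1
>>> U, _, _ = np.linalg.svd(X, full_matrices=False)
>>> pi = (U[:, :k] ** 2).sum(axis=1)  # one leverage score per sample
>>> first_pick = int(np.argmax(pi))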
- - random_state: int or RandomState instance, default=0 + random_state: int or numpy.random.RandomState instance, default=0 Attributes ---------- - - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - + X_current_ : numpy.ndarray (n_samples, n_features) + The original matrix orthogonalized by previous selections n_selected_ : int - Counter tracking the number of selections that have been made - + Counter tracking the number of selections that have been made X_selected_ : ndarray, - Matrix containing the selected samples, for use in fitting - + Matrix containing the selected samples, for use in fitting y_selected_ : ndarray, - In sample selection, the matrix containing the selected targets, for - use in fitting - + In sample selection, the matrix containing the selected targets, for use in + fitting pi_ : ndarray (n_features), - the importance score see :func:`_compute_pi` - + the importance score see :func:`_compute_pi` selected_idx_ : ndarray - indices of selected features + indices of selected features Examples -------- @@ -391,33 +346,25 @@ class PCovCUR(_PCovCUR): Parameters ---------- - mixing: float, default=0.5 - The PCovR mixing parameter, as described in PCovR as - :math:`{\\alpha}`. Stored in :py:attr:`self.mixing`. - + The PCovR mixing parameter, as described in PCovR as :math:`{\\alpha}`. Stored + in :py:attr:`self.mixing`. recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - - + number of steps after which to recompute the pi score defaults to 1, if 0 no + re-computation is done k : int number of eigenvectors to compute the importance score with, defaults to 1 - tolerance: float - threshold below which scores will be considered 0, defaults to 1E-12 - + threshold below which scores will be considered 0, defaults to 1E-12 n_to_select : int or float, default=None The number of selections to make. If `None`, half of the samples are selected. If integer, the parameter is the absolute number of selections to make. If float between 0 and 1, it is the fraction of the total dataset to select. Stored in :py:attr:`self.n_to_select`. - score_threshold : float, default=None Threshold for the score. If `None` selection will continue until the n_to_select is chosen. Otherwise will stop when the score falls below the threshold. Stored in :py:attr:`self.score_threshold`. - score_threshold_type : str, default="absolute" How to interpret the ``score_threshold``. When "absolute", the score used by the selector is compared to the threshold directly. When "relative", at each @@ -425,43 +372,32 @@ class PCovCUR(_PCovCUR): score of the first selection, i.e. the selector quits when ``current_score / first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. - progress_bar: bool, default=False option to use `tqdm `_ progress bar to monitor selections. Stored in :py:attr:`self.report_progress`. - full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining samples. Stored in :py:attr:`self.full`. - - random_state: int or RandomState instance, default=0 + random_state: int or numpy.random.RandomState instance, default=0 Attributes ---------- - - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - - y_current_ : ndarray (n_samples, n_properties) - The targets orthogonalized by a regression on - the previous selections. 
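The target orthogonalization behind ``y_current_`` amounts to deflating the targets
by whatever a least-squares fit on the already-selected samples explains. A minimal
numpy sketch with made-up data, ignoring the tolerance handling of the real code:

>>> import numpy as np
>>> rng = np.random.RandomState(0)
>>> X, y = rng.normal(size=(10, 4)), rng.normal(size=(10, 2))
>>> sel = [3, 7]  # samples selected so far
>>> W = np.linalg.lstsq(X[sel], y[sel], rcond=None)[0]
>>> y_current = y - X @ W  # targets orthogonal to previous selections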
- + X_current_ : numpy.ndarray (n_samples, n_features) + The original matrix orthogonalized by previous selections + y_current_ : numpy.ndarray (n_samples, n_properties) + The targets orthogonalized by a regression on the previous selections. n_selected_ : int - Counter tracking the number of selections that have been made - - X_selected_ : ndarray, - Matrix containing the selected samples, for use in fitting - - y_selected_ : ndarray, - In sample selection, the matrix containing the selected targets, for - use in fitting - - pi_ : ndarray (n_features), - the importance score see :func:`_compute_pi` - - selected_idx_ : ndarray - indices of selected features + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray + Matrix containing the selected samples, for use in fitting + y_selected_ : numpy.ndarray, + In sample selection, the matrix containing the selected targets, for use in + fitting + pi_ : numpy.ndarray (n_features), + the importance score see :func:`_compute_pi` + selected_idx_ : numpy.ndarray + indices of selected features Examples -------- @@ -619,24 +555,22 @@ def __init__(self, low_dim_idx=None, tolerance=1e-12): self.tolerance = tolerance def fit(self, X, y): - """ - Learn the samples that form the convex hull. + """Learn the samples that form the convex hull. Parameters ---------- - X : ndarray of shape (n_samples, n_features) - Feature matrix of samples to use for constructing the convex - hull. - y : ndarray of shape (n_samples,) - Target values (property on which the convex hull should be - constructed, e.g. Gibbs free energy) + X : numpy.ndarray of shape (n_samples, n_features) + Feature matrix of samples to use for constructing the convex + hull. + y : numpy.ndarray of shape (n_samples,) + Target values (property on which the convex hull should be + constructed, e.g. Gibbs free energy) Returns ------- self : object Fitted scorer. """ - X, y = self._check_X_y(X, y) self.n_features_in_ = X.shape[1] diff --git a/src/skmatter/sample_selection/_voronoi_fps.py b/src/skmatter/sample_selection/_voronoi_fps.py index 6490bda19..ae7d27f7e 100644 --- a/src/skmatter/sample_selection/_voronoi_fps.py +++ b/src/skmatter/sample_selection/_voronoi_fps.py @@ -40,11 +40,9 @@ class VoronoiFPS(GreedySelector): Parameters ---------- - - n_trial_calculation: integer, default=4 + n_trial_calculation: int, default=4 Number of calculations used for the switching point between Voronoi FPS and traditional FPS (for detail look at full_fraction). - full_fraction: float, default=None Proportion of calculated distances from the total number of features at which the switch from Voronoi FPS to FPS occurs. @@ -69,7 +67,7 @@ class VoronoiFPS(GreedySelector): ... full_fraction=0.45, ... # int or 'random', default=0 ... # Index of the first selection. - ... # If ‘random’, picks a random value when fit starts. + ... # If 'random', picks a random value when fit starts. ... initialize=0, ... ) >>> X = np.array( @@ -111,43 +109,37 @@ def score(self, X=None, y=None): return self.hausdorff_ def get_distance(self): - """ - + r""" Traditional FPS employs a column-wise Euclidean distance for feature selection, which can be expressed using the covariance matrix :math:`\\mathbf{C} = \\mathbf{X} ^ T \\mathbf{X}` .. math:: - \\operatorname{d}_c(i, j) = C_{ii} - 2 C_{ij} + C_{jj}. + \operatorname{d}_c(i, j) = C_{ii} - 2 C_{ij} + C_{jj}. 
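This column-wise identity can be checked numerically in a couple of lines; a quick
sketch with arbitrary data:

>>> import numpy as np
>>> X = np.random.RandomState(0).normal(size=(6, 3))
>>> C = X.T @ X
>>> i, j = 0, 2
>>> d_cov = C[i, i] - 2 * C[i, j] + C[j, j]
>>> bool(np.isclose(d_cov, np.sum((X[:, i] - X[:, j]) ** 2)))
True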
For sample selection, this is a row-wise Euclidean distance, which can be
expressed in terms of the Gram matrix :math:`\\mathbf{K} =
\\mathbf{X} \\mathbf{X} ^ T`

.. math::
- \\operatorname{d}_r(i, j) = K_{ii} - 2 K_{ij} + K_{jj}.
+ \operatorname{d}_r(i, j) = K_{ii} - 2 K_{ij} + K_{jj}.

Returns
-------
-
hausdorff : ndarray of shape (`n_to_select_from_`)
the minimum distance from each point to the set of selected points. once a
point is selected, the distance is not updated; the final list will reflect
the distances when selected.
-
"""
return self.hausdorff_

def get_select_distance(self):
"""
-
Returns
-------
-
- hausdorff_at_select : ndarray of shape (`n_to_select`)
+ hausdorff_at_select : numpy.ndarray of shape (`n_to_select`)
at the time of selection, the minimum distance from each selected point to
the set of previously selected points.
-
"""
mask = self.get_support(indices=True, ordered=True)
return self.hausdorff_at_select_[mask]
@@ -163,7 +155,6 @@ def _init_greedy_search(self, X, y, n_to_select):
large number of distances, it is more advantageous to run simple
calculation along the whole matrix.
"""
-
n_to_select_from = X.shape[0]

self.vlocation_of_idx = np.full(n_to_select_from, 1)
# index of the voronoi cell associated with each of the columns of X
@@ -239,8 +230,8 @@ def _init_greedy_search(self, X, y, n_to_select):

def _continue_greedy_search(self, X, y, n_to_select):
"""Continues the search. Prepares an array to store the selected
- features."""
-
+ features.
+ """
super()._continue_greedy_search(X, y, n_to_select)

n_pad = n_to_select - self.n_selected_
@@ -271,7 +262,6 @@ def _get_active(self, X, last_selected):
|d(X,S) - d(S,L)|>= d(X,S) to know that we don't need to check X.
but |d(X,S) - d(S,L)|^2>= d(X,S)^2 if and only if d(S,L)/2 > d(S,X)
"""
-
if not hasattr(self, "n_selected_") or self.n_selected_ == 0:
return np.arange(X.shape[0], dtype=int)
@@ -290,19 +280,17 @@ def _get_active(self, X, last_selected):
return active_points

def _update_post_selection(self, X, y, last_selected):
- """
- Saves the most recently selected feature, increments the feature counter
+ """Saves the most recently selected feature, increments the feature counter
and update the hausdorff distances
+
Let:
- L is the last point selected;
- S are the selected points from before this iteration;
- X is the one active point;
- This function calculates d(L, X) and checks the condition
- d(L, X)< min d(X, S_i). If so, we move X to a new polyhedron.
- If the number of active points is too high, it is faster to calculate
- the distances between L and all the points in the dataset.
- """
+ L is the last point selected; S are the selected points from before this
+ iteration; X is the one active point. This function calculates d(L, X) and
+ checks the condition d(L, X) < min d(X, S_i). If so, we move X to a new
+ polyhedron. If the number of active points is too high, it is faster to
+ calculate the distances between L and all the points in the dataset.
+ """
self.hausdorff_at_select_[last_selected] = self.hausdorff_[last_selected]
active_points = self._get_active(X, last_selected)
diff --git a/src/skmatter/utils/_orthogonalizers.py b/src/skmatter/utils/_orthogonalizers.py
index 14dbf0a2c..023747ed7 100644
--- a/src/skmatter/utils/_orthogonalizers.py
+++ b/src/skmatter/utils/_orthogonalizers.py
@@ -1,12 +1,8 @@
# -*- coding: utf-8 -*-
-"""
-
-This module contains the necessary orthogonalizers for the CUR decomposition
-subselection method.
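The core operation behind these orthogonalizers is a single deflation step: every
column is stripped of its projection on the (normalized) selected column, so only
the part orthogonal to the selection remains. A minimal numpy sketch of the idea,
not the library routine with its tolerance and copy handling:

>>> import numpy as np
>>> X = np.random.RandomState(0).normal(size=(6, 4))
>>> c = 1  # column to orthogonalize by
>>> v = X[:, [c]] / np.linalg.norm(X[:, c])
>>> X_orth = X - v @ (v.T @ X)
>>> bool(np.allclose(X_orth.T @ X[:, c], 0.0))
True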
+"""Necessary orthogonalizers for the CUR decomposition subselection method. Authors: Rose K. Cersonsky Michele Ceriotti - """ import warnings @@ -15,25 +11,22 @@ def X_orthogonalizer(x1, c=None, x2=None, tol=1e-12, copy=False): - """ - Orthogonalizes a feature matrix by the given columns. Can be used to - orthogonalize by samples by calling `X = X_orthogonalizer(X.T, row_index).T`. - After orthogonalization, each column of X will contain only what is + """Orthogonalizes a feature matrix by the given columns. + + Can be used to orthogonalize by samples by calling `X = X_orthogonalizer(X.T, + row_index).T`. After orthogonalization, each column of X will contain only what is orthogonal to X[:, c] or x2. Parameters ---------- - x1: matrix of shape (n x m) + x1: numpy.ndarray of shape (n x m) feature matrix to orthogonalize - c: int, less than m, default=None index of the column to orthogonalize by - - x2: matrix of shape (n x a), default=x1[:, c] + x2: numpy.ndarray of shape (n x a), default=x1[:, c] a separate set of columns to orthogonalize with respect to Note: the orthogonalizer will work column-by-column in column-index order """ - if x2 is None and c is not None: cols = x1[:, [c]] elif x2.shape[0] == x1.shape[0]: @@ -64,8 +57,8 @@ def X_orthogonalizer(x1, c=None, x2=None, tol=1e-12, copy=False): def Y_feature_orthogonalizer(y, X, tol=1e-12, copy=True): - r""" - Orthogonalizes a property matrix given the selected features in :math:`\mathbf{X}` + r"""Orthogonalizes a property matrix given the selected features in + :math:`\mathbf{X}`. .. math:: \mathbf{Y} \leftarrow \mathbf{Y} - @@ -73,20 +66,15 @@ def Y_feature_orthogonalizer(y, X, tol=1e-12, copy=True): Parameters ---------- - - y: ndarray of shape (n_samples x n_properties) + y : numpy.ndarray of shape (n_samples x n_properties) property matrix - - X: ndarray of shape (n_samples x n_features) + X : numpy.ndarray of shape (n_samples x n_features) feature matrix - tol: float - cutoff for small eigenvalues to send to np.linalg.pinv - + cutoff for small eigenvalues to send to np.linalg.pinv copy: bool - whether to return a copy of y or edit in-place, default=True + whether to return a copy of y or edit in-place, default=True """ - v = np.linalg.pinv(np.matmul(X.T, X), rcond=tol) v = np.matmul(X, v) v = np.matmul(v, X.T) @@ -99,42 +87,31 @@ def Y_feature_orthogonalizer(y, X, tol=1e-12, copy=True): def Y_sample_orthogonalizer(y, X, y_ref, X_ref, tol=1e-12, copy=True): - """ - Orthogonalizes a matrix of targets :math:`{\\mathbf{Y}}` given a reference feature - matrix :math:`{\\mathbf{X}_r}` and reference target matrix :math:`{\\mathbf{Y}_r}`: + r"""Orthogonalizes a matrix of targets :math:`{\\mathbf{Y}}` given a reference + feature matrix :math:`{\\mathbf{X}_r}` and reference target matrix + :math:`{\\mathbf{Y}_r}`: .. 
-
     Parameters
     ----------
-
-    y: ndarray of shape (n_samples x n_properties)
+    y : numpy.ndarray of shape (n_samples x n_properties)
         property matrix
-
-    X: ndarray of shape (n_samples x n_features)
+    X : numpy.ndarray of shape (n_samples x n_features)
         feature matrix
-
-    y_ref: ndarray of shape (n_ref x n_properties)
-        reference property matrix
-
-    X_ref: ndarray of shape (n_ref x n_features)
-        reference feature matrix
-
+    y_ref : numpy.ndarray of shape (n_ref x n_properties)
+        reference property matrix
+    X_ref : numpy.ndarray of shape (n_ref x n_features)
+        reference feature matrix
     tol: float
-        cutoff for small eigenvalues to send to np.linalg.pinv
-
+        cutoff for small eigenvalues to send to np.linalg.pinv
     copy: bool
-        whether to return a copy of y or edit in-place, default=True
-
+        whether to return a copy of y or edit in-place, default=True
     """
-
     y_frag = (X @ (np.linalg.lstsq(X_ref, y_ref, rcond=tol)[0])).reshape(y.shape)

     if copy:
diff --git a/src/skmatter/utils/_pcovr_utils.py b/src/skmatter/utils/_pcovr_utils.py
index 69ae2e394..837ea394c 100644
--- a/src/skmatter/utils/_pcovr_utils.py
+++ b/src/skmatter/utils/_pcovr_utils.py
@@ -114,8 +114,7 @@ def pcovr_covariance(
     random_state=0,
     iterated_power="auto",
 ):
-    r"""
-    Creates the PCovR modified covariance
+    r"""Creates the PCovR modified covariance

     .. math::

@@ -153,7 +152,6 @@ def pcovr_covariance(
         random seed to use for randomized svd
     """
-
     C = np.zeros((X.shape[1], X.shape[1]), dtype=np.float64)

     if mixing < 1 or return_isqrt:
@@ -198,39 +196,30 @@ def pcovr_covariance(

 def pcovr_kernel(mixing, X, Y, **kernel_params):
-    r"""
-    Creates the PCovR modified kernel distances
+    r"""Creates the PCovR modified kernel distances

     .. math::
-
         \mathbf{\tilde{K}} = \alpha \mathbf{K} +
         (1 - \alpha) \mathbf{Y}\mathbf{Y}^T

     the default kernel is the linear kernel, such that:

     .. math::
-
         \mathbf{\tilde{K}} = \alpha \mathbf{X} \mathbf{X}^T +
         (1 - \alpha) \mathbf{Y}\mathbf{Y}^T

     Parameters
     ----------
-
     mixing : float
         mixing parameter, as described in PCovR as :math:`{\alpha}`
-
-    X : ndarray of shape (n x m)
+    X : numpy.ndarray of shape (n x m)
         Data matrix :math:`\mathbf{X}`
-
-    Y : ndarray of shape (n x p)
+    Y : numpy.ndarray of shape (n x p)
         Array to include in biased selection when mixing < 1
-
     kernel_params : dict, optional
         dictionary of arguments to pass to pairwise_kernels
         if none are specified, assumes that the kernel is linear
-
     """
-
     K = np.zeros((X.shape[0], X.shape[0]))
     if mixing < 1:
         K += (1 - mixing) * Y @ Y.T
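For the linear-kernel case, the modified kernel from the docstring above is a
one-liner (an illustrative sketch with made-up shapes; ``mixing`` plays the
role of :math:`\alpha`):

    import numpy as np

    rng = np.random.default_rng(0)
    X, Y = rng.normal(size=(6, 4)), rng.normal(size=(6, 1))
    mixing = 0.5
    K_tilde = mixing * X @ X.T + (1 - mixing) * Y @ Y.T
    assert K_tilde.shape == (6, 6)  # one row and column per sample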
diff --git a/src/skmatter/utils/_progress_bar.py b/src/skmatter/utils/_progress_bar.py
index 19b8ac291..01820698a 100644
--- a/src/skmatter/utils/_progress_bar.py
+++ b/src/skmatter/utils/_progress_bar.py
@@ -1,7 +1,7 @@
 def get_progress_bar():
-    """
-    This function returns the appropriate version of tqdm, as determined by
-    tqdm.auto. If tqdm is not installed, an ImportError is raised.
+    """Returns the appropriate version of ``tqdm``, as determined by ``tqdm.auto``.
+
+    If ``tqdm`` is not installed, an :py:class:`ImportError` is raised.
     """
     try:
         from tqdm.auto import tqdm
@@ -9,14 +9,11 @@ def get_progress_bar():
         return tqdm
     except ImportError:
         raise ImportError(
-            "tqdm must be installed to use a progress bar."
-            "Either install tqdm or re-run with"
-            "progress_bar = False"
+            "tqdm must be installed to use a progress bar. Either install tqdm or "
+            "re-run with progress_bar = False"
         )


 def no_progress_bar(x):
-    """
-    This is the identity function, same as lambda x:x. It returns x.
-    """
+    """Identity function, same as ``lambda x: x``. It returns ``x``."""
     return x
diff --git a/tests/test_check_estimators.py b/tests/test_check_estimators.py
index b21835dca..fc89ecdb4 100644
--- a/tests/test_check_estimators.py
+++ b/tests/test_check_estimators.py
@@ -23,4 +23,5 @@
     ]
 )
 def test_sklearn_compatible_estimator(estimator, check):
+    """Test that the estimators are compatible with sklearn."""
     check(estimator)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index e976c5d7c..5dd7f144a 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -77,9 +77,7 @@ def setUpClass(cls):
             cls.has_pandas = False

     def test_load_dataset_without_pandas(self):
-        """
-        Check if the correct exception occurs when pandas isn't present.
-        """
+        """Check if the correct exception occurs when pandas isn't present."""
         with unittest.mock.patch.dict("sys.modules", {"pandas": None}):
             with self.assertRaises(ImportError) as cm:
                 _ = load_who_dataset()
@@ -95,9 +93,7 @@ def test_dataset_size_and_shape(self):
         self.assertEqual(self.who["data"].shape, self.shape)

     def test_datapoint_value(self):
-        """
-        Check if the value of a datapoint at a certain location is correct.
-        """
+        """Check if the value of a datapoint at a certain location is correct."""
         if self.has_pandas is True:
             self.assertTrue(
                 np.allclose(
@@ -120,9 +116,7 @@ def setUpClass(cls):
             cls.has_ase = False

     def test_load_dataset_without_ase(self):
-        """
-        Check if the correct exception occurs when ase isn't present.
-        """
+        """Check if the correct exception occurs when ase isn't present."""
         with unittest.mock.patch.dict("sys.modules", {"ase.io": None}):
             with self.assertRaises(ImportError) as cm:
                 _ = load_roy_dataset()
@@ -131,8 +125,8 @@ def test_load_dataset_without_ase(self):
         )

     def test_dataset_content(self):
-        """
-        Check if the correct number of datapoints are present in the dataset.
+        """Check if the correct number of datapoints are present in the dataset.
+
         Also check if the size of the dataset is correct.
         """
         if self.has_ase is True:
@@ -141,8 +135,8 @@ def test_dataset_content(self):
             self.assertEqual(len(self.roy["energies"]), self.size)

     def test_dataset_consistency(self):
-        """
-        Check if the energies in the structures are the same as in the explicit array.
+        """Check if the energies in the structures are the same as in the explicit
+        array.
         """
         if self.has_ase is True:
             self.assertTrue(
diff --git a/tests/test_dch.py b/tests/test_dch.py
index 98dd11ef8..afad6444e 100644
--- a/tests/test_dch.py
+++ b/tests/test_dch.py
@@ -21,10 +21,8 @@ def setUp(self):
         )

     def test_selected_idx_and_scores(self):
-        """
-        This test is a regression test that checks that DCH selects correct vertices and
-        gets correct distances from the `score_feature_matrix` and `score_samples`
-        functions.
+        """Regression test that checks that DCH selects correct vertices and gets
+        correct distances from the `score_feature_matrix` and `score_samples` functions.
         """
         selector = DirectionalConvexHull()
         selector.fit(self.T, self.y)
@@ -169,11 +167,9 @@ def test_positive_score(self):
         self.assertTrue(np.all(distances >= -selector.tolerance))

     def test_score_function_warnings(self):
-        """
-        Ensure that calling `score_samples` with points outside the range causes an
+        """Ensure that calling `score_samples` with points outside the range causes an
         error.
""" - selector = DirectionalConvexHull(low_dim_idx=[0]) # high-dimensional dummy data, not important for the test X_high_dimensional = [1.0, 2.0, 3.0] diff --git a/tests/test_feature_pcov_cur.py b/tests/test_feature_pcov_cur.py index d1f22b358..3f025d670 100644 --- a/tests/test_feature_pcov_cur.py +++ b/tests/test_feature_pcov_cur.py @@ -12,20 +12,14 @@ def setUp(self): self.idx = [2, 8, 3, 4, 1, 7, 5, 9, 6] def test_known(self): - """ - This test checks that the model returns a known set of indices - """ - + """Check that the model returns a known set of indices.""" selector = PCovCUR(n_to_select=9) selector.fit(self.X, self.y) self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) def test_restart(self): - """ - This test checks that the model can be restarted with a new instance - """ - + """Check that the model can be restarted with a new instance.""" selector = PCovCUR(n_to_select=1) selector.fit(self.X, self.y) @@ -35,9 +29,7 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i], self.idx[i]) def test_non_it(self): - """ - This test checks that the model can be run non-iteratively - """ + """Check that the model can be run non-iteratively.""" self.idx = [2, 8, 3, 6, 7, 9, 1, 0, 5] selector = PCovCUR(n_to_select=9, recompute_every=0) selector.fit(self.X, self.y) diff --git a/tests/test_feature_pcov_fps.py b/tests/test_feature_pcov_fps.py index ae53f1796..321cc78ee 100644 --- a/tests/test_feature_pcov_fps.py +++ b/tests/test_feature_pcov_fps.py @@ -11,11 +11,9 @@ def setUp(self): self.idx = [0, 2, 6, 7, 1, 3, 4] def test_restart(self): + """Check that the model can be restarted with a new number of features and + `warm_start`. """ - This test checks that the model can be restarted with a new number of - features and `warm_start` - """ - selector = PCovFPS(n_to_select=1, initialize=self.idx[0]) selector.fit(self.X, y=self.y) @@ -25,10 +23,7 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) def test_no_mixing_1(self): - """ - This test checks that the model throws an error when mixing = 1.0 - """ - + """Check that the model throws an error when mixing = 1.0.""" with self.assertRaises(ValueError) as cm: _ = PCovFPS(n_to_select=1, mixing=1.0) self.assertEqual( diff --git a/tests/test_feature_simple_cur.py b/tests/test_feature_simple_cur.py index 72554471d..147a16fed 100644 --- a/tests/test_feature_simple_cur.py +++ b/tests/test_feature_simple_cur.py @@ -18,10 +18,7 @@ def test_bad_transform(self): _ = selector.transform(self.X) def test_restart(self): - """ - This test checks that the model can be restarted with a new instance - """ - + """Check that the model can be restarted with a new instance.""" ref_selector = CUR(n_to_select=self.X.shape[-1] - 3).fit(X=self.X) ref_idx = ref_selector.selected_idx_ @@ -34,9 +31,7 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i], ref_idx[i]) def test_non_it(self): - """ - This test checks that the model can be run non-iteratively - """ + """Check that the model can be run non-iteratively.""" C = self.X.T @ self.X _, UC = np.linalg.eigh(C) ref_idx = np.argsort(-(UC[:, -1] ** 2.0))[:-1] diff --git a/tests/test_feature_simple_fps.py b/tests/test_feature_simple_fps.py index b29a2bc7b..68e3ffdbc 100644 --- a/tests/test_feature_simple_fps.py +++ b/tests/test_feature_simple_fps.py @@ -13,7 +13,7 @@ def setUp(self): def test_restart(self): """ - This test checks that the model can be restarted with a new number of + Check that the model can be restarted with a new number of 
features and `warm_start` """ selector = FPS(n_to_select=1, initialize=self.idx[0]) @@ -25,11 +25,9 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) def test_initialize(self): + """Check that the model can be initialized in all applicable manners and throws + an error otherwise. """ - This test checks that the model can be initialized in all applicable manners - and throws an error otherwise - """ - for initialize in [self.idx[0], "random"]: with self.subTest(initialize=initialize): selector = FPS(n_to_select=1, initialize=initialize) @@ -48,9 +46,7 @@ def test_initialize(self): self.assertEqual(str(cm.exception), "Invalid value of the initialize parameter") def test_get_distances(self): - """ - This test checks that the hausdorff distances are returnable after fitting - """ + """Check that the hausdorff distances are returnable after fitting.""" selector = FPS(n_to_select=7) selector.fit(self.X) d = selector.get_select_distance() diff --git a/tests/test_kernel_normalizer.py b/tests/test_kernel_normalizer.py index d17ddf9f3..a2297902b 100644 --- a/tests/test_kernel_normalizer.py +++ b/tests/test_kernel_normalizer.py @@ -13,7 +13,8 @@ def __init__(self, *args, **kwargs): def test_sample_weights(self): """Checks that sample weights of one are equal to the unweighted case and - that nonuniform weights are different from the unweighted case""" + that nonuniform weights are different from the unweighted case. + """ K = self.random_state.uniform(0, 100, size=(3, 3)) equal_wts = np.ones(len(K)) nonequal_wts = self.random_state.uniform(0, 100, size=(len(K),)) @@ -31,7 +32,8 @@ def test_sample_weights(self): def test_invalid_sample_weights(self): """Checks that weights must be 1D array with the same length as the number of - samples""" + samples. + """ K = self.random_state.uniform(0, 100, size=(3, 3)) wts_len = np.ones(len(K) + 1) wts_dim = np.ones((len(K), 2)) @@ -49,8 +51,9 @@ def test_ValueError(self): model.fit(K) def test_reference_ValueError(self): - """Checks that it is impossible to normalize - a matrix with a non-coincident size with the reference.""" + """Checks that it is impossible to normalize a matrix with a non-coincident + size with the reference. + """ K = self.random_state.uniform(0, 100, size=(3, 3)) K_2 = self.random_state.uniform(0, 100, size=(2, 2)) model = KernelNormalizer() @@ -59,9 +62,9 @@ def test_reference_ValueError(self): model.transform(K_2) def test_NotFittedError_transform(self): - """Checks that an error is returned when - trying to use the transform function - before the fit function""" + """Checks that an error is returned when trying to use the transform function + before the fit function. + """ K = self.random_state.uniform(0, 100, size=(3, 3)) model = KernelNormalizer() with self.assertRaises(sklearn.exceptions.NotFittedError): @@ -69,8 +72,8 @@ def test_NotFittedError_transform(self): def test_fit_transform(self): """Checks that the kernel is correctly normalized. - Compare with the value calculated - directly from the equation. + + Compare with the value calculated directly from the equation. 
""" K = self.random_state.uniform(0, 100, size=(3, 3)) model = KernelNormalizer() diff --git a/tests/test_kernel_pcovr.py b/tests/test_kernel_pcovr.py index 8cb7b0297..e4bbda52e 100644 --- a/tests/test_kernel_pcovr.py +++ b/tests/test_kernel_pcovr.py @@ -51,7 +51,7 @@ def setUp(self): class KernelPCovRErrorTest(KernelPCovRBaseTest): def test_lr_with_x_errors(self): """ - This test checks that KernelPCovR returns a non-null property prediction + Check that KernelPCovR returns a non-null property prediction and that the prediction error increases with `mixing` """ prev_error = -1.0 @@ -73,11 +73,9 @@ def test_lr_with_x_errors(self): prev_error = error def test_reconstruction_errors(self): + """Check that KernelPCovR returns a non-null reconstructed X and that the + reconstruction error decreases with `mixing`. """ - This test checks that KernelPCovR returns a non-null reconstructed X - and that the reconstruction error decreases with `mixing` - """ - prev_error = 10.0 prev_x_error = 10.0 @@ -139,7 +137,7 @@ def test_kpcovr_error(self): class KernelPCovRInfrastructureTest(KernelPCovRBaseTest): def test_nonfitted_failure(self): """ - This test checks that KernelPCovR will raise a `NonFittedError` if + Check that KernelPCovR will raise a `NonFittedError` if `transform` is called before the model is fitted """ kpcovr = KernelPCovR(mixing=0.5, n_components=2, tol=1e-12) @@ -148,7 +146,7 @@ def test_nonfitted_failure(self): def test_no_arg_predict(self): """ - This test checks that KernelPCovR will raise a `ValueError` if + Check that KernelPCovR will raise a `ValueError` if `predict` is called without arguments """ kpcovr = KernelPCovR(mixing=0.5, n_components=2, tol=1e-12) @@ -158,7 +156,7 @@ def test_no_arg_predict(self): def test_T_shape(self): """ - This test checks that KernelPCovR returns a latent space projection + Check that KernelPCovR returns a latent space projection consistent with the shape of the input matrix """ n_components = 5 @@ -169,9 +167,7 @@ def test_T_shape(self): self.assertTrue(T.shape[-1] == n_components) def test_no_centerer(self): - """ - tests that when center=False, no centerer exists - """ + """Tests that when center=False, no centerer exists.""" kpcovr = self.model(center=False) kpcovr.fit(self.X, self.Y) @@ -179,10 +175,7 @@ def test_no_centerer(self): kpcovr.centerer_ def test_centerer(self): - """ - tests that all functionalities that rely on the centerer work properly - """ - + """Tests that all functionalities that rely on the centerer work properly.""" kpcovr = self.model(center=True) kpcovr.fit(self.X, self.Y) @@ -305,9 +298,8 @@ def test_precomputed_regression(self): class KernelTests(KernelPCovRBaseTest): def test_kernel_types(self): - """ - This test checks that KernelPCovR can handle all kernels passable to - sklearn kernel classes, including callable kernels + """Check that KernelPCovR can handle all kernels passable to sklearn + kernel classes, including callable kernels """ def _linear_kernel(X, Y): @@ -332,11 +324,9 @@ def _linear_kernel(X, Y): kpcovr.fit(self.X, self.Y) def test_linear_matches_pcovr(self): + """Check that KernelPCovR returns the same results as PCovR when using a linear + kernel. 
""" - This test checks that KernelPCovR returns the same results as PCovR when - using a linear kernel - """ - ridge = RidgeCV(fit_intercept=False, alphas=np.logspace(-8, 2)) ridge.fit(self.X, self.Y) @@ -394,7 +384,7 @@ def test_linear_matches_pcovr(self): class KernelPCovRTestSVDSolvers(KernelPCovRBaseTest): def test_svd_solvers(self): """ - This test checks that PCovR works with all svd_solver modes and assigns + Check that PCovR works with all svd_solver modes and assigns the right n_components """ for solver in ["arpack", "full", "randomized", "auto"]: @@ -433,7 +423,7 @@ def test_svd_solvers(self): def test_bad_solver(self): """ - This test checks that PCovR will not work with a solver that isn't in + Check that PCovR will not work with a solver that isn't in ['arpack', 'full', 'randomized', 'auto'] """ with self.assertRaises(ValueError) as cm: @@ -443,11 +433,7 @@ def test_bad_solver(self): self.assertTrue(str(cm.exception), "Unrecognized svd_solver='bad'" "") def test_good_n_components(self): - """ - This test checks that PCovR will work with any allowed values of - n_components. - """ - + """Check that PCovR will work with any allowed values of n_components.""" # this one should pass kpcovr = self.model(n_components=0.5, svd_solver="full") kpcovr.fit(self.X, self.Y) @@ -462,11 +448,7 @@ def test_good_n_components(self): kpcovr.fit(self.X, self.Y) def test_bad_n_components(self): - """ - This test checks that PCovR will not work with any prohibited values of - n_components. - """ - + """Check that PCovR will not work with any prohibited values of n_components.""" with self.subTest(type="negative_ncomponents"): with self.assertRaises(ValueError) as cm: kpcovr = self.model(n_components=-1, svd_solver="auto") diff --git a/tests/test_orthogonalizers.py b/tests/test_orthogonalizers.py index 0578141c8..016fd6988 100644 --- a/tests/test_orthogonalizers.py +++ b/tests/test_orthogonalizers.py @@ -136,7 +136,7 @@ def test_multicolumn(self): str(cm.exception), "You can only orthogonalize a matrix using a vector with the same number " f"of rows. Matrix X has {self.n_samples} rows, whereas the " - f"orthogonalizing matrix has {self.n_samples+4} rows.", + f"orthogonalizing matrix has {self.n_samples + 4} rows.", ) def test_warning(self): diff --git a/tests/test_pcovr.py b/tests/test_pcovr.py index 84a261c86..e589978d2 100644 --- a/tests/test_pcovr.py +++ b/tests/test_pcovr.py @@ -32,10 +32,7 @@ def setUp(self): class PCovRErrorTest(PCovRBaseTest): def test_against_pca(self): - """ - Tests that mixing = 1.0 corresponds to PCA - """ - + """Tests that mixing = 1.0 corresponds to PCA.""" pcovr = PCovR( mixing=1.0, n_components=3, space="sample", svd_solver="full" ).fit(self.X, self.Y) @@ -54,11 +51,9 @@ def test_against_pca(self): ) def test_simple_reconstruction(self): + """Check that PCovR with a full eigendecomposition at mixing=1 can fully + reconstruct the input matrix. """ - This test checks that PCovR with a full eigendecomposition at mixing=1 - can fully reconstruct the input matrix. - """ - for space in ["feature", "sample", "auto"]: with self.subTest(space=space): pcovr = self.model( @@ -73,7 +68,7 @@ def test_simple_reconstruction(self): def test_simple_prediction(self): """ - This test checks that PCovR with a full eigendecomposition at mixing=0 + Check that PCovR with a full eigendecomposition at mixing=0 can fully reconstruct the input properties. 
""" for space in ["feature", "sample", "auto"]: @@ -92,7 +87,7 @@ def test_simple_prediction(self): def test_lr_with_x_errors(self): """ - This test checks that PCovR returns a non-null property prediction + Check that PCovR returns a non-null property prediction and that the prediction error increases with `mixing` """ prev_error = -1.0 @@ -112,12 +107,9 @@ def test_lr_with_x_errors(self): prev_error = error def test_lr_with_t_errors(self): + """Check that PCovR returns a non-null property prediction from the latent space + projection and that the prediction error increases with `mixing`. """ - This test checks that PCovR returns a non-null property prediction - from the latent space projection and that the prediction error - increases with `mixing` - """ - prev_error = -1.0 for mixing in np.linspace(0, 1, 11): @@ -136,11 +128,9 @@ def test_lr_with_t_errors(self): prev_error = error def test_reconstruction_errors(self): + """Check that PCovR returns a non-null reconstructed X and that the + reconstruction error decreases with `mixing`. """ - This test checks that PCovR returns a non-null reconstructed X - and that the reconstruction error decreases with `mixing` - """ - prev_error = 1.0 for mixing in np.linspace(0, 1, 11): @@ -161,7 +151,7 @@ def test_reconstruction_errors(self): class PCovRSpaceTest(PCovRBaseTest): def test_select_feature_space(self): """ - This test checks that PCovR implements the feature space version + Check that PCovR implements the feature space version when :math:`n_{features} < n_{samples}``. """ pcovr = self.model(n_components=2, tol=1e-12) @@ -171,7 +161,7 @@ def test_select_feature_space(self): def test_select_sample_space(self): """ - This test checks that PCovR implements the sample space version + Check that PCovR implements the sample space version when :math:`n_{features} > n_{samples}``. """ pcovr = self.model(n_components=2, tol=1e-12) @@ -183,7 +173,7 @@ def test_select_sample_space(self): def test_bad_space(self): """ - This test checks that PCovR raises a ValueError when a non-valid + Check that PCovR raises a ValueError when a non-valid space is designated. """ with self.assertRaises(ValueError): @@ -192,7 +182,7 @@ def test_bad_space(self): def test_override_spaceselection(self): """ - This test checks that PCovR implements the space provided in the + Check that PCovR implements the space provided in the constructor, overriding that chosen by the input dimensions. """ pcovr = self.model(n_components=2, tol=1e-12, space="sample") @@ -202,7 +192,7 @@ def test_override_spaceselection(self): def test_spaces_equivalent(self): """ - This test checks that the results from PCovR, regardless of the space, + Check that the results from PCovR, regardless of the space, are equivalent. 
""" for alpha in np.linspace(0.01, 0.99, 11): @@ -248,7 +238,7 @@ def test_spaces_equivalent(self): class PCovRTestSVDSolvers(PCovRBaseTest): def test_svd_solvers(self): """ - This test checks that PCovR works with all svd_solver modes and assigns + Check that PCovR works with all svd_solver modes and assigns the right n_components """ for solver in ["arpack", "full", "randomized", "auto"]: @@ -263,7 +253,7 @@ def test_svd_solvers(self): def test_bad_solver(self): """ - This test checks that PCovR will not work with a solver that isn't in + Check that PCovR will not work with a solver that isn't in ['arpack', 'full', 'randomized', 'auto'] """ for space in ["feature", "sample"]: @@ -274,11 +264,7 @@ def test_bad_solver(self): self.assertEqual(str(cm.exception), "Unrecognized svd_solver='bad'" "") def test_good_n_components(self): - """ - This test checks that PCovR will work with any allowed values of - n_components. - """ - + """Check that PCovR will work with any allowed values of n_components.""" # this one should pass pcovr = self.model(n_components=0.5, svd_solver="full") pcovr.fit(self.X, self.Y) @@ -293,11 +279,7 @@ def test_good_n_components(self): pcovr.fit(self.X, self.Y) def test_bad_n_components(self): - """ - This test checks that PCovR will not work with any prohibited values of - n_components. - """ - + """Check that PCovR will not work with any prohibited values of n_components.""" with self.assertRaises(ValueError) as cm: pcovr = self.model(n_components="mle", svd_solver="full") pcovr.fit(self.X[:2], self.Y[:2]) @@ -370,7 +352,7 @@ def test_bad_n_components(self): class PCovRInfrastructureTest(PCovRBaseTest): def test_nonfitted_failure(self): """ - This test checks that PCovR will raise a `NonFittedError` if + Check that PCovR will raise a `NonFittedError` if `transform` is called before the pcovr is fitted """ pcovr = self.model(n_components=2, tol=1e-12) @@ -379,7 +361,7 @@ def test_nonfitted_failure(self): def test_no_arg_predict(self): """ - This test checks that PCovR will raise a `ValueError` if + Check that PCovR will raise a `ValueError` if `predict` is called without arguments """ pcovr = self.model(n_components=2, tol=1e-12) @@ -389,23 +371,22 @@ def test_no_arg_predict(self): def test_centering(self): """ - This test checks that PCovR raises a warning if + Check that PCovR raises a warning if given uncentered data. """ pcovr = self.model(n_components=2, tol=1e-12) X = self.X.copy() + np.random.uniform(-1, 1, self.X.shape[1]) with warnings.catch_warnings(record=True) as w: pcovr.fit(X, self.Y) - self.assertEquals( + self.assertEqual( str(w[0].message), - "This class does not automatically center data, and your data mean is" - " greater than the supplied tolerance.", + "This class does not automatically center data, and your data mean is " + "greater than the supplied tolerance.", ) def test_T_shape(self): - """ - This test checks that PCovR returns a latent space projection - consistent with the shape of the input matrix + """Check that PCovR returns a latent space projection consistent with the shape + of the input matrix. 
""" n_components = 5 pcovr = self.model(n_components=n_components, tol=1e-12) @@ -490,8 +471,7 @@ def test_incompatible_regressor(self): pcovr.fit(self.X, self.Y) self.assertEqual( str(cm.exception), - "Regressor must be an instance of " - "`LinearRegression`, `Ridge`, `RidgeCV`, " + "Regressor must be an instance of `LinearRegression`, `Ridge`, `RidgeCV`, " "or `precomputed`", ) @@ -525,10 +505,8 @@ def test_incompatible_coef_shape(self): pcovr.fit(self.X, np.column_stack((self.Y, self.Y))) self.assertEqual( str(cm.exception), - "The regressor coefficients have a shape incompatible " - "with the supplied target space. " - "The coefficients have shape %r and the targets " - "have shape %r" + "The regressor coefficients have a shape incompatible with the supplied " + "target space. The coefficients have shape %r and the targets have shape %r" % (regressor.coef_.shape, np.column_stack((self.Y, self.Y)).shape), ) diff --git a/tests/test_progress_bar.py b/tests/test_progress_bar.py index e304cd3d0..e88352ec9 100644 --- a/tests/test_progress_bar.py +++ b/tests/test_progress_bar.py @@ -5,10 +5,7 @@ class PBarTest(unittest.TestCase): def test_no_tqdm(self): - """ - This test checks that the model cannot use a progress bar when tqdm - is not installed - """ + """Check that the model cannot use a progress bar when tqdm is not installed.""" import sys sys.modules["tqdm"] = None @@ -17,9 +14,8 @@ def test_no_tqdm(self): _ = get_progress_bar() self.assertEqual( str(cm.exception), - "tqdm must be installed to use a progress bar." - "Either install tqdm or re-run with" - "progress_bar = False", + "tqdm must be installed to use a progress bar. Either install tqdm or " + "re-run with progress_bar = False", ) diff --git a/tests/test_sample_pcov_cur.py b/tests/test_sample_pcov_cur.py index c14c50c48..cb05326aa 100644 --- a/tests/test_sample_pcov_cur.py +++ b/tests/test_sample_pcov_cur.py @@ -16,20 +16,14 @@ def setUp(self): self.idx = [256, 304, 41, 408, 311, 364, 152, 78, 359, 102] def test_known(self): - """ - This test checks that the model returns a known set of indices - """ - + """Check that the model returns a known set of indices.""" selector = PCovCUR(n_to_select=10, mixing=0.5) selector.fit(self.X, self.y) self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) def test_restart(self): - """ - This test checks that the model can be restarted with a new instance - """ - + """Check that the model can be restarted with a new instance.""" selector = PCovCUR(n_to_select=1, mixing=0.5) selector.fit(self.X, self.y) @@ -49,9 +43,7 @@ def test_restart(self): ) def test_non_it(self): - """ - This test checks that the model can be run non-iteratively - """ + """Check that the model can be run non-iteratively.""" self.idx = [256, 32, 138, 290, 362, 141, 359, 428, 254, 9] selector = PCovCUR(n_to_select=10, recompute_every=0) selector.fit(self.X, self.y) @@ -59,10 +51,7 @@ def test_non_it(self): self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) def test_multiple_k(self): - """ - This test checks that the model can be run with multiple k's - """ - + """Check that the model can be run with multiple k's.""" for k in list(set(np.logspace(0, np.log10(min(self.X.shape)), 4, dtype=int))): selector = PCovCUR(n_to_select=10, k=k) selector.fit(self.X, self.y) diff --git a/tests/test_sample_pcov_fps.py b/tests/test_sample_pcov_fps.py index 8a083a776..7679abb0a 100644 --- a/tests/test_sample_pcov_fps.py +++ b/tests/test_sample_pcov_fps.py @@ -11,11 +11,9 @@ def setUp(self): self.idx = [0, 256, 156, 324, 
349, 77, 113, 441, 426, 51] def test_restart(self): + """Check that the model can be restarted with a new number of samples and + `warm_start`. """ - This test checks that the model can be restarted with a new number of - samples and `warm_start` - """ - selector = PCovFPS(n_to_select=1, initialize=self.idx[0]) selector.fit(self.X, y=self.y) @@ -25,13 +23,10 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) def test_no_mixing_1(self): - """ - This test checks that the model throws an error when mixing = 1.0 - """ - + """Check that the model throws an error when mixing = 1.0.""" with self.assertRaises(ValueError) as cm: _ = PCovFPS(n_to_select=1, mixing=1.0) - self.assertEquals( + self.assertEqual( str(cm.exception), "Mixing = 1.0 corresponds to traditional FPS." "Please use the FPS class.", ) diff --git a/tests/test_sample_simple_cur.py b/tests/test_sample_simple_cur.py index b3a9437e1..0969074a3 100644 --- a/tests/test_sample_simple_cur.py +++ b/tests/test_sample_simple_cur.py @@ -14,7 +14,7 @@ def setUp(self): def test_sample_transform(self): """ - This test checks that an error is raised when the transform function is used, + Check that an error is raised when the transform function is used, because sklearn does not support well transformers that change the number of samples with other classes like Pipeline """ @@ -29,10 +29,7 @@ def test_sample_transform(self): ) def test_restart(self): - """ - This test checks that the model can be restarted with a new instance - """ - + """Check that the model can be restarted with a new instance""" ref_selector = CUR(n_to_select=self.n_select) ref_idx = ref_selector.fit(self.X).selected_idx_ @@ -45,10 +42,7 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i], ref_idx[i]) def test_non_it(self): - """ - This test checks that the model can be run non-iteratively - """ - + """Check that the model can be run non-iteratively.""" K = self.X @ self.X.T _, UK = np.linalg.eigh(K) ref_idx = np.argsort(-(UK[:, -1] ** 2.0))[: self.n_select] diff --git a/tests/test_sample_simple_fps.py b/tests/test_sample_simple_fps.py index ca7ee4bee..0e2960974 100644 --- a/tests/test_sample_simple_fps.py +++ b/tests/test_sample_simple_fps.py @@ -12,11 +12,9 @@ def setUp(self): self.idx = [0, 123, 441, 187, 117, 276, 261, 281, 251, 193] def test_restart(self): + """Checks that the model can be restarted with a new number of samples and + `warm_start`. """ - This test checks that the model can be restarted with a new number of - samples and `warm_start` - """ - selector = FPS(n_to_select=1, initialize=self.idx[0]) selector.fit(self.X) @@ -26,11 +24,9 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) def test_initialize(self): + """Checks that the model can be initialized in all applicable manners and throws + an error otherwise. 
""" - This test checks that the model can be initialized in all applicable manners - and throws an error otherwise - """ - for initialize in [self.idx[0], "random"]: with self.subTest(initialize=initialize): selector = FPS(n_to_select=1, initialize=initialize) @@ -46,14 +42,10 @@ def test_initialize(self): with self.assertRaises(ValueError) as cm: selector = FPS(n_to_select=1, initialize="bad") selector.fit(self.X) - self.assertEquals( - str(cm.exception), "Invalid value of the initialize parameter" - ) + self.assertEqual(str(cm.exception), "Invalid value of the initialize parameter") def test_get_distances(self): - """ - This test checks that the hausdorff distances are returnable after fitting - """ + """Checks that the hausdorff distances are returnable after fitting.""" selector = FPS(n_to_select=1) selector.fit(self.X) _ = selector.get_select_distance() diff --git a/tests/test_sparse_kernel_centerer.py b/tests/test_sparse_kernel_centerer.py index 619e8e387..df9d14213 100644 --- a/tests/test_sparse_kernel_centerer.py +++ b/tests/test_sparse_kernel_centerer.py @@ -13,7 +13,8 @@ def __init__(self, *args, **kwargs): def test_sample_weights(self): """Checks that sample weights of one are equal to the unweighted case and that - the nonuniform weights are different from the unweighted case""" + the nonuniform weights are different from the unweighted case. + """ X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) @@ -40,7 +41,8 @@ def test_sample_weights(self): def test_invalid_sample_weights(self): """Checks that weights must be 1D array with the same length as the number of - samples""" + samples. + """ X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) @@ -56,8 +58,7 @@ def test_invalid_sample_weights(self): model.fit_transform(Knm, Kmm, sample_weight=wts_dim) def test_Square_Kmm(self): - """Checks that the passed active kernel is square""" - + """Checks that the passed active kernel is square.""" X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) @@ -71,8 +72,8 @@ def test_Square_Kmm(self): def test_LatterDim(self): """Checks that a matrix must have the same latter dimension as its active - counterpart cannot be normalized.""" - + counterpart cannot be normalized. + """ X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) @@ -84,12 +85,13 @@ def test_LatterDim(self): model.fit(Knm, Kmm) self.assertEqual( str(cm.exception), - "The reference kernel is not " "commensurate shape with the active kernel.", + "The reference kernel is not commensurate shape with the active kernel.", ) def test_new_kernel(self): - """Checks that it is impossible to normalize - a matrix with a non-coincident size with the reference.""" + """Checks that it is impossible to normalize a matrix with a non-coincident size + with the reference. 
+ """ X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) @@ -101,15 +103,15 @@ def test_new_kernel(self): model = model.fit(Knm, Kmm) with self.assertRaises(ValueError) as cm: model.transform(Knm2) - self.assertEquals( + self.assertEqual( str(cm.exception), "The reference kernel and received kernel have different shape", ) def test_NotFittedError_transform(self): - """Checks that an error is returned when - trying to use the transform function - before the fit function""" + """Checks that an error is returned when trying to use the transform function + before the fit function + """ K = self.random_state.uniform(0, 100, size=(3, 3)) model = SparseKernelCenterer() with self.assertRaises(sklearn.exceptions.NotFittedError): @@ -117,10 +119,9 @@ def test_NotFittedError_transform(self): def test_fit_transform(self): """Checks that the kernel is correctly normalized. - Compare with the value calculated - directly from the equation. - """ + Compare with the value calculated directly from the equation. + """ X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) diff --git a/tests/test_standard_flexible_scaler.py b/tests/test_standard_flexible_scaler.py index e1d6cc1f6..7d5de796c 100644 --- a/tests/test_standard_flexible_scaler.py +++ b/tests/test_standard_flexible_scaler.py @@ -15,7 +15,8 @@ def __init__(self, *args, **kwargs): def test_sample_weights(self): """Checks that sample weights of one are equal to the unweighted case. - Also, that the nonuniform weights are different from the unweighted case""" + Also, that the nonuniform weights are different from the unweighted case + """ X = self.random_state.uniform(0, 100, size=(3, 3)) equal_wts = np.ones(len(X)) nonequal_wts = self.random_state.uniform(0, 100, size=(len(X),)) @@ -33,7 +34,8 @@ def test_sample_weights(self): def test_invalid_sample_weights(self): """Checks that weights must be 1D array with the same length as the number of - samples""" + samples + """ X = self.random_state.uniform(0, 100, size=(3, 3)) wts_len = np.ones(len(X) + 1) wts_dim = np.ones((len(X), 2)) @@ -106,17 +108,18 @@ def test_inverse_transform(self): self.assertTrue((np.isclose(Y, Y_inv, atol=1e-12)).all()) def test_NotFittedError_transform(self): - """Checks that an error is returned when - trying to use the transform function - before the fit function""" + """Checks that an error is returned when trying to use the transform function + before the fit function. + """ X = self.random_state.uniform(0, 100, size=(3, 3)) model = StandardFlexibleScaler(column_wise=True) with self.assertRaises(sklearn.exceptions.NotFittedError): model.transform(X) def test_shape_inconsistent_transform(self): - """Checks that an error is returned when attempting - to use the transform function with mismatched matrix sizes.""" + """Checks that an error is returned when attempting to use the transform + function with mismatched matrix sizes. + """ X = self.random_state.uniform(0, 100, size=(3, 3)) X_test = self.random_state.uniform(0, 100, size=(4, 4)) model = StandardFlexibleScaler(column_wise=True) @@ -125,8 +128,9 @@ def test_shape_inconsistent_transform(self): model.transform(X_test) def test_shape_inconsistent_inverse(self): - """Checks that an error is returned when attempting - to use the inverse transform function with mismatched matrix sizes.""" + """Checks that an error is returned when attempting to use the inverse transform + function with mismatched matrix sizes. 
+ """ X = self.random_state.uniform(0, 100, size=(3, 3)) X_test = self.random_state.uniform(0, 100, size=(4, 4)) model = StandardFlexibleScaler(column_wise=True) @@ -135,17 +139,18 @@ def test_shape_inconsistent_inverse(self): model.inverse_transform(X_test) def test_NotFittedError_inverse(self): - """Checks that an error is returned when - trying to use the inverse transform function - before the fit function""" + """Checks that an error is returned when trying to use the inverse transform + function before the fit function. + """ X = self.random_state.uniform(0, 100, size=(3, 3)) model = StandardFlexibleScaler() with self.assertRaises(sklearn.exceptions.NotFittedError): model.inverse_transform(X) def test_ValueError_column_wise(self): - """Checks that the matrix cannot be normalized - across columns if there is a zero variation column.""" + """Checks that the matrix cannot be normalized across columns if there is a zero + variation column. + """ X = self.random_state.uniform(0, 100, size=(3, 3)) X[0][0] = X[1][0] = X[2][0] = 2 model = StandardFlexibleScaler(column_wise=True) @@ -154,7 +159,8 @@ def test_ValueError_column_wise(self): def test_atol(self): """Checks that we can define absolute tolerance and it control the - minimal variance of columns ot the whole matrix""" + minimal variance of columns ot the whole matrix. + """ X = self.random_state.uniform(0, 100, size=(3, 3)) atol = ((X[:, 0] - X[:, 0].mean(axis=0)) ** 2).mean(axis=0) + 1e-8 model = StandardFlexibleScaler(column_wise=True, atol=atol, rtol=0) @@ -167,7 +173,8 @@ def test_atol(self): def test_rtol(self): """Checks that we can define relative tolerance and it control the - minimal variance of columns or the whole matrix""" + minimal variance of columns or the whole matrix. + """ X = self.random_state.uniform(0, 100, size=(3, 3)) mean = X[:, 0].mean(axis=0) rtol = ((X[:, 0] - mean) ** 2).mean(axis=0) / mean + 1e-8 @@ -181,16 +188,16 @@ def test_rtol(self): model.fit(X) def test_ValueError_full(self): - """Checks that the matrix cannot be normalized - if there is a zero variation matrix.""" + """Checks that the matrix cannot be normalized if there is a zero variation + matrix. 
+ """ X = np.array([2, 2, 2]).reshape(-1, 1) model = StandardFlexibleScaler(column_wise=False) with self.assertRaises(ValueError): model.fit(X) def test_not_w_mean(self): - """Checks that the matrix normalized `with_mean=False` - does not have a mean.""" + """Checks that the matrix normalized `with_mean=False` does not have a mean.""" X = np.array([2, 2, 3]).reshape(-1, 1) model = StandardFlexibleScaler(with_mean=False) model.fit(X) diff --git a/tests/test_voronoi_fps.py b/tests/test_voronoi_fps.py index 1a2c6b314..d3f0e9e55 100644 --- a/tests/test_voronoi_fps.py +++ b/tests/test_voronoi_fps.py @@ -12,11 +12,9 @@ def setUp(self): super().setUp() def test_restart(self): - """ - This test checks that the model can be restarted with a new number of + """Checks that the model can be restarted with a new number of features and `warm_start` """ - selector = VoronoiFPS(n_to_select=1, initialize=self.idx[0]) selector.fit(self.X) @@ -26,11 +24,9 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) def test_initialize(self): - """ - This test checks that the model can be initialized in all applicable manners + """Checks that the model can be initialized in all applicable manners and throws an error otherwise """ - for initialize in [self.idx[0], "random"]: with self.subTest(initialize=initialize): selector = VoronoiFPS(n_to_select=1, initialize=initialize) @@ -39,13 +35,10 @@ def test_initialize(self): with self.assertRaises(ValueError) as cm: selector = VoronoiFPS(n_to_select=1, initialize="bad") selector.fit(self.X) - self.assertEquals( - str(cm.exception), "Invalid value of the initialize parameter" - ) + self.assertEqual(str(cm.exception), "Invalid value of the initialize parameter") def test_switching_point(self): - """ - This test check work of the switching point calculator into the + """Check work of the switching point calculator into the _init_greedy_search function """ selector = VoronoiFPS(n_to_select=1) @@ -94,9 +87,7 @@ def test_switching_point(self): ) def test_get_distances(self): - """ - This test checks that the hausdorff distances are returnable after fitting - """ + """Checks that the hausdorff distances are returnable after fitting""" selector = VoronoiFPS(n_to_select=1) selector.fit(self.X) _ = selector.get_select_distance() @@ -106,8 +97,7 @@ def test_get_distances(self): _ = selector.get_select_distance() def test_comparison(self): - """ - This test checks that the voronoi FPS strictly computes less distances + """Checks that the voronoi FPS strictly computes less distances than its normal FPS counterpart. 
""" vselector = VoronoiFPS(n_to_select=self.X.shape[0] - 1) @@ -119,9 +109,8 @@ def test_comparison(self): self.assertTrue(np.allclose(vselector.selected_idx_, selector.selected_idx_)) def test_nothing_updated_points(self): - """ - This test checks that in the case where we have no points to update, - the code still works fine + """Checks that in the case where we have no points to update, the code + still works fine """ X = np.array([[1, 1], [4, 4], [10, 10], [100, 100]]) selector = VoronoiFPS(n_to_select=3, initialize=0) @@ -165,7 +154,7 @@ def test_calculate_dSL(self): ) def test_score(self): - """This test check that function score return hausdorff distance""" + """Check that function score return hausdorff distance""" selector = VoronoiFPS(n_to_select=3, initialize=0) selector.fit(self.X) diff --git a/tox.ini b/tox.ini index 5920c6a90..dee237d9a 100644 --- a/tox.ini +++ b/tox.ini @@ -57,6 +57,7 @@ deps = blackdoc flake8 flake8-bugbear + flake8-docstrings flake8-sphinx-links isort sphinx-lint @@ -96,7 +97,16 @@ commands = max_line_length = 88 exclude = docs/src/examples/ +docstring-convention = numpy per-file-ignores = # D205 and D400 are incompatible with the requirements of sphinx-gallery examples/**:D205, D400 -extend-ignore = E203 +ignore = + E203 + D100 + D101 + D102 + D205 + D400 + D401 + W503