From 979924396fe6c078f4b897b84039a7ebe9404da1 Mon Sep 17 00:00:00 2001 From: Philip Loche Date: Thu, 2 May 2024 17:34:41 +0200 Subject: [PATCH] Cleanup docstrings and add basic docs-linter --- .gitignore | 1 + docs/src/sg_execution_times.rst | 73 +++++ examples/reconstruction/PlotGFRE.py | 21 +- examples/reconstruction/PlotLFRE.py | 7 +- examples/reconstruction/PlotPointwiseGFRE.py | 3 +- .../OrthogonalRegressionNonAnalytic.py | 1 - .../regression/Ridge2FoldCVRegularization.py | 20 +- src/skmatter/_selection.py | 282 +++++++----------- src/skmatter/datasets/__init__.py | 2 + src/skmatter/datasets/_base.py | 9 +- src/skmatter/decomposition/_kernel_pcovr.py | 154 ++++------ src/skmatter/decomposition/_pcovr.py | 200 +++++-------- src/skmatter/feature_selection/_base.py | 217 +++++--------- src/skmatter/linear_model/__init__.py | 2 + src/skmatter/linear_model/_base.py | 14 +- src/skmatter/linear_model/_ridge.py | 3 +- src/skmatter/metrics/__init__.py | 5 +- .../metrics/_prediction_rigidities.py | 98 +++--- src/skmatter/model_selection/__init__.py | 2 + src/skmatter/model_selection/_split.py | 48 +-- src/skmatter/preprocessing/__init__.py | 2 +- src/skmatter/preprocessing/_data.py | 87 ++---- src/skmatter/sample_selection/_base.py | 268 +++++++---------- src/skmatter/sample_selection/_voronoi_fps.py | 44 +-- src/skmatter/utils/_orthogonalizers.py | 71 ++--- src/skmatter/utils/_pcovr_utils.py | 19 +- src/skmatter/utils/_progress_bar.py | 15 +- tests/test_check_estimators.py | 1 + tests/test_datasets.py | 20 +- tests/test_dch.py | 10 +- tests/test_feature_pcov_cur.py | 14 +- tests/test_feature_pcov_fps.py | 11 +- tests/test_feature_simple_cur.py | 9 +- tests/test_feature_simple_fps.py | 12 +- tests/test_kernel_normalizer.py | 21 +- tests/test_kernel_pcovr.py | 50 +--- tests/test_orthogonalizers.py | 2 +- tests/test_pcovr.py | 62 ++-- tests/test_progress_bar.py | 2 +- tests/test_sample_pcov_cur.py | 19 +- tests/test_sample_pcov_fps.py | 11 +- tests/test_sample_simple_cur.py | 12 +- tests/test_sample_simple_fps.py | 16 +- tests/test_sparse_kernel_centerer.py | 31 +- tests/test_standard_flexible_scaler.py | 47 +-- tests/test_voronoi_fps.py | 25 +- tox.ini | 12 +- 47 files changed, 824 insertions(+), 1231 deletions(-) create mode 100644 docs/src/sg_execution_times.rst diff --git a/.gitignore b/.gitignore index 15eba83e1f..2269330889 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ __pycache__ build/ dist/ docs/src/examples +sg_execution_times.rst diff --git a/docs/src/sg_execution_times.rst b/docs/src/sg_execution_times.rst new file mode 100644 index 0000000000..14c38e3597 --- /dev/null +++ b/docs/src/sg_execution_times.rst @@ -0,0 +1,73 @@ + +:orphan: + +.. _sphx_glr_sg_execution_times: + + +Computation times +================= +**01:30.435** total execution time for 13 files **from all galleries**: + +.. container:: + + .. raw:: html + + + + + + + + .. 
list-table::
   :header-rows: 1
   :class: table table-striped sg-datatable

   * - Example
     - Time
     - Mem (MB)
   * - :ref:`sphx_glr_examples_regression_Ridge2FoldCVRegularization.py` (``../../examples/regression/Ridge2FoldCVRegularization.py``)
     - 01:25.321
     - 0.0
   * - :ref:`sphx_glr_examples_reconstruction_PlotLFRE.py` (``../../examples/reconstruction/PlotLFRE.py``)
     - 00:02.805
     - 0.0
   * - :ref:`sphx_glr_examples_regression_OrthogonalRegressionNonAnalytic.py` (``../../examples/regression/OrthogonalRegressionNonAnalytic.py``)
     - 00:01.885
     - 0.0
   * - :ref:`sphx_glr_examples_reconstruction_PlotPointwiseGFRE.py` (``../../examples/reconstruction/PlotPointwiseGFRE.py``)
     - 00:00.268
     - 0.0
   * - :ref:`sphx_glr_examples_reconstruction_PlotGFRE.py` (``../../examples/reconstruction/PlotGFRE.py``)
     - 00:00.156
     - 0.0
   * - :ref:`sphx_glr_examples_pcovr_PCovR-WHODataset.py` (``../../examples/pcovr/PCovR-WHODataset.py``)
     - 00:00.000
     - 0.0
   * - :ref:`sphx_glr_examples_pcovr_PCovR.py` (``../../examples/pcovr/PCovR.py``)
     - 00:00.000
     - 0.0
   * - :ref:`sphx_glr_examples_pcovr_PCovR_Regressors.py` (``../../examples/pcovr/PCovR_Regressors.py``)
     - 00:00.000
     - 0.0
   * - :ref:`sphx_glr_examples_pcovr_PCovR_Scaling.py` (``../../examples/pcovr/PCovR_Scaling.py``)
     - 00:00.000
     - 0.0
   * - :ref:`sphx_glr_examples_selection_FeatureSelection-WHODataset.py` (``../../examples/selection/FeatureSelection-WHODataset.py``)
     - 00:00.000
     - 0.0
   * - :ref:`sphx_glr_examples_selection_FeatureSelection.py` (``../../examples/selection/FeatureSelection.py``)
     - 00:00.000
     - 0.0
   * - :ref:`sphx_glr_examples_selection_GCH-ROY.py` (``../../examples/selection/GCH-ROY.py``)
     - 00:00.000
     - 0.0
   * - :ref:`sphx_glr_examples_selection_Selectors-Pipelines.py` (``../../examples/selection/Selectors-Pipelines.py``)
     - 00:00.000
     - 0.0
diff --git a/examples/reconstruction/PlotGFRE.py b/examples/reconstruction/PlotGFRE.py
index 9111c83e50..d1f45106eb 100644
--- a/examples/reconstruction/PlotGFRE.py
+++ b/examples/reconstruction/PlotGFRE.py
@@ -4,20 +4,19 @@
 """
 Global Feature Reconstruction Error (GFRE) and Distortion (GFRD)
 ================================================================
-
-Example for the usage of the :class:`skmatter.metrics.global_reconstruction_error`
-as global feature reconstruction error (GFRE) and
+Example for the usage of the :class:`skmatter.metrics.global_reconstruction_error` as
+global feature reconstruction error (GFRE) and
 :class:`skmatter.metrics.global_reconstruction_distortion` global feature reconstruction
-distortion (GFRD). We apply the global reconstruction measures on the degenerate
-CH4 manifold dataset. This dataset was specifically constructed to be
-representable by a 4-body features (bispectrum) but not by a 3-body features
-(power spectrum). In other words the dataset contains environments which are
-different, but have the same 3-body features. For more details about the dataset
-please refer to `Pozdnyakov 2020 <https://doi.org/10.1103/PhysRevLett.125.166001>`_.
+distortion (GFRD). We apply the global reconstruction measures on the degenerate CH4
+manifold dataset. This dataset was specifically constructed to be representable by
+4-body features (bispectrum) but not by 3-body features (power spectrum). In other
+words, the dataset contains environments which are different, but have the same 3-body
+features. For more details about the dataset please refer to `Pozdnyakov 2020
+<https://doi.org/10.1103/PhysRevLett.125.166001>`_.
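In code, the comparison described above comes down to two calls. The following is a
minimal sketch; the Bunch attributes ``SOAP_power_spectrum`` and ``SOAP_bispectrum``
used here are assumed from the skmatter dataset documentation, not taken from this
patch.

.. code-block:: python

    from skmatter.datasets import load_degenerate_CH4_manifold
    from skmatter.metrics import (
        global_reconstruction_distortion,
        global_reconstruction_error,
    )

    ch4 = load_degenerate_CH4_manifold()
    X_3body = ch4.data.SOAP_power_spectrum  # 3-body (power spectrum) features
    X_4body = ch4.data.SOAP_bispectrum  # 4-body (bispectrum) features

    # GFRE(X, Y): error of linearly reconstructing Y from X; a large value in the
    # 3-body -> 4-body direction reflects the degeneracy of this manifold
    gfre_34 = global_reconstruction_error(X_3body, X_4body)

    # GFRD(X, Y): how much an orthogonal reconstruction distorts the target space
    gfrd_34 = global_reconstruction_distortion(X_3body, X_4body)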
The ``skmatter`` dataset already contains the 3 and 4-body features computed with
-`librascal <https://github.com/lab-cosmo/librascal>`_ so we can load it and
-compare it with the GFRE/GFRD.
+`librascal <https://github.com/lab-cosmo/librascal>`_ so we can load it and compare it
+with the GFRE/GFRD.
 """
 # %%
 #
diff --git a/examples/reconstruction/PlotLFRE.py b/examples/reconstruction/PlotLFRE.py
index de4be77a56..ead5d131f0 100644
--- a/examples/reconstruction/PlotLFRE.py
+++ b/examples/reconstruction/PlotLFRE.py
@@ -3,7 +3,6 @@
 """
 Pointwise Local Reconstruction Error
 ====================================
-
 Example for the usage of the
 :class:`skmatter.metrics.pointwise_local_reconstruction_error` as pointwise local
 reconstruction error (LFRE) on the degenerate CH4 manifold. We apply the local
@@ -14,9 +13,9 @@
 dataset please refer to `Pozdnyakov 2020
 <https://doi.org/10.1103/PhysRevLett.125.166001>`_.
 
-The skmatter dataset already contains the 3 and 4-body features computed with
-`librascal <https://github.com/lab-cosmo/librascal>`_ so we can load it and compare it
-with the LFRE.
+The skmatter dataset already contains the 3 and 4-body features computed with `librascal
+<https://github.com/lab-cosmo/librascal>`_ so we can load it and compare it with the
+LFRE.
 """
 # %%
 #
diff --git a/examples/reconstruction/PlotPointwiseGFRE.py b/examples/reconstruction/PlotPointwiseGFRE.py
index df7662bc48..256b6011c9 100644
--- a/examples/reconstruction/PlotPointwiseGFRE.py
+++ b/examples/reconstruction/PlotPointwiseGFRE.py
@@ -3,8 +3,7 @@
 """
 Pointwise GFRE applied on RKHS features
-================================================================
-
+=======================================
 Example for the usage of the
 :class:`skmatter.metrics.pointwise_global_reconstruction_error` as the pointwise
 global feature reconstruction error (pointwise GFRE). We apply the pointwise global
 feature
diff --git a/examples/regression/OrthogonalRegressionNonAnalytic.py b/examples/regression/OrthogonalRegressionNonAnalytic.py
index c5089df017..586178d6d8 100644
--- a/examples/regression/OrthogonalRegressionNonAnalytic.py
+++ b/examples/regression/OrthogonalRegressionNonAnalytic.py
@@ -3,7 +3,6 @@
 r"""
 Regression with orthogonal projector/matrices
 =============================================
-
 In this example, we explain how when using
 :class:`skmatter.linear_model.OrthogonalRegression` the option
 ``use_orthogonal_projector`` can result in non-analytic behavior. In
diff --git a/examples/regression/Ridge2FoldCVRegularization.py b/examples/regression/Ridge2FoldCVRegularization.py
index 83ad6d9f0f..b4c78cf63f 100644
--- a/examples/regression/Ridge2FoldCVRegularization.py
+++ b/examples/regression/Ridge2FoldCVRegularization.py
@@ -1,16 +1,14 @@
 # %%
 r"""
- Ridge2FoldCV for data with low effective rank
- =======================================================
- In this notebook we explain in more detail how
- :class:`skmatter.linear_model.Ridge2FoldCV` speeds up the
- cross-validation optimizing the regularitzation parameter :param alpha: and
- compare it with existing solution for that in scikit-learn
- :class:`slearn.linear_model.RidgeCV`.
- :class:`skmatter.linear_model.Ridge2FoldCV` was designed to predict
- efficiently feature matrices, but it can be also useful for the prediction
- single targets.
+Ridge2FoldCV for data with low effective rank
+=============================================
+In this notebook we explain in more detail how
+:class:`skmatter.linear_model.Ridge2FoldCV` speeds up the cross-validation optimizing
+the regularization parameter ``alpha`` and compare it with the existing solution in
+scikit-learn, :class:`sklearn.linear_model.RidgeCV`.
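As a minimal sketch of the comparison this example performs; the ``alphas`` grid, the
data shapes, and passing the grid to ``Ridge2FoldCV`` the same way as to ``RidgeCV``
are illustrative assumptions, not values from the notebook.

.. code-block:: python

    import numpy as np
    from sklearn.linear_model import RidgeCV
    from skmatter.linear_model import Ridge2FoldCV

    rng = np.random.default_rng(0)
    X = rng.standard_normal((100, 500))  # n_samples < n_features: low effective rank
    y = X @ rng.standard_normal((500, 10))  # multi-target, feature-matrix-like y

    alphas = np.geomspace(1e-9, 1e-1, 9)
    ridge_sklearn = RidgeCV(alphas=alphas, fit_intercept=False).fit(X, y)
    ridge_2fold = Ridge2FoldCV(alphas=alphas).fit(X, y)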
+:class:`skmatter.linear_model.Ridge2FoldCV` was designed to predict feature
+matrices efficiently, but it can also be useful for the prediction of single targets.
 """
 # %%
 #
@@ -128,6 +126,7 @@
 
 
 def micro_bench(ridge):
+    """A small benchmark function."""
     global N_REPEAT_MICRO_BENCH, X, y
     timings = []
     train_mse = []
@@ -177,6 +176,7 @@ def micro_bench(ridge):
 
 
 def get_train_test_error(estimator):
+    """The train and test errors based on the estimator."""
     global X_train, y_train, X_test, y_test
     estimator = estimator.fit(X_train, y_train)
     return (
diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index 95e43ed151..69aad73c3b 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -1,13 +1,13 @@
 """
-This module contains data sub-selection modules primarily corresponding to methods
-derived from CUR matrix decomposition and Farthest Point Sampling. In their classical
-form, CUR and FPS determine a data subset that maximizes the variance (CUR) or
-distribution (FPS) of the features or samples. These methods can be modified to combine
-supervised target information denoted by the methods `PCov-CUR` and `PCov-FPS`. For
-further reading, refer to [Imbalzano2018]_ and [Cersonsky2021]_. These selectors can be
-used for both feature and sample selection, with similar instantiations. All
-sub-selection methods scores each feature or sample (without an estimator) and chooses
-that with the maximum score. A simple example of usage:
+Data sub-selection modules primarily corresponding to methods derived from CUR matrix
+decomposition and Farthest Point Sampling. In their classical form, CUR and FPS
+determine a data subset that maximizes the variance (CUR) or distribution (FPS) of the
+features or samples. These methods can be modified to combine supervised target
+information denoted by the methods `PCov-CUR` and `PCov-FPS`. For further reading, refer
+to [Imbalzano2018]_ and [Cersonsky2021]_. These selectors can be used for both feature
+and sample selection, with similar instantiations. All sub-selection methods score
+each feature or sample (without an estimator) and choose the one with the maximum
+score. A simple example of usage:
 
 .. doctest::
 
@@ -98,60 +98,49 @@
 
 
 class GreedySelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
-    """
-
-    Transformer that adds, via greedy forward selection,
+    """Transformer that adds, via greedy forward selection,
     features or samples to form a subset. At each stage, the model
     scores each feature or sample (without an estimator) and chooses
     that with the maximum score.
 
     Parameters
     ----------
-
     selection_type : str, {'feature', 'sample'}
         whether to choose a subset of columns ('feature') or rows ('sample').
         Stored in :py:attr:`self._axis_name` (as text) and :py:attr:`self._axis`
         (as 0 or 1 for 'sample' or 'feature', respectively).
-
     n_to_select : int or float, default=None
         The number of selections to make. If `None`, half of the features or samples
         are selected. If integer, the parameter is the absolute number of selections
         to make. If float between 0 and 1, it is the fraction of the total dataset to
         select. Stored in :py:attr:`self.n_to_select`.
-
     score_threshold : float, default=None
         Threshold for the score. If `None` selection will continue until the
         n_to_select is chosen. Otherwise will stop when the score falls below the
         threshold. Stored in :py:attr:`self.score_threshold`.
-
     score_threshold_type : str, default="absolute"
        How to interpret the ``score_threshold``. 
When "absolute", the score used by the selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the score of the first selection, i.e. the selector quits when ``current_score / first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. - progress_bar: bool, default=False option to use `tqdm `_ progress bar to monitor selections. Stored in :py:attr:`self.report_progress_`. - full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining features. Stored in :py:attr:`self.full`. - - random_state: int or RandomState instance, default=0 + random_state: int or :class`numpy.random`RandomState` instance, default=0 Attributes ---------- n_selected_ : int - Counter tracking the number of selections that have been made - X_selected_ : ndarray, - Matrix containing the selected samples or features, for use in fitting - y_selected_ : ndarray, - In sample selection, the matrix containing the selected targets, for - use in fitting - - + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray, + Matrix containing the selected samples or features, for use in fitting + y_selected_ : numpy.ndarray, + In sample selection, the matrix containing the selected targets, for use in + fitting """ def __init__( @@ -195,7 +184,6 @@ def fit(self, X, y=None, warm_start=False): ------- self : object """ - if self.selection_type == "feature": self._axis = 1 elif self.selection_type == "sample": @@ -311,7 +299,6 @@ def transform(self, X, y=None): X_r : ndarray The selected subset of the input. """ - check_is_fitted(self, ["_axis", "selected_idx_", "n_selected_"]) if self._axis == 0: @@ -396,7 +383,6 @@ def get_support(self, indices=False, ordered=False): def _init_greedy_search(self, X, y, n_to_select): """Initializes the search. Prepares an array to store the selected features.""" - self.n_selected_ = 0 self.first_score_ = None @@ -413,7 +399,6 @@ def _init_greedy_search(self, X, y, n_to_select): def _continue_greedy_search(self, X, y, n_to_select): """Continues the search. Prepares an array to store the selected features.""" - n_pad = [(0, 0), (0, 0)] n_pad[self._axis] = (0, n_to_select - self.n_selected_) @@ -455,10 +440,9 @@ def _get_best_new_selection(self, scorer, X, y): return max_score_idx def _update_post_selection(self, X, y, last_selected): + """Saves the most recently selected feature and increments the feature + counter. """ - Saves the most recently selected feature and increments the feature counter - """ - if self._axis == 1: self.X_selected_[:, self.n_selected_] = np.take( X, last_selected, axis=self._axis @@ -508,30 +492,26 @@ class _CUR(GreedySelector): which maximize the magnitude of the right or left singular vectors, consistent with classic CUR matrix decomposition. - **WARNING**: This base class should never be directly instantiated. - Instead, use :py:class:`skmatter.feature_selection.CUR` and - :py:class:`skmatter.sample_selection.CUR`, - which have the same constructor signature. + Warning:: + This base class should never be directly instantiated. Instead, use + :py:class:`skmatter.feature_selection.CUR` and + :py:class:`skmatter.sample_selection.CUR`, which have the same constructor + signature. 
Parameters ---------- recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - + number of steps after which to recompute the pi score + defaults to 1, if 0 no re-computation is done k : int number of eigenvectors to compute the importance score with, defaults to 1 - tolerance: float - threshold below which scores will be considered 0, defaults to 1E-12 - + threshold below which scores will be considered 0, defaults to 1E-12 Attributes ---------- - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - + The original matrix orthogonalized by previous selections """ def __init__( @@ -562,47 +542,40 @@ def __init__( ) def score(self, X, y=None): - r""" - Returns the importance score of the given samples or features. + r"""Returns the importance score of the given samples or features. - NOTE: This function does not compute the importance score each time it - is called, in order to avoid unnecessary computations. This is done - by :py:func:`self._compute_pi`. + Note:: + This function does not compute the importance score each time it is called, + in order to avoid unnecessary computations. This is done by + :py:func:`self._compute_pi`. Parameters ---------- - X : ndarray of shape [n_samples, n_features] + X : numpy.ndarray of shape [n_samples, n_features] The input samples. - y : ignored Returns ------- - score : ndarray of (n_to_select_from_) + score : numpy.ndarray of (n_to_select_from_) :math:`\pi` importance for the given samples or features - """ - return self.pi_ def _init_greedy_search(self, X, y, n_to_select): - """ - Initializes the search. Prepares an array to store the selected + """Initializes the search. Prepares an array to store the selected features and computes their initial importance score. """ - self.X_current_ = as_float_array(X.copy()) self.pi_ = self._compute_pi(self.X_current_) super()._init_greedy_search(X, y, n_to_select) def _continue_greedy_search(self, X, y, n_to_select): + """Continues the search. Prepares an array to store the selected features, + orthogonalizes the features by those already selected, and computes their + initial importance. """ - Continues the search. Prepares an array to store the selected - features, orthogonalizes the features by those already selected, - and computes their initial importance. - """ - for c in self.selected_idx_: if self.recompute_every != 0 and ( np.linalg.norm(np.take(self.X_current_, [c], axis=self._axis)) @@ -615,22 +588,19 @@ def _continue_greedy_search(self, X, y, n_to_select): super()._continue_greedy_search(X, y, n_to_select) def _compute_pi(self, X, y=None): - """ - For feature selection, the importance score :math:`\\pi` is the sum over + r"""For feature selection, the importance score :math:`\pi` is the sum over the squares of the first :math:`k` components of the right singular vectors .. math:: - \\pi_j = \\sum_i^k \\left(\\mathbf{U}_\\mathbf{C}\\right)_{ij}^2. where :math:`\\mathbf{C} = \\mathbf{X}^T\\mathbf{X}`. - For sample selection, the importance score :math:`\\pi` is the sum over - the squares of the first :math:`k` components of the right singular vectors + For sample selection, the importance score :math:`\\pi` is the sum over the + squares of the first :math:`k` components of the right singular vectors .. math:: - \\pi_j = \\sum_i^k \\left(\\mathbf{U}_\\mathbf{K}\\right)_{ij}^2. 
@@ -638,17 +608,15 @@ def _compute_pi(self, X, y=None): Parameters ---------- - X : ndarray of shape [n_samples, n_features] + X : numpy.ndarray of shape [n_samples, n_features] The input samples. - y : ignored Returns ------- - pi : ndarray of (n_to_select_from_) + pi : numpy.ndarray of (n_to_select_from_) :math:`\\pi` importance for the given samples or features """ - svd_kwargs = dict(k=self.k, random_state=self.random_state) if self._axis == 0: svd_kwargs["return_singular_vectors"] = "u" @@ -690,7 +658,7 @@ def _orthogonalize(self, last_selected): class _PCovCUR(GreedySelector): - """Transformer that performs Greedy Selection by choosing features + r"""Transformer that performs Greedy Selection by choosing features which maximize the magnitude of the right or left augmented singular vectors. This is done by employing the augmented kernel and covariance matrices, @@ -702,29 +670,22 @@ class _PCovCUR(GreedySelector): Parameters ---------- recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - + number of steps after which to recompute the pi score defaults to 1, if 0 no + re-computation is done k : int number of eigenvectors to compute the importance score with, defaults to 1 - tolerance: float - threshold below which scores will be considered 0, defaults to 1E-12 - + threshold below which scores will be considered 0, defaults to 1E-12 mixing: float, default=0.5 - The PCovR mixing parameter, as described in PCovR as - :math:`{\\alpha}`. Stored in :py:attr:`self.mixing`. + The PCovR mixing parameter, as described in PCovR as + :math:`{\alpha}`. Stored in :py:attr:`self.mixing`. Attributes ---------- - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - + The original matrix orthogonalized by previous selections y_current_ : ndarray (n_samples, n_properties) - The targets orthogonalized by a regression on - the previous selections. - + The targets orthogonalized by a regression on the previous selections. """ def __init__( @@ -758,34 +719,29 @@ def __init__( ) def score(self, X, y=None): - """ - Returns the importance score of the given samples or features. + r"""Returns the importance score of the given samples or features. - NOTE: This function does not compute the importance score each time it - is called, in order to avoid unnecessary computations. This is done - by :py:func:`self._compute_pi`. + Note:: + This function does not compute the importance score each time it is called, + in order to avoid unnecessary computations. This is done by + :py:func:`self._compute_pi`. Parameters ---------- X : ignored - y : ignored Returns ------- score : ndarray of (n_to_select_from_) - :math:`\\pi` importance for the given samples or features - + :math:`\pi` importance for the given samples or features """ - return self.pi_ def _init_greedy_search(self, X, y, n_to_select): - """ - Initializes the search. Prepares an array to store the selected + """Initializes the search. Prepares an array to store the selected features and computes their initial importance score. """ - self.X_ref_ = X self.y_ref_ = y self.X_current_ = X.copy() @@ -798,12 +754,10 @@ def _init_greedy_search(self, X, y, n_to_select): super()._init_greedy_search(X, y, n_to_select) def _continue_greedy_search(self, X, y, n_to_select): + """Continues the search. Prepares an array to store the selected + features, orthogonalizes the features by those already selected, and computes + their initial importance. 
""" - Continues the search. Prepares an array to store the selected - features, orthogonalizes the features by those already selected, - and computes their initial importance. - """ - for c in self.selected_idx_: if self.recompute_every != 0 and ( np.linalg.norm(np.take(self.X_current_, [c], axis=self._axis)) @@ -832,12 +786,10 @@ def _update_post_selection(self, X, y, last_selected): self.pi_[last_selected] = 0.0 def _compute_pi(self, X, y=None): - r""" - For feature selection, the importance score :math:`\pi` is the sum over - the squares of the first :math:`k` components of the right singular vectors + r"""For feature selection, the importance score :math:`\pi` is the sum over + the squares of the first :math:`k` components of the right singular vectors. .. math:: - \pi_j = \sum_i^k \left(\mathbf{U}_\mathbf{\tilde{C}}\right)_{ij}^2. @@ -852,7 +804,6 @@ def _compute_pi(self, X, y=None): the squares of the first :math:`k` components of the right singular vectors .. math:: - \pi_j = \sum_i^k \left(\mathbf{U}_\mathbf{\tilde{K}}\right)_{ij}^2. @@ -863,17 +814,15 @@ def _compute_pi(self, X, y=None): Parameters ---------- - X : ndarray of shape [n_samples, n_features] + X : numpy.ndarray of shape [n_samples, n_features] The input samples. - y : ignored Returns ------- - pi : ndarray of (n_to_select_from_) + pi : numpy.ndarray of (n_to_select_from_) :math:`\pi` importance for the given samples or features """ - if self._axis == 0: pcovr_distance = pcovr_kernel( self.mixing, @@ -923,22 +872,19 @@ def _orthogonalize(self, last_selected): class _FPS(GreedySelector): - """ - Transformer that performs Greedy Selection using Farthest Point Sampling. + """Transformer that performs Greedy Selection using Farthest Point Sampling. - **WARNING**: This base class should never be directly instantiated. - Instead, use :py:class:`skmatter.feature_selection.FPS` and - :py:class:`skmatter.sample_selection.FPS`, - which have the same constructor signature. + Warning:: + This base class should never be directly instantiated. Instead, use + :py:class:`skmatter.feature_selection.FPS` and + :py:class:`skmatter.sample_selection.FPS`, which have the same constructor + signature. Parameters ---------- - initialize: int, list of int, or 'random', default=0 - Index of the first selection(s). If 'random', picks a random - value when fit starts. Stored in :py:attr:`self.initialize`. - - + Index of the first selection(s). If 'random', picks a random value when fit + starts. Stored in :py:attr:`self.initialize`. """ def __init__( @@ -984,54 +930,45 @@ def score(self, X, y=None): return self.hausdorff_ def get_distance(self): - """ - - Traditional FPS employs a column-wise Euclidean + r"""Traditional FPS employs a column-wise Euclidean distance for feature selection, which can be expressed using the covariance - matrix :math:`\\mathbf{C} = \\mathbf{X} ^ T \\mathbf{X}` + matrix :math:`\mathbf{C} = \mathbf{X} ^ T \mathbf{X}` .. math:: - \\operatorname{d}_c(i, j) = C_{ii} - 2 C_{ij} + C_{jj}. + \operatorname{d}_c(i, j) = C_{ii} - 2 C_{ij} + C_{jj}. For sample selection, this is a row-wise Euclidean distance, which can be expressed in terms of the Gram matrix - :math:`\\mathbf{K} = \\mathbf{X} \\mathbf{X} ^ T` + :math:`\\mathbf{K} = \mathbf{X} \\mathbf{X} ^ T` .. math:: - \\operatorname{d}_r(i, j) = K_{ii} - 2 K_{ij} + K_{jj}. + \operatorname{d}_r(i, j) = K_{ii} - 2 K_{ij} + K_{jj}. Returns ------- - hausdorff : ndarray of shape (`n_to_select_from_`) - the minimum distance from each point to the set of selected - points. 
once a point is selected, the distance is not updated;
-            the final list will reflect the distances when selected.
-
+            the minimum distance from each point to the set of selected points. Once a
+            point is selected, the distance is not updated; the final list will reflect
+            the distances when selected.
        """
        return self.hausdorff_

    def get_select_distance(self):
        """
-
        Returns
        -------
-
        hausdorff_at_select : ndarray of shape (`n_to_select`)
            at the time of selection, the minimum distance from each
            selected point to the set of previously selected points.
-
        """
        mask = self.get_support(indices=True, ordered=True)
        return self.hausdorff_at_select_[mask]

    def _init_greedy_search(self, X, y, n_to_select):
+        """Initializes the search. Prepares an array to store the selections,
+        makes the initial selection (unless provided), and computes the starting
+        Hausdorff distances.
        """
-        Initializes the search. Prepares an array to store the selections,
-        makes the initial selection (unless provided), and
-        computes the starting hausdorff distances.
-        """
-
        super()._init_greedy_search(X, y, n_to_select)

        self.norms_ = (X**2).sum(axis=abs(self._axis - 1))
@@ -1082,25 +1019,20 @@ def _update_post_selection(self, X, y, last_selected):


class _PCovFPS(GreedySelector):
-    """
-    Transformer that performs Greedy Selection using PCovR-weighted
-    Farthest Point Sampling.
-    In PCov-FPS, a modified covariance or Gram matrix
-    is used to express the distances.
+    r"""Transformer that performs Greedy Selection using PCovR-weighted
+    Farthest Point Sampling. In PCov-FPS, a modified covariance or Gram matrix is used
+    to express the distances.

    For sample selection, this is a modified kernel matrix.

    Parameters
    ----------
-
    mixing: float, default=0.5
        The PCovR mixing parameter, as described in PCovR as
-        :math:`{\\alpha}`
-
+        :math:`{\alpha}`
    initialize: int or 'random', default=0
        Index of the first selection. If 'random', picks a random
        value when fit starts.
-
    """

    def __init__(
@@ -1135,8 +1067,7 @@ def __init__(
        )

    def score(self, X, y=None):
-        """
-        Returns the Hausdorff distances of all samples to previous selections
+        """Returns the Hausdorff distances of all samples to previous selections.

        NOTE: This function does not compute the importance score each time it
        is called, in order to avoid unnecessary computations. The hausdorff
        distance is updated after each fit
@@ -1155,39 +1086,31 @@

    def get_distance(self):
        """
-
        Returns
        -------
-
        hausdorff : ndarray of shape (`n_to_select_from_`)
-            the minimum distance from each point to the set of selected
-            points. once a point is selected, the distance is not updated;
-            the final list will reflect the distances when selected.
-
+            the minimum distance from each point to the set of selected points. Once a
+            point is selected, the distance is not updated; the final list will reflect
+            the distances when selected.
        """
        return self.hausdorff_

    def get_select_distance(self):
        """
-
        Returns
        -------
-
        hausdorff_at_select : ndarray of shape (`n_to_select`)
-            at the time of selection, the minimum distance from each
-            selected point to the set of previously selected points.
-
+            at the time of selection, the minimum distance from each selected point to
+            the set of previously selected points.
        """
        mask = self.get_support(indices=True, ordered=True)
        return self.hausdorff_at_select_[mask]

    def _init_greedy_search(self, X, y, n_to_select):
+        """Initializes the search. Prepares an array to store the selections,
+        makes the initial selection (unless provided), and computes the starting
+        Hausdorff distances. 
""" - Initializes the search. Prepares an array to store the selections, - makes the initial selection (unless provided), and - computes the starting hausdorff distances. - """ - super()._init_greedy_search(X, y, n_to_select) if self._axis == 1: @@ -1224,17 +1147,14 @@ def _update_hausdorff(self, X, y, last_selected): np.minimum(self.hausdorff_, new_dist, self.hausdorff_) def _update_post_selection(self, X, y, last_selected): - """ - Saves the most recent selections, increments the counter, - and, recomputes hausdorff distances. + """Saves the most recent selections, increments the counter, and, recomputes + hausdorff distances. """ self._update_hausdorff(X, y, last_selected) super()._update_post_selection(X, y, last_selected) def _more_tags(self): - """ - Pass that this method requires a target vector - """ + """Pass that this method requires a target vector""" return { "requires_y": True, } diff --git a/src/skmatter/datasets/__init__.py b/src/skmatter/datasets/__init__.py index c10e90e245..c721131959 100644 --- a/src/skmatter/datasets/__init__.py +++ b/src/skmatter/datasets/__init__.py @@ -1,3 +1,5 @@ +"""Datasets used for example and testing.""" + from ._base import ( load_csd_1000r, load_degenerate_CH4_manifold, diff --git a/src/skmatter/datasets/_base.py b/src/skmatter/datasets/_base.py index e20e8887d0..b3ff8b9f5f 100644 --- a/src/skmatter/datasets/_base.py +++ b/src/skmatter/datasets/_base.py @@ -6,6 +6,7 @@ def load_nice_dataset(): """Load and returns NICE dataset. + Returns ------- nice_data : sklearn.utils.Bunch @@ -16,7 +17,6 @@ def load_nice_dataset(): DESCR: `str` -- The full description of the dataset. """ - module_path = dirname(__file__) target_filename = join(module_path, "data", "nice_dataset.npz") raw_data = np.load(target_filename) @@ -92,6 +92,7 @@ def load_csd_1000r(return_X_y=False): def load_who_dataset(): """Load and returns WHO dataset. + Returns ------- who_dataset : sklearn.utils.Bunch @@ -100,7 +101,6 @@ def load_who_dataset(): as a Pandas dataframe. DESCR: `str` -- The full description of the dataset. """ - module_path = dirname(__file__) target_filename = join(module_path, "data", "who_dataset.csv") pd = check_pandas_support("load_who_dataset") @@ -112,8 +112,8 @@ def load_who_dataset(): def load_roy_dataset(): """Load and returns the ROY dataset, which contains structures, - energies and SOAP-derived descriptors for 264 polymorphs of ROY, - from [Beran et Al, Chemical Science (2022)](https://doi.org/10.1039/D1SC06074K) + energies and SOAP-derived descriptors for 264 polymorphs of ROY, from [Beran et Al, + Chemical Science (2022)](https://doi.org/10.1039/D1SC06074K) Returns ------- @@ -123,7 +123,6 @@ def load_roy_dataset(): features: `np.array` -- SOAP-derived descriptors for the structures energies: `np.array` -- energies of the structures """ - module_path = dirname(__file__) target_structures = join(module_path, "data", "beran_roy_structures.xyz.bz2") diff --git a/src/skmatter/decomposition/_kernel_pcovr.py b/src/skmatter/decomposition/_kernel_pcovr.py index eba8c1ccc9..3b85000fd7 100644 --- a/src/skmatter/decomposition/_kernel_pcovr.py +++ b/src/skmatter/decomposition/_kernel_pcovr.py @@ -19,17 +19,15 @@ class KernelPCovR(_BasePCA, LinearModel): - r""" - Kernel Principal Covariates Regression, as described in [Helfrecht2020]_ - determines a latent-space projection :math:`\mathbf{T}` which - minimizes a combined loss in supervised and unsupervised tasks in the - reproducing kernel Hilbert space (RKHS). 
+ r"""Kernel Principal Covariates Regression, as described in [Helfrecht2020]_ + determines a latent-space projection :math:`\mathbf{T}` which minimizes a combined + loss in supervised and unsupervised tasks in the reproducing kernel Hilbert space + (RKHS). - This projection is determined by the eigendecomposition of a modified gram - matrix :math:`\mathbf{\tilde{K}}` + This projection is determined by the eigendecomposition of a modified gram matrix + :math:`\mathbf{\tilde{K}}` .. math:: - \mathbf{\tilde{K}} = \alpha \mathbf{K} + (1 - \alpha) \mathbf{\hat{Y}}\mathbf{\hat{Y}}^T @@ -40,15 +38,13 @@ class KernelPCovR(_BasePCA, LinearModel): Parameters ---------- - mixing: float, default=0.5 + mixing : float, default=0.5 mixing parameter, as described in PCovR as :math:`{\\alpha}` - - n_components: int, float or str, default=None + n_components : int, float or str, default=None Number of components to keep. if n_components is not set all components are kept:: n_components == n_samples - svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto' If auto : The solver is selected by a default policy based on `X.shape` and @@ -66,7 +62,6 @@ class KernelPCovR(_BasePCA, LinearModel): 0 < n_components < min(X.shape) If randomized : run randomized SVD by the method of Halko et al. - regressor : {instance of `sklearn.kernel_ridge.KernelRidge`, `precomputed`, None}, default=None The regressor to use for computing the property predictions :math:`\\hat{\\mathbf{Y}}`. @@ -77,76 +72,58 @@ class KernelPCovR(_BasePCA, LinearModel): If `precomputed`, we assume that the `y` passed to the `fit` function is the regressed form of the targets :math:`{\mathbf{\hat{Y}}}`. - - - kernel: "linear" | "poly" | "rbf" | "sigmoid" | "cosine" | "precomputed" + kernel : "linear" | "poly" | "rbf" | "sigmoid" | "cosine" | "precomputed" Kernel. Default="linear". - - gamma: float, default=None + gamma : float, default=None Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other kernels. - - degree: int, default=3 + degree : int, default=3 Degree for poly kernels. Ignored by other kernels. - - coef0: float, default=1 + coef0 : float, default=1 Independent term in poly and sigmoid kernels. Ignored by other kernels. - - kernel_params: mapping of str to any, default=None + kernel_params : mapping of str to any, default=None Parameters (keyword arguments) and values for kernel passed as callable object. Ignored by other kernels. - - center: bool, default=False + center : bool, default=False Whether to center any computed kernels - - fit_inverse_transform: bool, default=False + fit_inverse_transform : bool, default=False Learn the inverse transform for non-precomputed kernels. (i.e. learn to find the pre-image of a point) - - tol: float, default=1e-12 + tol : float, default=1e-12 Tolerance for singular values computed by svd_solver == 'arpack' and for matrix inversions. Must be of range [0.0, infinity). - - n_jobs: int, default=None + n_jobs : int, default=None The number of parallel jobs to run. :obj:`None` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. - iterated_power : int or 'auto', default='auto' Number of iterations for the power method computed by svd_solver == 'randomized'. Must be of range [0, infinity). - - random_state : int, RandomState instance or None, default=None + random_state : int, :class:`numpy.random.RandomState` instance or None, default=None Used when the 'arpack' or 'randomized' solvers are used. 
Pass an int for reproducible results across multiple function calls.

    Attributes
    ----------
-
-    pt__: ndarray of size :math:`({n_{components}, n_{components}})`
+    pt__: numpy.ndarray of size :math:`({n_{components}, n_{components}})`
        pseudo-inverse of the latent-space projection, which
        can be used to construct projectors from latent-space
-
-    pkt_: ndarray of size :math:`({n_{samples}, n_{components}})`
+    pkt_: numpy.ndarray of size :math:`({n_{samples}, n_{components}})`
        the projector, or weights, from the input kernel :math:`\\mathbf{K}`
        to the latent-space projection :math:`\\mathbf{T}`
-
-    pky_: ndarray of size :math:`({n_{samples}, n_{properties}})`
+    pky_: numpy.ndarray of size :math:`({n_{samples}, n_{properties}})`
        the projector, or weights, from the input kernel :math:`\\mathbf{K}`
        to the properties :math:`\\mathbf{Y}`
-
-    pty_: ndarray of size :math:`({n_{components}, n_{properties}})`
+    pty_: numpy.ndarray of size :math:`({n_{components}, n_{properties}})`
        the projector, or weights, from the latent-space projection
        :math:`\\mathbf{T}` to the properties :math:`\\mathbf{Y}`
-
-    ptx_: ndarray of size :math:`({n_{components}, n_{features}})`
+    ptx_: numpy.ndarray of size :math:`({n_{components}, n_{features}})`
        the projector, or weights, from the latent-space projection
        :math:`\\mathbf{T}` to the feature matrix :math:`\\mathbf{X}`
-
-    X_fit_: ndarray of shape (n_samples, n_features)
+    X_fit_: numpy.ndarray of shape (n_samples, n_features)
        The data used to fit the model. This attribute is used to build kernels
        from new data.

@@ -235,10 +212,7 @@ def _get_kernel(self, X, Y=None):
        )

    def _fit(self, K, Yhat, W):
-        """
-        Fit the model with the computed kernel and approximated properties.
-        """
-
+        """Fit the model with the computed kernel and approximated properties."""
        K_tilde = pcovr_kernel(mixing=self.mixing, X=K, Y=Yhat, kernel="precomputed")

        if self._fit_svd_solver == "full":
@@ -262,22 +236,19 @@ def _fit(self, K, Yhat, W):
        self.pt__ = np.linalg.lstsq(T, np.eye(T.shape[0]), rcond=self.tol)[0]

    def fit(self, X, Y, W=None):
-        """
-
-        Fit the model with X and Y.
+        r"""Fit the model with X and Y.

        Parameters
        ----------
-        X: ndarray, shape (n_samples, n_features)
+        X : numpy.ndarray, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

-            It is suggested that :math:`\\mathbf{X}` be centered by its column-
+            It is suggested that :math:`\mathbf{X}` be centered by its column-
            means and scaled. If features are related, the matrix should be scaled
            to have unit variance, otherwise :math:`\\mathbf{X}` should be
            scaled so that each feature has a variance of 1 / n_features.
-
-        Y: ndarray, shape (n_samples, n_properties)
+        Y : numpy.ndarray, shape (n_samples, n_properties)
            Training data, where n_samples is the number of samples
            and n_properties is the number of properties

            It is suggested that :math:`\\mathbf{X}` be centered by its column-
            means and scaled. If features are related, the matrix should be scaled
            to have unit variance, otherwise :math:`\\mathbf{Y}` should be
            scaled so that each feature has a variance of 1 / n_features.
-
-        W : ndarray, shape (n_samples, n_properties)
+        W : numpy.ndarray, shape (n_samples, n_properties)
            Regression weights, optional when regressor=`precomputed`. If not
            passed, it is assumed that `W = np.linalg.lstsq(K, Y, self.tol)[0]`

        Returns
        -------
        self: object
            Returns the instance itself. 
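For orientation, fitting and using the model typically looks like the following
sketch; the RBF kernel settings and the paired ``KernelRidge`` regressor are
illustrative assumptions (the class requires the regressor's kernel parameters to
match its own).

.. code-block:: python

    import numpy as np
    from sklearn.kernel_ridge import KernelRidge
    from skmatter.decomposition import KernelPCovR

    rng = np.random.default_rng(0)
    X = rng.standard_normal((50, 5))
    Y = rng.standard_normal((50, 2))

    kpcovr = KernelPCovR(
        mixing=0.5,
        n_components=2,
        kernel="rbf",
        gamma=1.0,
        regressor=KernelRidge(kernel="rbf", gamma=1.0),
    )
    kpcovr.fit(X, Y)
    T = kpcovr.transform(X)  # latent-space projection from the kernel
    Y_hat = kpcovr.predict(X)  # property predictions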
- """ - if self.regressor not in ["precomputed", None] and not isinstance( self.regressor, KernelRidge ): @@ -417,7 +385,6 @@ def fit(self, X, Y, W=None): def predict(self, X=None): """Predicts the property values""" - check_is_fitted(self, ["pky_", "pty_"]) X = check_array(X) @@ -428,20 +395,17 @@ def predict(self, X=None): return K @ self.pky_ def transform(self, X): - """ - Apply dimensionality reduction to X. + """Apply dimensionality reduction to X. - X is projected on the first principal components as determined by the + ``X`` is projected on the first principal components as determined by the modified Kernel PCovR distances. Parameters ---------- - X: ndarray, shape (n_samples, n_features) + X : numpy.ndarray, shape (n_samples, n_features) New data, where n_samples is the number of samples and n_features is the number of features. - """ - check_is_fitted(self, ["pkt_", "X_fit_"]) X = check_array(X) @@ -453,13 +417,11 @@ def transform(self, X): return K @ self.pkt_ def inverse_transform(self, T): - """Transform input data back to its original space. + r"""Transform input data back to its original space. .. math:: - - \\mathbf{\\hat{X}} = \\mathbf{T} \\mathbf{P}_{TX} - = \\mathbf{K} \\mathbf{P}_{KT} \\mathbf{P}_{TX} - + \mathbf{\\hat{X}} = \mathbf{T} \mathbf{P}_{TX} + = \mathbf{K} \mathbf{P}_{KT} \mathbf{P}_{TX} Similar to KPCA, the original features are not always recoverable, as the projection is computed from the kernel features, not the original @@ -468,29 +430,25 @@ def inverse_transform(self, T): Parameters ---------- - T: ndarray, shape (n_samples, n_components) - Projected data, where n_samples is the number of samples - and n_components is the number of components. + T : numpy.ndarray, shape (n_samples, n_components) + Projected data, where n_samples is the number of samples and n_components is + the number of components. Returns ------- - X_original ndarray, shape (n_samples, n_features) + X_original : numpy.ndarray, shape (n_samples, n_features) """ - return T @ self.ptx_ def score(self, X, Y): - r""" - Computes the (negative) loss values for KernelPCovR on the given predictor and - response variables. The loss in :math:`\mathbf{K}`, as explained in + r"""Computes the (negative) loss values for KernelPCovR on the given predictor + and response variables. The loss in :math:`\mathbf{K}`, as explained in [Helfrecht2020]_ does not correspond to a traditional Gram loss - :math:`\mathbf{K} - \mathbf{TT}^T`. Indicating the kernel between set - A and B as :math:`\mathbf{K}_{AB}`, - the projection of set A as :math:`\mathbf{T}_A`, and with N and V as the - train and validation/test set, one obtains + :math:`\mathbf{K} - \mathbf{TT}^T`. Indicating the kernel between set A and B as + :math:`\mathbf{K}_{AB}`, the projection of set A as :math:`\mathbf{T}_A`, and + with N and V as the train and validation/test set, one obtains .. math:: - \ell=\frac{\operatorname{Tr}\left[\mathbf{K}_{VV} - 2 \mathbf{K}_{VN} \mathbf{T}_N (\mathbf{T}_N^T \mathbf{T}_N)^{-1} \mathbf{T}_V^T @@ -498,21 +456,21 @@ def score(self, X, Y): \mathbf{K}_{NN} \mathbf{T}_N (\mathbf{T}_N^T \mathbf{T}_N)^{-1} \mathbf{T}_V^T\right]}{\operatorname{Tr}(\mathbf{K}_{VV})} - The negative loss is returned for easier use in sklearn pipelines, e.g., a - grid search, where methods named 'score' are meant to be maximized. 
- - Arguments - --------- - X: independent (predictor) variable - Y: dependent (response) variable + The negative loss is returned for easier use in sklearn pipelines, e.g., a grid + search, where methods named 'score' are meant to be maximized. + Parameters + ---------- + X : numpy.ndarray + independent (predictor) variable + Y : numpy.ndarray + dependent (response) variable Returns ------- - L: Negative sum of the KPCA and KRR losses, with the KPCA loss - determined by the reconstruction of the kernel - + L : float + Negative sum of the KPCA and KRR losses, with the KPCA loss determined by + the reconstruction of the kernel """ - check_is_fitted(self, ["pkt_", "X_fit_"]) X = check_array(X) diff --git a/src/skmatter/decomposition/_pcovr.py b/src/skmatter/decomposition/_pcovr.py index 01a385e437..929f137e23 100644 --- a/src/skmatter/decomposition/_pcovr.py +++ b/src/skmatter/decomposition/_pcovr.py @@ -19,9 +19,7 @@ class PCovR(_BasePCA, LinearModel): - r""" - - Principal Covariates Regression, as described in [deJong1992]_ + r"""Principal Covariates Regression, as described in [deJong1992]_ determines a latent-space projection :math:`\mathbf{T}` which minimizes a combined loss in supervised and unsupervised tasks. @@ -29,7 +27,6 @@ class PCovR(_BasePCA, LinearModel): matrix :math:`\mathbf{\tilde{K}}` .. math:: - \mathbf{\tilde{K}} = \alpha \mathbf{X} \mathbf{X}^T + (1 - \alpha) \mathbf{\hat{Y}}\mathbf{\hat{Y}}^T @@ -42,7 +39,6 @@ class PCovR(_BasePCA, LinearModel): :math:`\mathbf{\tilde{C}}` .. math:: - \mathbf{\tilde{C}} = \alpha \mathbf{X}^T \mathbf{X} + (1 - \alpha) \left(\left(\mathbf{X}^T \mathbf{X}\right)^{-\frac{1}{2}} \mathbf{X}^T @@ -69,108 +65,85 @@ class PCovR(_BasePCA, LinearModel): Parameters ---------- mixing: float, default=0.5 - mixing parameter, as described in PCovR as :math:`{\alpha}`, here named - to avoid confusion with regularization parameter `alpha` - + mixing parameter, as described in PCovR as :math:`{\alpha}`, here named to avoid + confusion with regularization parameter `alpha` n_components : int, float or str, default=None Number of components to keep. if n_components is not set all components are kept:: n_components == min(n_samples, n_features) - svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto' If auto : The solver is selected by a default policy based on `X.shape` and - `n_components`: if the input data is larger than 500x500 and the - number of components to extract is lower than 80% of the smallest - dimension of the data, then the more efficient 'randomized' - method is enabled. Otherwise the exact full SVD is computed and - optionally truncated afterwards. + `n_components`: if the input data is larger than 500x500 and the number of + components to extract is lower than 80% of the smallest dimension of the + data, then the more efficient 'randomized' method is enabled. Otherwise the + exact full SVD is computed and optionally truncated afterwards. If full : - run exact full SVD calling the standard LAPACK solver via - `scipy.linalg.svd` and select the components by postprocessing + run exact full SVD calling the standard LAPACK solver via `scipy.linalg.svd` + and select the components by postprocessing If arpack : run SVD truncated to n_components calling ARPACK solver via - `scipy.sparse.linalg.svds`. It requires strictly - 0 < n_components < min(X.shape) + `scipy.sparse.linalg.svds`. It requires strictly 0 < n_components < + min(X.shape) If randomized : run randomized SVD by the method of Halko et al. 
- tol : float, default=1e-12 - Tolerance for singular values computed by svd_solver == 'arpack'. - Must be of range [0.0, infinity). - + Tolerance for singular values computed by svd_solver == 'arpack'. Must be of + range [0.0, infinity). space: {'feature', 'sample', 'auto'}, default='auto' - whether to compute the PCovR in `sample` or `feature` space - default=`sample` when :math:`{n_{samples} < n_{features}}` and - `feature` when :math:`{n_{features} < n_{samples}}` - + whether to compute the PCovR in `sample` or `feature` space default=`sample` + when :math:`{n_{samples} < n_{features}}` and `feature` when + :math:`{n_{features} < n_{samples}}` regressor: {`Ridge`, `RidgeCV`, `LinearRegression`, `precomputed`}, default=None - regressor for computing approximated :math:`{\mathbf{\hat{Y}}}`. - The regressor should be one `sklearn.linear_model.Ridge`, - `sklearn.linear_model.RidgeCV`, or `sklearn.linear_model.LinearRegression`. - If a pre-fitted regressor is provided, it is used to compute - :math:`{\mathbf{\hat{Y}}}`. - Note that any pre-fitting of the regressor will be lost if `PCovR` is - within a composite estimator that enforces cloning, e.g., - `sklearn.compose.TransformedTargetRegressor` or - `sklearn.pipeline.Pipeline` with model caching. - In such cases, the regressor will be re-fitted on the same - training data as the composite estimator. - If `precomputed`, we assume that the `y` passed to the `fit` function - is the regressed form of the targets :math:`{\mathbf{\hat{Y}}}`. - If None, ``sklearn.linear_model.Ridge('alpha':1e-6, 'fit_intercept':False, 'tol':1e-12)`` - is used as the regressor. - + regressor for computing approximated :math:`{\mathbf{\hat{Y}}}`. The regressor + should be one `sklearn.linear_model.Ridge`, `sklearn.linear_model.RidgeCV`, or + `sklearn.linear_model.LinearRegression`. If a pre-fitted regressor is provided, + it is used to compute :math:`{\mathbf{\hat{Y}}}`. Note that any pre-fitting of + the regressor will be lost if `PCovR` is within a composite estimator that + enforces cloning, e.g., `sklearn.compose.TransformedTargetRegressor` or + `sklearn.pipeline.Pipeline` with model caching. In such cases, the regressor + will be re-fitted on the same training data as the composite estimator. If + `precomputed`, we assume that the `y` passed to the `fit` function is the + regressed form of the targets :math:`{\mathbf{\hat{Y}}}`. If None, + ``sklearn.linear_model.Ridge('alpha':1e-6, 'fit_intercept':False, 'tol':1e-12)`` + is used as the regressor. iterated_power : int or 'auto', default='auto' - Number of iterations for the power method computed by - svd_solver == 'randomized'. - Must be of range [0, infinity). - - random_state : int, RandomState instance or None, default=None - Used when the 'arpack' or 'randomized' solvers are used. Pass an int - for reproducible results across multiple function calls. - - whiten : boolean, deprecated + Number of iterations for the power method computed by svd_solver == + 'randomized'. Must be of range [0, infinity). + random_state : int, :class:`numpy.random.RandomState` instance or None, default=None + Used when the 'arpack' or 'randomized' solvers are used. Pass an int for + reproducible results across multiple function calls. + whiten : bool, deprecated Attributes ---------- - mixing: float, default=0.5 mixing parameter, as described in PCovR as :math:`{\alpha}` - tol: float, default=1e-12 Tolerance for singular values computed by svd_solver == 'arpack'. Must be of range [0.0, infinity). 
- space: {'feature', 'sample', 'auto'}, default='auto' - whether to compute the PCovR in `sample` or `feature` space - default=`sample` when :math:`{n_{samples} < n_{features}}` and - `feature` when :math:`{n_{features} < n_{samples}}` - + whether to compute the PCovR in `sample` or `feature` space default=`sample` + when :math:`{n_{samples} < n_{features}}` and `feature` when + :math:`{n_{features} < n_{samples}}` n_components_ : int - The estimated number of components, which equals the parameter - n_components, or the lesser value of n_features and n_samples - if n_components is None. - - pxt_ : ndarray of size :math:`({n_{samples}, n_{components}})` - the projector, or weights, from the input space :math:`\mathbf{X}` - to the latent-space projection :math:`\mathbf{T}` - + The estimated number of components, which equals the parameter n_components, or + the lesser value of n_features and n_samples if n_components is None. + pxt_ : numpy.ndarray of size :math:`({n_{samples}, n_{components}})` + the projector, or weights, from the input space :math:`\mathbf{X}` to the + latent-space projection :math:`\mathbf{T}` pty_ : ndarray of size :math:`({n_{components}, n_{properties}})` - the projector, or weights, from the latent-space projection - :math:`\mathbf{T}` to the properties :math:`\mathbf{Y}` - + the projector, or weights, from the latent-space projection :math:`\mathbf{T}` + to the properties :math:`\mathbf{Y}` pxy_ : ndarray of size :math:`({n_{samples}, n_{properties}})` - the projector, or weights, from the input space :math:`\mathbf{X}` - to the properties :math:`\mathbf{Y}` - + the projector, or weights, from the input space :math:`\mathbf{X}` to the + properties :math:`\mathbf{Y}` explained_variance_ : ndarray of shape (n_components,) The amount of variance explained by each of the selected components. Equal to n_components largest eigenvalues of the PCovR-modified covariance matrix of :math:`\mathbf{X}`. - singular_values_ : ndarray of shape (n_components,) The singular values corresponding to each of the selected components. @@ -193,7 +166,7 @@ class PCovR(_BasePCA, LinearModel): [-1.02805338, 1.06736871], [ 0.98166504, -4.98307078], [-2.9963189 , 1.98238856]]) - """ # NoQa: E501 + """ def __init__( self, @@ -220,40 +193,34 @@ def __init__( self.regressor = regressor def fit(self, X, Y, W=None): - r""" - - Fit the model with X and Y. Depending on the dimensions of X, - calls either `_fit_feature_space` or `_fit_sample_space` + r"""Fit the model with X and Y. Depending on the dimensions of X, calls either + `_fit_feature_space` or `_fit_sample_space` Parameters ---------- - X : ndarray, shape (n_samples, n_features) - Training data, where n_samples is the number of samples and - n_features is the number of features. + X : numpy.ndarray, shape (n_samples, n_features) + Training data, where n_samples is the number of samples and n_features is + the number of features. It is suggested that :math:`\mathbf{X}` be centered by its column- means and scaled. If features are related, the matrix should be scaled to have unit variance, otherwise :math:`\mathbf{X}` should be scaled so that each feature has a variance of 1 / n_features. 
+ Y : numpy.ndarray, shape (n_samples, n_properties) + Training data, where n_samples is the number of samples and n_properties is + the number of properties - Y : ndarray, shape (n_samples, n_properties) - Training data, where n_samples is the number of samples and - n_properties is the number of properties - - It is suggested that :math:`\mathbf{X}` be centered by its column- - means and scaled. If features are related, the matrix should be scaled - to have unit variance, otherwise :math:`\mathbf{Y}` should be - scaled so that each feature has a variance of 1 / n_features. + It is suggested that :math:`\mathbf{X}` be centered by its column- means and + scaled. If features are related, the matrix should be scaled to have unit + variance, otherwise :math:`\mathbf{Y}` should be scaled so that each feature + has a variance of 1 / n_features. If the passed regressor = `precomputed`, it is assumed that Y is the regressed form of the properties, :math:`{\mathbf{\hat{Y}}}`. - - W : ndarray, shape (n_features, n_properties) + W : numpy.ndarray, shape (n_features, n_properties) Regression weights, optional when regressor=`precomputed`. If not passed, it is assumed that `W = np.linalg.lstsq(X, Y, self.tol)[0]` - """ - X, Y = check_X_y(X, Y, y_numeric=True, multi_output=True) # saved for inverse transformations from the latent space, @@ -355,11 +322,9 @@ def fit(self, X, Y, W=None): return self def _fit_feature_space(self, X, Y, Yhat): - r""" - In feature-space PCovR, the projectors are determined by: + r"""In feature-space PCovR, the projectors are determined by: .. math:: - \mathbf{\tilde{C}} = \alpha \mathbf{X}^T \mathbf{X} + (1 - \alpha) \left(\left(\mathbf{X}^T \mathbf{X}\right)^{-\frac{1}{2}} \mathbf{X}^T @@ -369,26 +334,21 @@ def _fit_feature_space(self, X, Y, Yhat): where .. math:: - \mathbf{P}_{XT} = (\mathbf{X}^T \mathbf{X})^{-\frac{1}{2}} \mathbf{U}_\mathbf{\tilde{C}}^T \mathbf{\Lambda}_\mathbf{\tilde{C}}^{\frac{1}{2}} .. math:: - \mathbf{P}_{TX} = \mathbf{\Lambda}_\mathbf{\tilde{C}}^{-\frac{1}{2}} \mathbf{U}_\mathbf{\tilde{C}}^T (\mathbf{X}^T \mathbf{X})^{\frac{1}{2}} .. math:: - \mathbf{P}_{TY} = \mathbf{\Lambda}_\mathbf{\tilde{C}}^{-\frac{1}{2}} \mathbf{U}_\mathbf{\tilde{C}}^T (\mathbf{X}^T \mathbf{X})^{-\frac{1}{2}} \mathbf{X}^T \mathbf{Y} - """ - Ct, iCsqrt = pcovr_covariance( mixing=self.mixing, X=X, @@ -426,35 +386,28 @@ def _fit_feature_space(self, X, Y, Yhat): self.pty_ = np.linalg.multi_dot([S_sqrt_inv, Vt, iCsqrt, X.T, Y]) def _fit_sample_space(self, X, Y, Yhat, W): - r""" - In sample-space PCovR, the projectors are determined by: + r"""In sample-space PCovR, the projectors are determined by: .. math:: - \mathbf{\tilde{K}} = \alpha \mathbf{X} \mathbf{X}^T + (1 - \alpha) \mathbf{\hat{Y}}\mathbf{\hat{Y}}^T where .. math:: - \mathbf{P}_{XT} = \left(\alpha \mathbf{X}^T + (1 - \alpha) \mathbf{W} \mathbf{\hat{Y}}^T\right) \mathbf{U}_\mathbf{\tilde{K}} \mathbf{\Lambda}_\mathbf{\tilde{K}}^{-\frac{1}{2}} .. math:: - \mathbf{P}_{TX} = \mathbf{\Lambda}_\mathbf{\tilde{K}}^{-\frac{1}{2}} \mathbf{U}_\mathbf{\tilde{K}}^T \mathbf{X} .. math:: - \mathbf{P}_{TY} = \mathbf{\Lambda}_\mathbf{\tilde{K}}^{-\frac{1}{2}} \mathbf{U}_\mathbf{\tilde{K}}^T \mathbf{Y} - """ - Kt = pcovr_kernel(mixing=self.mixing, X=X, Y=Yhat) if self.fit_svd_solver_ == "full": @@ -598,11 +551,9 @@ def inverse_transform(self, T): r"""Transform data back to its original space. .. 
math:: - \mathbf{\hat{X}} = \mathbf{T} \mathbf{P}_{TX} = \mathbf{X} \mathbf{P}_{XT} \mathbf{P}_{TX} - Parameters ---------- T : ndarray, shape (n_samples, n_components) @@ -613,7 +564,6 @@ def inverse_transform(self, T): ------- X_original ndarray, shape (n_samples, n_features) """ - if np.max(np.abs(self.mean_)) > self.tol: warnings.warn( "This class does not automatically un-center data, and your data mean " @@ -625,8 +575,7 @@ def inverse_transform(self, T): return T @ self.ptx_ def predict(self, X=None, T=None): - """Predicts the property values using regression on X or T""" - + """Predicts the property values using regression on X or T.""" check_is_fitted(self, ["pxy_", "pty_"]) if X is None and T is None: @@ -640,20 +589,17 @@ def predict(self, X=None, T=None): return T @ self.pty_ def transform(self, X=None): - """ - Apply dimensionality reduction to X. + """Apply dimensionality reduction to X. - X is projected on the first principal components as determined by the + ``X`` is projected on the first principal components as determined by the modified PCovR distances. Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : numpy.ndarray, shape (n_samples, n_features) New data, where n_samples is the number of samples and n_features is the number of features. - """ - check_is_fitted(self, ["pxt_", "mean_"]) return super().transform(X) @@ -663,14 +609,12 @@ def score(self, X, Y, T=None): defined as: .. math:: - \ell_{X} = \frac{\lVert \mathbf{X} - \mathbf{T}\mathbf{P}_{TX} \rVert ^ 2} {\lVert \mathbf{X}\rVert ^ 2} and .. math:: - \ell_{Y} = \frac{\lVert \mathbf{Y} - \mathbf{T}\mathbf{P}_{TY} \rVert ^ 2} {\lVert \mathbf{Y}\rVert ^ 2} @@ -678,23 +622,19 @@ def score(self, X, Y, T=None): use in sklearn pipelines, e.g., a grid search, where methods named 'score' are meant to be maximized. - Parameters ---------- - X : ndarray of shape (n_samples, n_features) + X : numpy.ndarray of shape (n_samples, n_features) The data. - - Y : ndarray of shape (n_samples, n_properties) + Y : numpy.ndarray of shape (n_samples, n_properties) The target. Returns ------- loss : float - Negative sum of the loss in reconstructing X from the latent-space - projection T and the loss in predicting Y from the latent-space - projection T + Negative sum of the loss in reconstructing X from the latent-space + projection T and the loss in predicting Y from the latent-space projection T """ - if T is None: T = self.transform(X) diff --git a/src/skmatter/feature_selection/_base.py b/src/skmatter/feature_selection/_base.py index 4971f853d2..0394faae7f 100644 --- a/src/skmatter/feature_selection/_base.py +++ b/src/skmatter/feature_selection/_base.py @@ -1,61 +1,47 @@ -""" -Sequential feature selection -""" +"""Sequential feature selection.""" from .._selection import _CUR, _FPS, _PCovCUR, _PCovFPS class FPS(_FPS): - """ - Transformer that performs Greedy Feature Selection using Farthest Point Sampling. + """Transformer performing Greedy Feature Selection using Farthest Point Sampling. Parameters ---------- - initialize: int, list of int, or 'random', default=0 - Index of the first selection(s). If 'random', picks a random - value when fit starts. Stored in :py:attr:`self.initialize`. - + Index of the first selection(s). If 'random', picks a random value when fit + starts. Stored in :py:attr:`self.initialize`. n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the features are - selected. If integer, the parameter is the absolute number of selections - to make. 
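Before the selector docstrings continue, a quick sketch of the ``score`` sign convention documented above (hypothetical data; both reconstruction losses are non-negative, so the returned value is at most zero and is suitable for maximization in a grid search):

>>> import numpy as np
>>> from skmatter.decomposition import PCovR
>>> rng = np.random.RandomState(0)
>>> X, Y = rng.rand(10, 4), rng.rand(10, 2)
>>> pcovr = PCovR(mixing=0.5, n_components=2).fit(X, Y)
>>> bool(pcovr.score(X, Y) <= 0.0)  # negative sum of X- and Y-reconstruction losses
True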
If float between 0 and 1, it is the fraction of the total dataset to - select. Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the features are selected. + If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None Threshold for the score. If `None` selection will continue until the n_to_select is chosen. Otherwise will stop when the score falls below the threshold. Stored in :py:attr:`self.score_threshold`. - score_threshold_type : str, default="absolute" - How to interpret the ``score_threshold``. When "absolute", the score used by - the selector is compared to the threshold directly. When "relative", at each + How to interpret the ``score_threshold``. When "absolute", the score used by the + selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the - score of the first selection, i.e. the selector quits when - ``current_score / first_score < threshold``. Stored in - :py:attr:`self.score_threshold_type`. - + score of the first selection, i.e. the selector quits when ``current_score / + first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. progress_bar: bool, default=False - option to use `tqdm `_ progress bar to monitor - selections. Stored in :py:attr:`self.report_progress`. - + option to use `tqdm `_ progress bar to monitor + selections. Stored in :py:attr:`self.report_progress`. full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining features. Stored in :py:attr:`self.full`. - - random_state: int or RandomState instance, default=0 + random_state: int or :class:`numpy.random.RandomState` instance, default=0 Attributes ---------- - n_selected_ : int - Counter tracking the number of selections that have been made - + Counter tracking the number of selections that have been made X_selected_ : ndarray, - Matrix containing the selected features, for use in fitting - + Matrix containing the selected features, for use in fitting selected_idx_ : ndarray - indices of selected samples + indices of selected features Examples -------- @@ -65,7 +51,7 @@ class FPS(_FPS): ... n_to_select=2, ... # int or 'random', default=0 ... # Index of the first selection. - ... # If ‘random’, picks a random value when fit starts. + ... # If "random", picks a random value when fit starts. ... initialize=0, ... ) >>> X = np.array( @@ -105,56 +91,44 @@ def __init__( class PCovFPS(_PCovFPS): - """Transformer that performs Greedy Feature Selection using PCovR-weighted + r"""Transformer that performs Greedy Feature Selection using PCovR-weighted Farthest Point Sampling. Parameters ---------- - mixing: float, default=0.5 - The PCovR mixing parameter, as described in PCovR as - :math:`{\\alpha}` - + The PCovR mixing parameter, as described in PCovR as :math:`{\alpha}` initialize: int or 'random', default=0 - Index of the first selection. If 'random', picks a random - value when fit starts. - + Index of the first selection. If 'random', picks a random value when fit starts. n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the features are - selected. If integer, the parameter is the absolute number of selections - to make.
If float between 0 and 1, it is the fraction of the total dataset to - select. Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the features are selected. + If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None - Threshold for the score. If `None` selection will continue until the - n_to_select is chosen. Otherwise will stop when the score falls below the - threshold. Stored in :py:attr:`self.score_threshold`. - + Threshold for the score. If `None` selection will continue until the n_to_select + is chosen. Otherwise will stop when the score falls below the threshold. Stored + in :py:attr:`self.score_threshold`. score_threshold_type : str, default="absolute" - How to interpret the ``score_threshold``. When "absolute", the score used by - the selector is compared to the threshold directly. When "relative", at each + How to interpret the ``score_threshold``. When "absolute", the score used by the + selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the - score of the first selection, i.e. the selector quits when - ``current_score / first_score < threshold``. Stored in - :py:attr:`self.score_threshold_type`. - + score of the first selection, i.e. the selector quits when ``current_score / + first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. progress_bar: bool, default=False - option to use `tqdm `_ progress bar to monitor - selections. Stored in :py:attr:`self.report_progress`. - + option to use `tqdm `_ progress bar to monitor + selections. Stored in :py:attr:`self.report_progress`. full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining features. Stored in :py:attr:`self.full`. - - random_state: int or RandomState instance, default=0 + random_state: int or :class:`numpy.random.RandomState` instance, default=0 Attributes ---------- - n_selected_ : int - Counter tracking the number of selections that have been made - X_selected_ : ndarray, - Matrix containing the selected features, for use in fitting + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray, + Matrix containing the selected features, for use in fitting Examples -------- @@ -214,26 +188,21 @@ class CUR(_CUR): Parameters ---------- recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - + number of steps after which to recompute the pi score + defaults to 1, if 0 no re-computation is done k : int - number of eigenvectors to compute the importance score with, defaults to 1 - + number of eigenvectors to compute the importance score with, defaults to ``1`` tolerance: float - threshold below which scores will be considered 0, defaults to 1E-12 - + threshold below which scores will be considered 0, defaults to ``1e-12`` n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the features are - selected. If integer, the parameter is the absolute number of selections - to make. If float between 0 and 1, it is the fraction of the total dataset to - select. Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the features are selected. 
+ If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None Threshold for the score. If `None` selection will continue until the n_to_select is chosen. Otherwise will stop when the score falls below the threshold. Stored in :py:attr:`self.score_threshold`. - score_threshold_type : str, default="absolute" How to interpret the ``score_threshold``. When "absolute", the score used by the selector is compared to the threshold directly. When "relative", at each @@ -241,35 +210,26 @@ class CUR(_CUR): score of the first selection, i.e. the selector quits when ``current_score / first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. - progress_bar: bool, default=False option to use `tqdm `_ progress bar to monitor selections. Stored in :py:attr:`self.report_progress`. - full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining features. Stored in :py:attr:`self.full`. - random_state: int or RandomState instance, default=0 - Attributes ---------- - - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - + X_current_ : numpy.ndarray (n_samples, n_features) + The original matrix orthogonalized by previous selections n_selected_ : int - Counter tracking the number of selections that have been made - - X_selected_ : ndarray, - Matrix containing the selected features, for use in fitting - - pi_ : ndarray (n_features), - the importance score see :func:`_compute_pi` - - selected_idx_ : ndarray - indices of selected features + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray + Matrix containing the selected features, for use in fitting + pi_ : numpy.ndarray (n_features), + the importance score see :func:`_compute_pi` + selected_idx_ : numpy.ndarray + indices of selected features Examples -------- @@ -321,71 +281,56 @@ def __init__( class PCovCUR(_PCovCUR): - """Transformer that performs Greedy Feature Selection by choosing features - which maximize the importance score :math:`\\pi`, which is the sum over + r"""Transformer that performs Greedy Feature Selection by choosing features + which maximize the importance score :math:`\pi`, which is the sum over the squares of the first :math:`k` components of the PCovR-modified right singular vectors. Parameters ---------- recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - + number of steps after which to recompute the pi score defaults to 1, if 0 no + re-computation is done k : int number of eigenvectors to compute the importance score with, defaults to 1 - tolerance: float - threshold below which scores will be considered 0, defaults to 1E-12 - + threshold below which scores will be considered 0, defaults to ``1e-12`` mixing: float, default=0.5 - The PCovR mixing parameter, as described in PCovR as - :math:`{\\alpha}`. Stored in :py:attr:`self.mixing`. - + The PCovR mixing parameter, as described in PCovR as + :math:`{\alpha}`. Stored in :py:attr:`self.mixing`. n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the features are - selected. If integer, the parameter is the absolute number of selections - to make. If float between 0 and 1, it is the fraction of the total dataset to - select. 
Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the features are selected. + If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None Threshold for the score. If `None` selection will continue until the n_to_select is chosen. Otherwise will stop when the score falls below the threshold. Stored in :py:attr:`self.score_threshold`. - score_threshold_type : str, default="absolute" How to interpret the ``score_threshold``. When "absolute", the score used by the selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the - score of the first selection, i.e. the selector quits when - ``current_score / first_score < threshold``. Stored in - :py:attr:`self.score_threshold_type`. - + score of the first selection, i.e. the selector quits when ``current_score / + first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. progress_bar: bool, default=False - option to use `tqdm `_ progress bar to monitor - selections. Stored in :py:attr:`self.report_progress`. - + option to use `tqdm `_ progress bar to monitor + selections. Stored in :py:attr:`self.report_progress`. full : bool, default=False - In the case that all non-redundant selections are exhausted, choose - randomly from the remaining features. Stored in :py:attr:`self.full`. - - random_state: int or RandomState instance, default=0 + In the case that all non-redundant selections are exhausted, choose randomly + from the remaining features. Stored in :py:attr:`self.full`. + random_state: int or :class:`numpy.random.RandomState` instance, default=0 Attributes ---------- - - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - - y_current_ : ndarray (n_samples, n_properties) - The targets orthogonalized by a regression on - the previous selections. - + X_current_ : numpy.ndarray (n_samples, n_features) + The original matrix orthogonalized by previous selections + y_current_ : numpy.ndarray (n_samples, n_properties) + The targets orthogonalized by a regression on the previous selections. n_selected_ : int - Counter tracking the number of selections that have been made - - X_selected_ : ndarray, - Matrix containing the selected features, for use in fitting + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray, + Matrix containing the selected features, for use in fitting Examples -------- diff --git a/src/skmatter/linear_model/__init__.py b/src/skmatter/linear_model/__init__.py index 9fb0613a1e..dc0117a260 100644 --- a/src/skmatter/linear_model/__init__.py +++ b/src/skmatter/linear_model/__init__.py @@ -1,3 +1,5 @@ +"""Classes for building linear models.""" + from ._base import OrthogonalRegression from ._ridge import Ridge2FoldCV diff --git a/src/skmatter/linear_model/_base.py b/src/skmatter/linear_model/_base.py index 800cf67f4a..6d57f795fd 100644 --- a/src/skmatter/linear_model/_base.py +++ b/src/skmatter/linear_model/_base.py @@ -54,15 +54,13 @@ def fit(self, X, y): """ Parameters ---------- - X : ndarray of shape (n_samples, n_features) - Training data, where n_samples is the number of samples - and n_features is the number of features. 
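As a usage sketch of the ``OrthogonalRegression.fit`` signature documented in this hunk (hypothetical data, not part of the patch; ``X`` and ``y`` are given matching dimensions so the orthogonal weight matrix is square):

>>> import numpy as np
>>> from skmatter.linear_model import OrthogonalRegression
>>> rng = np.random.RandomState(0)
>>> X = rng.rand(10, 3)
>>> y = rng.rand(10, 3)
>>> reg = OrthogonalRegression().fit(X, y)  # regression weights constrained to be orthogonal
>>> reg.predict(X).shape
(10, 3)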
- - y : ndarray of shape (n_samples, n_targets) - Training data, where n_samples is the number of samples - and n_targets is the number of target properties. + X : numpy.ndarray of shape (n_samples, n_features) + Training data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + y : numpy.ndarray of shape (n_samples, n_targets) + Training data, where ``n_samples`` is the number of samples and + ``n_targets`` is the number of target properties. """ - X, y = check_X_y( X, y, diff --git a/src/skmatter/linear_model/_ridge.py b/src/skmatter/linear_model/_ridge.py index b50356f734..eadc9a9145 100644 --- a/src/skmatter/linear_model/_ridge.py +++ b/src/skmatter/linear_model/_ridge.py @@ -192,11 +192,10 @@ def predict(self, X): """ Parameters ---------- - X : ndarray of shape (n_samples, n_features) + X : numpy.ndarray of shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. """ - X = check_array(X) check_is_fitted(self, ["coef_"]) diff --git a/src/skmatter/metrics/__init__.py b/src/skmatter/metrics/__init__.py index 70dff5c424..16cfe8f04c 100644 --- a/src/skmatter/metrics/__init__.py +++ b/src/skmatter/metrics/__init__.py @@ -1,6 +1,5 @@ -""" -This module contains a set of metrics that can be used for an enhanced -understanding of your machine learning model. +"""Set of metrics that can be used for an enhanced understanding of your machine +learning model. First are the easily-interpretable error measures of the relative information capacity of feature space `F` with respect to feature space `F'`. The methods diff --git a/src/skmatter/metrics/_prediction_rigidities.py b/src/skmatter/metrics/_prediction_rigidities.py index efed6e233f..19446ccc38 100644 --- a/src/skmatter/metrics/_prediction_rigidities.py +++ b/src/skmatter/metrics/_prediction_rigidities.py @@ -3,52 +3,47 @@ def local_prediction_rigidity(X_train, X_test, alpha): r"""Computes the local prediction rigidity (LPR) of a linear or kernel model - trained on a training dataset provided as input, on the local environments - in the test set provided as a separate input. LPR is defined as follows: + trained on a training dataset provided as input, on the local environments in the + test set provided as a separate input. LPR is defined as follows: .. math:: LPR_{i} = \frac{1}{X_i (X^{T} X + \lambda I)^{-1} X_i^{T}} - The function assumes that the model training is undertaken in a manner where - the global prediction targets are averaged over the number of atoms - appearing in each training structure, and the average feature vector of each - structure is hence used in the regression. This ensures that (1) - regularization strength across structures with different number of atoms is - kept constant per structure during model training, and (2) range of - resulting LPR values are loosely kept between 0 and 1 for the ease of - interpretation. This requires the user to provide the regularizer value that - results from such training procedure. To guarantee valid comparison in the - LPR across different models, feature vectors are scaled by a global factor - based on standard deviation across atomic envs. - - If the model is a kernel model, K_train and K_test can be provided in lieu - of X_train and X_test, alnog with the appropriate regularizer for the - trained model. 
+ The function assumes that the model training is undertaken in a manner where the + global prediction targets are averaged over the number of atoms appearing in each + training structure, and the average feature vector of each structure is hence used + in the regression. This ensures that (1) regularization strength across structures + with different numbers of atoms is kept constant per structure during model training, + and (2) the range of resulting LPR values is loosely kept between 0 and 1 for ease + of interpretation. This requires the user to provide the regularizer value that + results from such a training procedure. To guarantee valid comparison in the LPR + across different models, feature vectors are scaled by a global factor based on + standard deviation across atomic environments. + + If the model is a kernel model, K_train and K_test can be provided in lieu of + ``X_train`` and ``X_test``, along with the appropriate regularizer for the trained + model. Parameters ---------- - X_train : list of ndarray of shape (n_atoms, n_features) + X_train : list of numpy.ndarray of shape (n_atoms, n_features) Training dataset where each training set structure is stored as a separate ndarray. - - X_test : list of ndarray of shape (n_atoms, n_features) + X_test : list of numpy.ndarray of shape (n_atoms, n_features) Test dataset where each training set structure is stored as a separate ndarray. - alpha : float Regularizer value that the linear/kernel model has been optimized to. Returns ------- - LPR : list of array of shape (n_atoms) + LPR : list of numpy.ndarray of shape (n_atoms) Local prediction rigidity (LPR) of the test set structures. LPR is separately stored for each test structure, and hence list length = n_test_strucs. rank_diff : int integer value of the difference between cov matrix dimension and rank - """ - # initialize a StandardFlexibleScaler and fit to train set atom envs X_atom = np.vstack(X_train) sfactor = np.sqrt(np.mean(X_atom**2, axis=0).sum()) @@ -91,9 +86,9 @@ def local_prediction_rigidity(X_train, X_test, alpha): def componentwise_prediction_rigidity(X_train, X_test, alpha, comp_dims): r"""Computes the component-wise prediction rigidity (CPR) and the local CPR - (LCPR) of a linear or kernel model trained on a training dataset provided as - input, on the local environments in the test set provided as a separate - input. CPR and LCPR are defined as follows: + (LCPR) of a linear or kernel model trained on a training dataset provided as input, + on the local environments in the test set provided as a separate input. CPR and LCPR + are defined as follows: .. math:: CPR_{A,c} = \frac{1}{X_{A,c} (X^{T} X + \lambda I)^{-1} X_{A,c}^{T}} .. math:: LCPR_{i,c} = \frac{1}{X_{i,c} (X^{T} X + \lambda I)^{-1} X_{i,c}^{T}} The function assumes that the feature vectors for the local environments and - structures are built by concatenating the descriptors of different - prediction components together. It also assumes, like the case of LPR, that - model training is undertaken in a manner where the global prediction targets - are averaged over the number of atoms appearing in each training structure, - and the average feature vector of each structure is hence used in the - regression. Likewise, to guarantee valid comparison in the (L)CPR across - different models, feature vectors are scaled by a global factor based on - standard deviation across atomic envs.
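A usage sketch of ``local_prediction_rigidity`` as documented above (hypothetical structures, not part of the patch; list inputs carry one array per structure):

>>> import numpy as np
>>> from skmatter.metrics import local_prediction_rigidity
>>> rng = np.random.RandomState(0)
>>> X_train = [rng.rand(4, 3) for _ in range(6)]  # six structures of four atoms each
>>> X_test = [rng.rand(4, 3) for _ in range(2)]
>>> LPR, rank_diff = local_prediction_rigidity(X_train, X_test, alpha=1e-4)
>>> len(LPR), LPR[0].shape  # one LPR array per test structure, one value per atom
(2, (4,))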
- - If the model is a kernel model, K_train and K_test can be provided in lieu - of X_train and X_test, alnog with the appropriate regularizer for the - trained model. However, in computing the kernels, one must strictly keep the - different components separate, and compute separate kernel blocks for - different prediction components. + structures are built by concatenating the descriptors of different prediction + components together. It also assumes, like the case of LPR, that model training is + undertaken in a manner where the global prediction targets are averaged over the + number of atoms appearing in each training structure, and the average feature vector + of each structure is hence used in the regression. Likewise, to guarantee valid + comparison in the (L)CPR across different models, feature vectors are scaled by a + global factor based on standard deviation across atomic environments. + + If the model is a kernel model, K_train and K_test can be provided in lieu of + X_train and X_test, along with the appropriate regularizer for the trained model. + However, in computing the kernels, one must strictly keep the different components + separate, and compute separate kernel blocks for different prediction components. Parameters ---------- - X_train : list of ndarray of shape (n_atoms, n_features) + X_train : list of numpy.ndarray of shape (n_atoms, n_features) Training dataset where each training set structure is stored as a separate ndarray. - - X_test : list of ndarray of shape (n_atoms, n_features) + X_test : list of numpy.ndarray of shape (n_atoms, n_features) Test dataset where each training set structure is stored as a separate ndarray. - alpha : float Regularizer value that the linear/kernel model has been optimized to. - - comp_dims : array of int values + comp_dims : numpy.ndarray of int values Dimensions of the feature vectors pertaining to each prediction component. - Returns ------- - CPR : ndarray of shape (n_test_strucs, n_comps) - Component-wise prediction rigidity computed for each prediction - component, pertaining to the entire test structure. + CPR : numpy.ndarray of shape (n_test_strucs, n_comps) + Component-wise prediction rigidity computed for each prediction component, + pertaining to the entire test structure. LCPR : list of ndarrays of shape (n_atoms, n_comps) - Local component-wise prediction rigidity of the test set structures.
Values are + separately stored for each test structure, and hence list length = n_test_strucs rank_diff : int value of the difference between cov matrix dimension and rank - """ - # initialize a StandardFlexibleScaler and fit to train set atom envs X_atom = np.vstack(X_train) sfactor = np.sqrt(np.mean(X_atom**2, axis=0).sum()) diff --git a/src/skmatter/model_selection/__init__.py b/src/skmatter/model_selection/__init__.py index 1f152ba00a..242dd8151c 100644 --- a/src/skmatter/model_selection/__init__.py +++ b/src/skmatter/model_selection/__init__.py @@ -1,3 +1,5 @@ +"""Functions for model selection.""" + from ._split import train_test_split __all__ = ["train_test_split"] diff --git a/src/skmatter/model_selection/_split.py b/src/skmatter/model_selection/_split.py index 6e1f2cbcfb..36fabe7f30 100644 --- a/src/skmatter/model_selection/_split.py +++ b/src/skmatter/model_selection/_split.py @@ -4,41 +4,41 @@ def train_test_split(*arrays, **options): - """This is an extended version of the sklearn train test split supporting - overlapping train and test sets. + """Extended version of the sklearn train test split supporting overlapping train and + test sets. + See `sklearn.model_selection.train_test_split (external link) `_ . Parameters ---------- *arrays : sequence of indexables with same length / shape[0] - Allowed inputs are lists, numpy arrays, scipy-sparse - matrices or pandas dataframes. + Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas + dataframes. test_size : float or int, default=None - If float, should be between 0.0 and 1.0 and represent the proportion - of the dataset to include in the test split. If int, represents the - absolute number of test samples. If None, the value is set to the - complement of the train size. If ``train_size`` is also None, it will - be set to 0.25. + If float, should be between 0.0 and 1.0 and represent the proportion of the + dataset to include in the test split. If int, represents the absolute number of + test samples. If :obj:`None`, the value is set to the complement of the train + size. If ``train_size`` is also None, it will be set to 0.25. train_size : float or int, default=None - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - random_state : int or RandomState instance, default=None - Controls the shuffling applied to the data before applying the split. - Pass an int for reproducible output across multiple function calls. - See - `random state glossary from sklearn (external link) `_ + If float, should be between 0.0 and 1.0 and represent the proportion of the + dataset to include in the train split. If int, represents the absolute number of + train samples. If :obj:`None`, the value is automatically set to the complement + of the test size. + random_state : int or numpy.random.RandomState instance, default=None + Controls the shuffling applied to the data before applying the split. Pass an + int for reproducible output across multiple function calls. See `random state + glossary from sklearn (external link) + `_ shuffle : bool, default=True - Whether or not to shuffle the data before splitting. If shuffle=False - then stratify must be None. + Whether or not to shuffle the data before splitting. If shuffle=False then + stratify must be :obj:`None`. 
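A usage sketch of the overlapping-split behaviour (hypothetical data, not part of the patch; the ``train_test_overlap`` flag is documented just below):

>>> import numpy as np
>>> from skmatter.model_selection import train_test_split
>>> X = np.arange(10).reshape(5, 2)
>>> X_train, X_test = train_test_split(
...     X, train_size=0.8, test_size=0.4, train_test_overlap=True, random_state=0
... )  # 4 + 2 selections from only 5 samples, so the two sets may overlap
>>> X_train.shape, X_test.shape
((4, 2), (2, 2))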
stratify : array-like, default=None - If not None, data is split in a stratified fashion, using this as - the class labels. + If not :obj:`None`, data is split in a stratified fashion, using this as the + class labels. train_test_overlap : bool, default=False - If True, and train and test set are both not None, the train and test - set may overlap. + If :obj:`True`, and train and test set are both not :obj:`None`, the train and + test set may overlap. Returns ------- diff --git a/src/skmatter/preprocessing/__init__.py b/src/skmatter/preprocessing/__init__.py index b81735a391..46fda830ba 100644 --- a/src/skmatter/preprocessing/__init__.py +++ b/src/skmatter/preprocessing/__init__.py @@ -1,4 +1,4 @@ -"""This module includes scaling, centering and normalization methods.""" +"""Scaling, centering and normalization methods.""" from ._data import ( KernelNormalizer, diff --git a/src/skmatter/preprocessing/_data.py b/src/skmatter/preprocessing/_data.py index 07160dea46..3ff563face 100644 --- a/src/skmatter/preprocessing/_data.py +++ b/src/skmatter/preprocessing/_data.py @@ -117,13 +117,11 @@ def fit(self, X, y=None, sample_weight=None): Parameters ---------- - X : ndarray of shape (n_samples, n_features) + X : numpy.ndarray of shape (n_samples, n_features) The data used to compute the mean and standard deviation used for later scaling along the features axis. - y: None Ignored. - sample_weight: ndarray of shape (n_samples,) Weights for each sample. Sample weighting can be used to center (and scale) data using a weighted mean. Weights are internally @@ -134,7 +132,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted scaler. """ - X = self._validate_data( X, copy=self.copy, @@ -177,10 +174,8 @@ def transform(self, X, y=None, copy=None): ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The data used to scale along the features axis. - y: None Ignored. - copy : bool, default=None Copy the input X or not. @@ -189,7 +184,6 @@ def transform(self, X, y=None, copy=None): X : {array-like, sparse matrix} of shape (n_samples, n_features) Transformed array. """ - copy = copy if copy is not None else self.copy X = self._validate_data( X, @@ -207,18 +201,17 @@ def transform(self, X, y=None, copy=None): return (X - self.mean_) / self.scale_ def inverse_transform(self, X_tr): - """Scale back the data to the original representation + """Scale back the data to the original representation. Parameters ---------- - X_tr : ndarray of shape (n_samples, n_features) + X_tr : numpy.ndarray of shape (n_samples, n_features) Transformed matrix Returns ------- X : original matrix """ - check_is_fitted( self, attributes=["n_samples_in_", "n_features_in_", "scale_", "mean_"] ) @@ -229,37 +222,33 @@ def inverse_transform(self, X_tr): class KernelNormalizer(KernelCenterer): - """Kernel centering method, similar to KernelCenterer, + r"""Kernel centering method, similar to KernelCenterer, but with additional scaling and ability to pass a set of sample weights. - Let :math:`K(x, z)` be a kernel defined by :math:`\\phi(x)^T \\phi(z)`, - where :math:`\\phi` is a function mapping x to a Hilbert space. + Let :math:`K(x, z)` be a kernel defined by :math:`\phi(x)^T \phi(z)`, + where :math:`\phi` is a function mapping x to a Hilbert space. KernelNormalizer centers (i.e., normalize to have zero mean) the data without - explicitly computing :math:`\\phi(x)`. + explicitly computing :math:`\phi(x)`. 
It is equivalent to centering and scaling :math:`\\phi(x)` with sklearn.preprocessing.StandardScaler(with_std=False). Parameters - --------- + ---------- with_center: bool, default=True If True, center the kernel matrix before scaling. If False, do not center the kernel - with_trace: bool, default=True If True, scale the kernel so that the trace is equal to the number of samples. If False, do not scale the kernel Attributes ---------- - K_fit_rows_ : ndarray of shape (n_samples,) + K_fit_rows_ : numpy.ndarray of shape (n_samples,) Average of each column of kernel matrix. - K_fit_all_ : float Average of kernel matrix. - sample_weight_ : float Sample weights (if provided during the fit) - scale_ : float Scaling parameter used when 'with_trace'=True Calculated as np.trace(K) / K.shape[0] @@ -299,23 +288,20 @@ def fit(self, K, y=None, sample_weight=None): Parameters ---------- - K : ndarray of shape (n_samples, n_samples) + K : numpy.ndarray of shape (n_samples, n_samples) Kernel matrix. - y : None Ignored. - - sample_weight: ndarray of shape (n_samples,), default=None - Weights for each sample. Sample weighting can be used to center (and - scale) data using a weighted mean. Weights are internally normalized - before preprocessing. + sample_weight: numpy.ndarray of shape (n_samples,), default=None + Weights for each sample. Sample weighting can be used to center (and scale) + data using a weighted mean. Weights are internally normalized before + preprocessing. Returns ------- self : object Fitted transformer. """ - K = self._validate_data(K, copy=True, dtype=FLOAT_DTYPES, reset=False) if sample_weight is not None: @@ -357,18 +343,16 @@ def transform(self, K, copy=True): Parameters ---------- - K : ndarray of shape (n_samples1, n_samples2) + K : numpy.ndarray of shape (n_samples1, n_samples2) Kernel matrix. - copy : bool, default=True Set to False to perform inplace computation. Returns ------- - K_new : ndarray of shape (n_samples1, n_samples2) + K_new : numpy.ndarray of shape (n_samples1, n_samples2) Transformed array """ - check_is_fitted(self) K = self._validate_data(K, copy=copy, dtype=FLOAT_DTYPES, reset=False) @@ -416,7 +400,7 @@ def fit_transform(self, K, y=None, sample_weight=None, copy=True, **fit_params): class SparseKernelCenterer(TransformerMixin): r"""Kernel centering method for sparse kernels, similar to - KernelFlexibleCenterer. + :class:`KernelFlexibleCenterer`. The main disadvantage of kernel methods, which is widely used in machine learning it is that they quickly grow in time and space complexity with the @@ -437,16 +421,14 @@ class SparseKernelCenterer(TransformerMixin): is possible to get a $N/M$ times improvement in the asymptotic by memory. Parameters - --------- + ---------- with_center: bool, default=True If True, center the kernel matrix before scaling. If False, do not center the kernel - - with_trace: bool, default=True + with_trace : bool, default=True If True, scale the kernel so that the trace is equal to the number of samples. If False, do not scale the kernel - - rcond: float, default 1E-12 + rcond : float, default 1E-12 conditioning parameter to use when computing the Nystrom-approximated kernel for scaling @@ -454,43 +436,34 @@ class SparseKernelCenterer(TransformerMixin): ---------- K_fit_rows_ : ndarray of shape (n_samples,) Average of each column of kernel matrix. - K_fit_all_ : float Average of kernel matrix. 
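Stepping back to ``KernelNormalizer`` above, a minimal sketch (hypothetical data; with the defaults, the centered kernel is rescaled so that its trace equals the number of samples):

>>> import numpy as np
>>> from skmatter.preprocessing import KernelNormalizer
>>> X = np.random.RandomState(0).rand(6, 3)
>>> K = X @ X.T  # a linear kernel
>>> Kc = KernelNormalizer().fit_transform(K)
>>> bool(np.isclose(np.trace(Kc), 6.0))  # trace scaled to n_samples
True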
- sample_weight_ : float Sample weights (if provided during the fit) - scale_ : float Scaling parameter used when 'with_trace'=True Calculated as np.trace(K) / K.shape[0] - n_active_: int size of active set """ def __init__(self, with_center=True, with_trace=True, rcond=1e-12): - """Initialize SparseKernelCenterer.""" - self.with_center = with_center self.with_trace = with_trace self.rcond = rcond def fit(self, Knm, Kmm, y=None, sample_weight=None): - """Fit KernelFlexibleCenterer + """Fit ``SparseKernelCenterer`` Parameters - --------- - Knm: ndarray of shape (n_samples, n_active) + ---------- + Knm : numpy.ndarray of shape (n_samples, n_active) Kernel matrix between the reference data set and the active set - - Kmm: ndarray of shape (n_active, n_active) + Kmm : numpy.ndarray of shape (n_active, n_active) Kernel matrix between the active set and itself - y : None Ignored. - - sample_weight: ndarray of shape (n_samples,), default=None + sample_weight: numpy.ndarray of shape (n_samples,), default=None Weights for each sample. Sample weighting can be used to center (and scale) data using a weighted mean. Weights are internally normalized before preprocessing. @@ -500,7 +473,6 @@ def fit(self, Knm, Kmm, y=None, sample_weight=None): self : object Fitted transformer. """ - if Knm.shape[1] != Kmm.shape[0]: raise ValueError( "The reference kernel is not commensurate shape with the " @@ -536,16 +508,15 @@ def transform(self, Knm, y=None): """Centering our Kernel. Previously you should fit data. Parameters - --------- - Knm: ndarray of shape (n_samples, n_active) + ---------- + Knm: numpy.ndarray of shape (n_samples, n_active) Kernel matrix between the reference data set and the active set - y : None Ignored. Returns ------- - K_new : ndarray of shape (n_samples, n_active) + K_new : numpy.ndarray of shape (n_samples, n_active) Transformed array """ check_is_fitted(self, attributes=["scale_", "K_fit_rows_", "n_active_"]) @@ -563,7 +534,7 @@ def fit_transform(self, Knm, Kmm, y=None, sample_weight=None, **fit_params): r"""Fit to data, then transform it. Parameters - --------- + ---------- Knm: ndarray of shape (n_samples, n_active) Kernel matrix between the reference data set and the active set diff --git a/src/skmatter/sample_selection/_base.py b/src/skmatter/sample_selection/_base.py index 6026bce7b1..f93dcfad0b 100644 --- a/src/skmatter/sample_selection/_base.py +++ b/src/skmatter/sample_selection/_base.py @@ -1,6 +1,4 @@ -""" -Sequential sample selection -""" +"""Sequential sample selection.""" import warnings @@ -24,13 +22,11 @@ def _linear_interpolator(points, values): values : ndarray of float or complex, shape (n,) Data values. - Reference: --------- The code is an adapted excerpt from https://github.com/scipy/scipy/blob/dde50595862a4f9cede24b5d1c86935c30f1f88a/scipy/interpolate/_ndgriddata.py#L119-L273 """ # NoQa: E501 - points = _ndim_coords_from_arrays(points) if points.ndim < 2: @@ -52,60 +48,47 @@ class FPS(_FPS): - """ - Transformer that performs Greedy Sample Selection using Farthest Point Sampling. + """Transformer performing Greedy Sample Selection using Farthest Point Sampling. Parameters ---------- - initialize: int, list of int, or 'random', default=0 - Index of the first selection(s). If 'random', picks a random - value when fit starts. Stored in :py:attr:`self.initialize`. + Index of the first selection(s). If 'random', picks a random value when fit + starts. Stored in :py:attr:`self.initialize`.
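Before continuing with the sample selectors, a usage sketch of the ``SparseKernelCenterer`` API above (hypothetical data; the active set is taken as the first rows of ``X``):

>>> import numpy as np
>>> from skmatter.preprocessing import SparseKernelCenterer
>>> rng = np.random.RandomState(0)
>>> X = rng.rand(8, 3)
>>> X_active = X[:4]  # hypothetical active set
>>> Knm = X @ X_active.T  # kernel between reference data and active set
>>> Kmm = X_active @ X_active.T  # kernel of the active set with itself
>>> SparseKernelCenterer().fit_transform(Knm, Kmm).shape
(8, 4)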
n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the samples are - selected. If integer, the parameter is the absolute number of selections - to make. If float between 0 and 1, it is the fraction of the total dataset to - select. Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the samples are selected. + If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None - Threshold for the score. If `None` selection will continue until the - n_to_select is chosen. Otherwise will stop when the score falls below the - threshold. Stored in :py:attr:`self.score_threshold`. - + Threshold for the score. If `None` selection will continue until the n_to_select + is chosen. Otherwise will stop when the score falls below the threshold. Stored + in :py:attr:`self.score_threshold`. score_threshold_type : str, default="absolute" How to interpret the ``score_threshold``. When "absolute", the score used by the selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the - score of the first selection, i.e. the selector quits when - ``current_score / first_score < threshold``. Stored in - :py:attr:`self.score_threshold_type`. - + score of the first selection, i.e. the selector quits when ``current_score / + first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. progress_bar: bool, default=False - option to use `tqdm `_ progress bar to monitor - selections. Stored in :py:attr:`self.report_progress`. - + option to use `tqdm `_ progress bar to monitor + selections. Stored in :py:attr:`self.report_progress`. full : bool, default=False - In the case that all non-redundant selections are exhausted, choose - randomly from the remaining samples. Stored in :py:attr:`self.full`. - - random_state: int or RandomState instance, default=0 + In the case that all non-redundant selections are exhausted, choose randomly + from the remaining samples. Stored in :py:attr:`self.full`. + random_state: int or numpy.random.RandomState instance, default=0 Attributes ---------- - n_selected_ : int - Counter tracking the number of selections that have been made - - X_selected_ : ndarray, - Matrix containing the selected samples, for use in fitting - - y_selected_ : ndarray, - In sample selection, the matrix containing the selected targets, for - use in fitting - - selected_idx_ : ndarray - indices of selected samples + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray, + Matrix containing the selected samples, for use in fitting + y_selected_ : numpy.ndarray, + In sample selection, the matrix containing the selected targets, for + use in fitting. + selected_idx_ : numpy.ndarray + indices of selected samples Examples -------- @@ -115,7 +98,7 @@ class FPS(_FPS): ... n_to_select=2, ... # int or 'random', default=0 ... # Index of the first selection. - ... # If ‘random’, picks a random value when fit starts. + ... # If "random", picks a random value when fit starts. ... initialize=0, ... ) >>> X = np.array( @@ -154,64 +137,51 @@ def __init__( class PCovFPS(_PCovFPS): - """Transformer that performs Greedy Sample Selection using PCovR-weighted - Farthest Point Sampling.
+ r"""Transformer performing Greedy Sample Selection using PCovR-weighted Farthest + Point Sampling. Parameters ---------- - mixing: float, default=0.5 - The PCovR mixing parameter, as described in PCovR as - :math:`{\\alpha}` - + The PCovR mixing parameter, as described in PCovR as + :math:`{\\alpha}` initialize: int or 'random', default=0 - Index of the first selection. If 'random', picks a random - value when fit starts. - + Index of the first selection. If 'random', picks a random value when fit starts. n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the samples are - selected. If integer, the parameter is the absolute number of selections - to make. If float between 0 and 1, it is the fraction of the total dataset to - select. Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the samples are selected. + If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None Threshold for the score. If `None` selection will continue until the n_to_select is chosen. Otherwise will stop when the score falls below the threshold. Stored in :py:attr:`self.score_threshold`. - score_threshold_type : str, default="absolute" How to interpret the ``score_threshold``. When "absolute", the score used by the selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the - score of the first selection, i.e. the selector quits when - ``current_score / first_score < threshold``. Stored in - :py:attr:`self.score_threshold_type`. - + score of the first selection, i.e. the selector quits when ``current_score / + first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. progress_bar: bool, default=False - option to use `tqdm `_ progress bar to monitor - selections. Stored in :py:attr:`self.report_progress`. - + option to use `tqdm `_ progress bar to monitor + selections. Stored in :py:attr:`self.report_progress`. full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining samples. Stored in :py:attr:`self.full`. - - random_state: int or RandomState instance, default=0 + random_state: int or numpy.random.RandomState instance, default=0 Attributes ---------- - n_selected_ : int - Counter tracking the number of selections that have been made - - X_selected_ : ndarray, - Matrix containing the selected samples, for use in fitting + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray, + Matrix containing the selected samples, for use in fitting - y_selected_ : ndarray, - In sample selection, the matrix containing the selected targets, for - use in fitting - - selected_idx_ : ndarray - indices of selected samples + y_selected_ : numpy.ndarray, + In sample selection, the matrix containing the selected targets, for use in + fitting + selected_idx_ : numpy.ndarray + indices of selected samples Examples -------- @@ -221,7 +191,7 @@ class PCovFPS(_PCovFPS): ... n_to_select=2, ... # int or 'random', default=0 ... # Index of the first selection. - ... # If ‘random’, picks a random value when fit starts. + ... # If "random", picks a random value when fit starts. ... initialize=0, ... 
) >>> X = np.array( @@ -264,71 +234,56 @@ def __init__( class CUR(_CUR): """Transformer that performs Greedy Sample Selection by choosing samples - which maximize the magnitude of the left singular vectors, consistent with - classic CUR matrix decomposition. + which maximize the magnitude of the left singular vectors, consistent with classic + CUR matrix decomposition. Parameters ---------- recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - + number of steps after which to recompute the pi score defaults to 1, if 0 no + re-computation is done k : int number of eigenvectors to compute the importance score with, defaults to 1 - tolerance: float threshold below which scores will be considered 0, defaults to 1E-12 - n_to_select : int or float, default=None - The number of selections to make. If `None`, half of the samples are - selected. If integer, the parameter is the absolute number of selections - to make. If float between 0 and 1, it is the fraction of the total dataset to - select. Stored in :py:attr:`self.n_to_select`. - + The number of selections to make. If `None`, half of the samples are selected. + If integer, the parameter is the absolute number of selections to make. If float + between 0 and 1, it is the fraction of the total dataset to select. Stored in + :py:attr:`self.n_to_select`. score_threshold : float, default=None - Threshold for the score. If `None` selection will continue until the - n_to_select is chosen. Otherwise will stop when the score falls below the - threshold. Stored in :py:attr:`self.score_threshold`. - + Threshold for the score. If `None` selection will continue until the n_to_select + is chosen. Otherwise will stop when the score falls below the threshold. Stored + in :py:attr:`self.score_threshold`. score_threshold_type : str, default="absolute" - How to interpret the ``score_threshold``. When "absolute", the score used by - the selector is compared to the threshold directly. When "relative", at each + How to interpret the ``score_threshold``. When "absolute", the score used by the + selector is compared to the threshold directly. When "relative", at each iteration, the score used by the selector is compared proportionally to the - score of the first selection, i.e. the selector quits when - ``current_score / first_score < threshold``. Stored in - :py:attr:`self.score_threshold_type`. - + score of the first selection, i.e. the selector quits when ``current_score / + first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. progress_bar: bool, default=False option to use `tqdm `_ progress bar to monitor selections. Stored in :py:attr:`self.report_progress`. - full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining samples. Stored in :py:attr:`self.full`. 
- - random_state: int or RandomState instance, default=0 + random_state: int or numpy.random.RandomState instance, default=0 Attributes ---------- - - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - + X_current_ : numpy.ndarray (n_samples, n_features) + The original matrix orthogonalized by previous selections n_selected_ : int - Counter tracking the number of selections that have been made - + Counter tracking the number of selections that have been made X_selected_ : ndarray, - Matrix containing the selected samples, for use in fitting - + Matrix containing the selected samples, for use in fitting y_selected_ : ndarray, - In sample selection, the matrix containing the selected targets, for - use in fitting - + In sample selection, the matrix containing the selected targets, for use in + fitting pi_ : ndarray (n_features), - the importance score see :func:`_compute_pi` - + the importance score see :func:`_compute_pi` selected_idx_ : ndarray - indices of selected features + indices of selected features Examples -------- @@ -391,33 +346,25 @@ class PCovCUR(_PCovCUR): Parameters ---------- - mixing: float, default=0.5 - The PCovR mixing parameter, as described in PCovR as - :math:`{\\alpha}`. Stored in :py:attr:`self.mixing`. - + The PCovR mixing parameter, as described in PCovR as :math:`{\\alpha}`. Stored + in :py:attr:`self.mixing`. recompute_every : int - number of steps after which to recompute the pi score - defaults to 1, if 0 no re-computation is done - - + number of steps after which to recompute the pi score defaults to 1, if 0 no + re-computation is done k : int number of eigenvectors to compute the importance score with, defaults to 1 - tolerance: float - threshold below which scores will be considered 0, defaults to 1E-12 - + threshold below which scores will be considered 0, defaults to 1E-12 n_to_select : int or float, default=None The number of selections to make. If `None`, half of the samples are selected. If integer, the parameter is the absolute number of selections to make. If float between 0 and 1, it is the fraction of the total dataset to select. Stored in :py:attr:`self.n_to_select`. - score_threshold : float, default=None Threshold for the score. If `None` selection will continue until the n_to_select is chosen. Otherwise will stop when the score falls below the threshold. Stored in :py:attr:`self.score_threshold`. - score_threshold_type : str, default="absolute" How to interpret the ``score_threshold``. When "absolute", the score used by the selector is compared to the threshold directly. When "relative", at each @@ -425,43 +372,32 @@ class PCovCUR(_PCovCUR): score of the first selection, i.e. the selector quits when ``current_score / first_score < threshold``. Stored in :py:attr:`self.score_threshold_type`. - progress_bar: bool, default=False option to use `tqdm `_ progress bar to monitor selections. Stored in :py:attr:`self.report_progress`. - full : bool, default=False In the case that all non-redundant selections are exhausted, choose randomly from the remaining samples. Stored in :py:attr:`self.full`. - - random_state: int or RandomState instance, default=0 + random_state: int or numpy.random.RandomState instance, default=0 Attributes ---------- - - X_current_ : ndarray (n_samples, n_features) - The original matrix orthogonalized by previous selections - - y_current_ : ndarray (n_samples, n_properties) - The targets orthogonalized by a regression on - the previous selections. 
- + X_current_ : numpy.ndarray (n_samples, n_features) + The original matrix orthogonalized by previous selections + y_current_ : numpy.ndarray (n_samples, n_properties) + The targets orthogonalized by a regression on the previous selections. n_selected_ : int - Counter tracking the number of selections that have been made - - X_selected_ : ndarray, - Matrix containing the selected samples, for use in fitting - - y_selected_ : ndarray, - In sample selection, the matrix containing the selected targets, for - use in fitting - - pi_ : ndarray (n_features), - the importance score see :func:`_compute_pi` - - selected_idx_ : ndarray - indices of selected features + Counter tracking the number of selections that have been made + X_selected_ : numpy.ndarray + Matrix containing the selected samples, for use in fitting + y_selected_ : numpy.ndarray, + In sample selection, the matrix containing the selected targets, for use in + fitting + pi_ : numpy.ndarray (n_features), + the importance score see :func:`_compute_pi` + selected_idx_ : numpy.ndarray + indices of selected features Examples -------- @@ -619,24 +555,22 @@ def __init__(self, low_dim_idx=None, tolerance=1e-12): self.tolerance = tolerance def fit(self, X, y): - """ - Learn the samples that form the convex hull. + """Learn the samples that form the convex hull. Parameters ---------- - X : ndarray of shape (n_samples, n_features) - Feature matrix of samples to use for constructing the convex - hull. - y : ndarray of shape (n_samples,) - Target values (property on which the convex hull should be - constructed, e.g. Gibbs free energy) + X : numpy.ndarray of shape (n_samples, n_features) + Feature matrix of samples to use for constructing the convex + hull. + y : numpy.ndarray of shape (n_samples,) + Target values (property on which the convex hull should be + constructed, e.g. Gibbs free energy) Returns ------- self : object Fitted scorer. """ - X, y = self._check_X_y(X, y) self.n_features_in_ = X.shape[1] diff --git a/src/skmatter/sample_selection/_voronoi_fps.py b/src/skmatter/sample_selection/_voronoi_fps.py index 6490bda19a..ae7d27f7ee 100644 --- a/src/skmatter/sample_selection/_voronoi_fps.py +++ b/src/skmatter/sample_selection/_voronoi_fps.py @@ -40,11 +40,9 @@ class VoronoiFPS(GreedySelector): Parameters ---------- - - n_trial_calculation: integer, default=4 + n_trial_calculation: int, default=4 Number of calculations used for the switching point between Voronoi FPS and traditional FPS (for detail look at full_fraction). - full_fraction: float, default=None Proportion of calculated distances from the total number of features at which the switch from Voronoi FPS to FPS occurs. @@ -69,7 +67,7 @@ class VoronoiFPS(GreedySelector): ... full_fraction=0.45, ... # int or 'random', default=0 ... # Index of the first selection. - ... # If ‘random’, picks a random value when fit starts. + ... # If 'random', picks a random value when fit starts. ... initialize=0, ... ) >>> X = np.array( @@ -111,43 +109,37 @@ def score(self, X=None, y=None): return self.hausdorff_ def get_distance(self): - """ - + r""" Traditional FPS employs a column-wise Euclidean distance for feature selection, which can be expressed using the covariance matrix :math:`\\mathbf{C} = \\mathbf{X} ^ T \\mathbf{X}` .. math:: - \\operatorname{d}_c(i, j) = C_{ii} - 2 C_{ij} + C_{jj}. + \operatorname{d}_c(i, j) = C_{ii} - 2 C_{ij} + C_{jj}. 
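Returning to ``DirectionalConvexHull.fit`` documented above, a minimal sketch (hypothetical random data; in practice the low-dimensional features and the target would be physically meaningful, e.g. a structural descriptor and an energy):

>>> import numpy as np
>>> from skmatter.sample_selection import DirectionalConvexHull
>>> rng = np.random.RandomState(0)
>>> X = rng.rand(20, 2)
>>> y = rng.rand(20)
>>> dch = DirectionalConvexHull(low_dim_idx=[0]).fit(X, y)  # hull over (X[:, 0], y)
>>> dch.n_features_in_
2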
 
         For sample selection, this is a row-wise Euclidean distance, which can
         be expressed in terms of the Gram matrix
         :math:`\\mathbf{K} = \\mathbf{X} \\mathbf{X} ^ T`
 
         .. math::
-            \\operatorname{d}_r(i, j) = K_{ii} - 2 K_{ij} + K_{jj}.
+            \operatorname{d}_r(i, j) = K_{ii} - 2 K_{ij} + K_{jj}.
 
         Returns
         -------
-
         hausdorff : ndarray of shape (`n_to_select_from_`)
             the minimum distance from each point to the set of selected
             points. once a point is selected, the distance is not updated;
             the final list will reflect the distances when selected.
-
         """
         return self.hausdorff_
 
     def get_select_distance(self):
         """
-
         Returns
         -------
-
-        hausdorff_at_select : ndarray of shape (`n_to_select`)
+        hausdorff_at_select : numpy.ndarray of shape (`n_to_select`)
             at the time of selection, the minimum distance from each selected
             point to the set of previously selected points.
-
         """
         mask = self.get_support(indices=True, ordered=True)
         return self.hausdorff_at_select_[mask]
@@ -163,7 +155,6 @@ def _init_greedy_search(self, X, y, n_to_select):
         large number of distances, it is more advantageous to run simple
         calculation along the whole matrix.
         """
-
         n_to_select_from = X.shape[0]
 
         self.vlocation_of_idx = np.full(n_to_select_from, 1)
         # index of the voronoi cell associated with each of the columns of X
@@ -239,8 +230,8 @@ def _init_greedy_search(self, X, y, n_to_select):
 
     def _continue_greedy_search(self, X, y, n_to_select):
         """Continues the search. Prepares an array to store the selected
-        features."""
-
+        features.
+        """
         super()._continue_greedy_search(X, y, n_to_select)
 
         n_pad = n_to_select - self.n_selected_
@@ -271,7 +262,6 @@ def _get_active(self, X, last_selected):
         |d(X,S) - d(S,L)|>= d(X,S) to know that we don't need to check X.
         but |d(X,S) - d(S,L)|^2>= d(X,S)^2 if and only if d(S,L)/2 > d(S,X)
         """
-
         if not hasattr(self, "n_selected_") or self.n_selected_ == 0:
             return np.arange(X.shape[0], dtype=int)
 
@@ -290,19 +280,17 @@ def _get_active(self, X, last_selected):
         return active_points
 
     def _update_post_selection(self, X, y, last_selected):
-        """
-        Saves the most recently selected feature, increments the feature counter
+        """Saves the most recently selected feature, increments the feature counter
         and update the hausdorff distances
+
         Let:
-        L is the last point selected;
-        S are the selected points from before this iteration;
-        X is the one active point;
-        This function calculates d(L, X) and checks the condition
-        d(L, X)< min d(X, S_i). If so, we move X to a new polyhedron.
-        If the number of active points is too high, it is faster to calculate
-        the distances between L and all the points in the dataset.
-        """
+        L is the last point selected; S are the selected points from before this
+        iteration; X is the one active point. This function calculates d(L, X) and
+        checks the condition d(L, X) < min d(X, S_i). If so, we move X to a new
+        polyhedron. If the number of active points is too high, it is faster to
+        calculate the distances between L and all the points in the dataset.
+        """
         self.hausdorff_at_select_[last_selected] = self.hausdorff_[last_selected]
         active_points = self._get_active(X, last_selected)
diff --git a/src/skmatter/utils/_orthogonalizers.py b/src/skmatter/utils/_orthogonalizers.py
index 14dbf0a2c8..023747ed78 100644
--- a/src/skmatter/utils/_orthogonalizers.py
+++ b/src/skmatter/utils/_orthogonalizers.py
@@ -1,12 +1,8 @@
 # -*- coding: utf-8 -*-
-"""
-
-This module contains the necessary orthogonalizers for the CUR decomposition
-subselection method.
+"""Necessary orthogonalizers for the CUR decomposition subselection method. Authors: Rose K. Cersonsky Michele Ceriotti - """ import warnings @@ -15,25 +11,22 @@ def X_orthogonalizer(x1, c=None, x2=None, tol=1e-12, copy=False): - """ - Orthogonalizes a feature matrix by the given columns. Can be used to - orthogonalize by samples by calling `X = X_orthogonalizer(X.T, row_index).T`. - After orthogonalization, each column of X will contain only what is + """Orthogonalizes a feature matrix by the given columns. + + Can be used to orthogonalize by samples by calling `X = X_orthogonalizer(X.T, + row_index).T`. After orthogonalization, each column of X will contain only what is orthogonal to X[:, c] or x2. Parameters ---------- - x1: matrix of shape (n x m) + x1: numpy.ndarray of shape (n x m) feature matrix to orthogonalize - c: int, less than m, default=None index of the column to orthogonalize by - - x2: matrix of shape (n x a), default=x1[:, c] + x2: numpy.ndarray of shape (n x a), default=x1[:, c] a separate set of columns to orthogonalize with respect to Note: the orthogonalizer will work column-by-column in column-index order """ - if x2 is None and c is not None: cols = x1[:, [c]] elif x2.shape[0] == x1.shape[0]: @@ -64,8 +57,8 @@ def X_orthogonalizer(x1, c=None, x2=None, tol=1e-12, copy=False): def Y_feature_orthogonalizer(y, X, tol=1e-12, copy=True): - r""" - Orthogonalizes a property matrix given the selected features in :math:`\mathbf{X}` + r"""Orthogonalizes a property matrix given the selected features in + :math:`\mathbf{X}`. .. math:: \mathbf{Y} \leftarrow \mathbf{Y} - @@ -73,20 +66,15 @@ def Y_feature_orthogonalizer(y, X, tol=1e-12, copy=True): Parameters ---------- - - y: ndarray of shape (n_samples x n_properties) + y : numpy.ndarray of shape (n_samples x n_properties) property matrix - - X: ndarray of shape (n_samples x n_features) + X : numpy.ndarray of shape (n_samples x n_features) feature matrix - tol: float - cutoff for small eigenvalues to send to np.linalg.pinv - + cutoff for small eigenvalues to send to np.linalg.pinv copy: bool - whether to return a copy of y or edit in-place, default=True + whether to return a copy of y or edit in-place, default=True """ - v = np.linalg.pinv(np.matmul(X.T, X), rcond=tol) v = np.matmul(X, v) v = np.matmul(v, X.T) @@ -99,42 +87,31 @@ def Y_feature_orthogonalizer(y, X, tol=1e-12, copy=True): def Y_sample_orthogonalizer(y, X, y_ref, X_ref, tol=1e-12, copy=True): - """ - Orthogonalizes a matrix of targets :math:`{\\mathbf{Y}}` given a reference feature - matrix :math:`{\\mathbf{X}_r}` and reference target matrix :math:`{\\mathbf{Y}_r}`: + r"""Orthogonalizes a matrix of targets :math:`{\\mathbf{Y}}` given a reference + feature matrix :math:`{\\mathbf{X}_r}` and reference target matrix + :math:`{\\mathbf{Y}_r}`: .. 
-
        \\mathbf{Y} \\leftarrow \\mathbf{Y} -
        \\mathbf{X} \\left(\\mathbf{X}_{\\mathbf{r}}^T
        \\mathbf{X}_{\\mathbf{r}}\\right)^{-1}\\mathbf{X}_{\\mathbf{r}}^T
        \\mathbf{Y}_{\\mathbf{r}}
-
-
     Parameters
     ----------
-
-    y: ndarray of shape (n_samples x n_properties)
+    y : numpy.ndarray of shape (n_samples x n_properties)
         property matrix
-
-    X: ndarray of shape (n_samples x n_features)
+    X : numpy.ndarray of shape (n_samples x n_features)
         feature matrix
-
-    y_ref: ndarray of shape (n_ref x n_properties)
-        reference property matrix
-
-    X_ref: ndarray of shape (n_ref x n_features)
-        reference feature matrix
-
+    y_ref : numpy.ndarray of shape (n_ref x n_properties)
+        reference property matrix
+    X_ref : numpy.ndarray of shape (n_ref x n_features)
+        reference feature matrix
     tol: float
-        cutoff for small eigenvalues to send to np.linalg.pinv
-
+        cutoff for small eigenvalues to send to np.linalg.pinv
     copy: bool
-        whether to return a copy of y or edit in-place, default=True
-
+        whether to return a copy of y or edit in-place, default=True
     """
-
     y_frag = (X @ (np.linalg.lstsq(X_ref, y_ref, rcond=tol)[0])).reshape(y.shape)
 
     if copy:
diff --git a/src/skmatter/utils/_pcovr_utils.py b/src/skmatter/utils/_pcovr_utils.py
index 69ae2e3941..837ea394c4 100644
--- a/src/skmatter/utils/_pcovr_utils.py
+++ b/src/skmatter/utils/_pcovr_utils.py
@@ -114,8 +114,7 @@ def pcovr_covariance(
     random_state=0,
     iterated_power="auto",
 ):
-    r"""
-    Creates the PCovR modified covariance
+    r"""Creates the PCovR modified covariance.
 
     .. math::
 
@@ -153,7 +152,6 @@
         random seed to use for randomized svd
     """
-
     C = np.zeros((X.shape[1], X.shape[1]), dtype=np.float64)
 
     if mixing < 1 or return_isqrt:
@@ -198,39 +196,30 @@
 
 def pcovr_kernel(mixing, X, Y, **kernel_params):
-    r"""
-    Creates the PCovR modified kernel distances
+    r"""Creates the PCovR modified kernel distances.
 
     .. math::
-
         \mathbf{\tilde{K}} = \alpha \mathbf{K} +
         (1 - \alpha) \mathbf{Y}\mathbf{Y}^T
 
     the default kernel is the linear kernel, such that:
 
     .. math::
-
         \mathbf{\tilde{K}} = \alpha \mathbf{X} \mathbf{X}^T +
         (1 - \alpha) \mathbf{Y}\mathbf{Y}^T
 
     Parameters
     ----------
-
     mixing : float
         mixing parameter, as described in PCovR as :math:`{\alpha}`
-
-    X : ndarray of shape (n x m)
+    X : numpy.ndarray of shape (n x m)
         Data matrix :math:`\mathbf{X}`
-
-    Y : ndarray of shape (n x p)
+    Y : numpy.ndarray of shape (n x p)
         Array to include in biased selection when mixing < 1
-
     kernel_params : dict, optional
         dictionary of arguments to pass to pairwise_kernels
         if none are specified, assumes that the kernel is linear
-
     """
-
     K = np.zeros((X.shape[0], X.shape[0]))
     if mixing < 1:
         K += (1 - mixing) * Y @ Y.T
diff --git a/src/skmatter/utils/_progress_bar.py b/src/skmatter/utils/_progress_bar.py
index 19b8ac2912..01820698a8 100644
--- a/src/skmatter/utils/_progress_bar.py
+++ b/src/skmatter/utils/_progress_bar.py
@@ -1,7 +1,7 @@
 def get_progress_bar():
-    """
-    This function returns the appropriate version of tqdm, as determined by
-    tqdm.auto. If tqdm is not installed, an ImportError is raised.
+    """Returns the appropriate version of ``tqdm``, as determined by ``tqdm.auto``.
+
+    If ``tqdm`` is not installed, an :py:class:`ImportError` is raised.
     """
     try:
         from tqdm.auto import tqdm
 
         return tqdm
     except ImportError:
         raise ImportError(
-            "tqdm must be installed to use a progress bar."
-            "Either install tqdm or re-run with"
-            "progress_bar = False"
+            "tqdm must be installed to use a progress bar. Either install tqdm or "
+            "re-run with progress_bar = False"
         )
 
 
 def no_progress_bar(x):
-    """
-    This is the identity function, same as lambda x:x. It returns x.
-    """
+    """Identity function, same as ``lambda x: x``. It returns ``x``."""
     return x
diff --git a/tests/test_check_estimators.py b/tests/test_check_estimators.py
index b21835dcac..fc89ecdb4e 100644
--- a/tests/test_check_estimators.py
+++ b/tests/test_check_estimators.py
@@ -23,4 +23,5 @@
     ]
 )
 def test_sklearn_compatible_estimator(estimator, check):
+    """Test that the estimators are compatible with sklearn."""
     check(estimator)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index e976c5d7ca..5dd7f144a3 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -77,9 +77,7 @@ def setUpClass(cls):
             cls.has_pandas = False
 
     def test_load_dataset_without_pandas(self):
-        """
-        Check if the correct exception occurs when pandas isn't present.
-        """
+        """Check if the correct exception occurs when pandas isn't present."""
         with unittest.mock.patch.dict("sys.modules", {"pandas": None}):
             with self.assertRaises(ImportError) as cm:
                 _ = load_who_dataset()
@@ -95,9 +93,7 @@ def test_dataset_size_and_shape(self):
         self.assertEqual(self.who["data"].shape, self.shape)
 
     def test_datapoint_value(self):
-        """
-        Check if the value of a datapoint at a certain location is correct.
-        """
+        """Check if the value of a datapoint at a certain location is correct."""
         if self.has_pandas is True:
             self.assertTrue(
                 np.allclose(
@@ -120,9 +116,7 @@ def setUpClass(cls):
             cls.has_ase = False
 
     def test_load_dataset_without_ase(self):
-        """
-        Check if the correct exception occurs when ase isn't present.
-        """
+        """Check if the correct exception occurs when ase isn't present."""
         with unittest.mock.patch.dict("sys.modules", {"ase.io": None}):
             with self.assertRaises(ImportError) as cm:
                 _ = load_roy_dataset()
@@ -131,8 +125,8 @@ def test_load_dataset_without_ase(self):
         )
 
     def test_dataset_content(self):
-        """
-        Check if the correct number of datapoints are present in the dataset.
+        """Check if the correct number of datapoints are present in the dataset.
+
         Also check if the size of the dataset is correct.
         """
         if self.has_ase is True:
@@ -141,8 +135,8 @@ def test_dataset_content(self):
             self.assertEqual(len(self.roy["energies"]), self.size)
 
     def test_dataset_consistency(self):
-        """
-        Check if the energies in the structures are the same as in the explicit array.
+        """Check if the energies in the structures are the same as in the explicit
+        array.
         """
         if self.has_ase is True:
             self.assertTrue(
diff --git a/tests/test_dch.py b/tests/test_dch.py
index 98dd11ef80..afad6444e3 100644
--- a/tests/test_dch.py
+++ b/tests/test_dch.py
@@ -21,10 +21,8 @@ def setUp(self):
         )
 
     def test_selected_idx_and_scores(self):
-        """
-        This test is a regression test that checks that DCH selects correct vertices and
-        gets correct distances from the `score_feature_matrix` and `score_samples`
-        functions.
+        """Regression test that checks that DCH selects correct vertices and gets
+        correct distances from the `score_feature_matrix` and `score_samples` functions.
         """
         selector = DirectionalConvexHull()
         selector.fit(self.T, self.y)
@@ -169,11 +167,9 @@ def test_positive_score(self):
         self.assertTrue(np.all(distances >= -selector.tolerance))
 
     def test_score_function_warnings(self):
-        """
-        Ensure that calling `score_samples` with points outside the range causes an
+        """Ensure that calling `score_samples` with points outside the range causes an
         error.
""" - selector = DirectionalConvexHull(low_dim_idx=[0]) # high-dimensional dummy data, not important for the test X_high_dimensional = [1.0, 2.0, 3.0] diff --git a/tests/test_feature_pcov_cur.py b/tests/test_feature_pcov_cur.py index d1f22b3581..3f025d6704 100644 --- a/tests/test_feature_pcov_cur.py +++ b/tests/test_feature_pcov_cur.py @@ -12,20 +12,14 @@ def setUp(self): self.idx = [2, 8, 3, 4, 1, 7, 5, 9, 6] def test_known(self): - """ - This test checks that the model returns a known set of indices - """ - + """Check that the model returns a known set of indices.""" selector = PCovCUR(n_to_select=9) selector.fit(self.X, self.y) self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) def test_restart(self): - """ - This test checks that the model can be restarted with a new instance - """ - + """Check that the model can be restarted with a new instance.""" selector = PCovCUR(n_to_select=1) selector.fit(self.X, self.y) @@ -35,9 +29,7 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i], self.idx[i]) def test_non_it(self): - """ - This test checks that the model can be run non-iteratively - """ + """Check that the model can be run non-iteratively.""" self.idx = [2, 8, 3, 6, 7, 9, 1, 0, 5] selector = PCovCUR(n_to_select=9, recompute_every=0) selector.fit(self.X, self.y) diff --git a/tests/test_feature_pcov_fps.py b/tests/test_feature_pcov_fps.py index ae53f17960..321cc78ee3 100644 --- a/tests/test_feature_pcov_fps.py +++ b/tests/test_feature_pcov_fps.py @@ -11,11 +11,9 @@ def setUp(self): self.idx = [0, 2, 6, 7, 1, 3, 4] def test_restart(self): + """Check that the model can be restarted with a new number of features and + `warm_start`. """ - This test checks that the model can be restarted with a new number of - features and `warm_start` - """ - selector = PCovFPS(n_to_select=1, initialize=self.idx[0]) selector.fit(self.X, y=self.y) @@ -25,10 +23,7 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) def test_no_mixing_1(self): - """ - This test checks that the model throws an error when mixing = 1.0 - """ - + """Check that the model throws an error when mixing = 1.0.""" with self.assertRaises(ValueError) as cm: _ = PCovFPS(n_to_select=1, mixing=1.0) self.assertEqual( diff --git a/tests/test_feature_simple_cur.py b/tests/test_feature_simple_cur.py index 72554471d4..147a16fedc 100644 --- a/tests/test_feature_simple_cur.py +++ b/tests/test_feature_simple_cur.py @@ -18,10 +18,7 @@ def test_bad_transform(self): _ = selector.transform(self.X) def test_restart(self): - """ - This test checks that the model can be restarted with a new instance - """ - + """Check that the model can be restarted with a new instance.""" ref_selector = CUR(n_to_select=self.X.shape[-1] - 3).fit(X=self.X) ref_idx = ref_selector.selected_idx_ @@ -34,9 +31,7 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i], ref_idx[i]) def test_non_it(self): - """ - This test checks that the model can be run non-iteratively - """ + """Check that the model can be run non-iteratively.""" C = self.X.T @ self.X _, UC = np.linalg.eigh(C) ref_idx = np.argsort(-(UC[:, -1] ** 2.0))[:-1] diff --git a/tests/test_feature_simple_fps.py b/tests/test_feature_simple_fps.py index b29a2bc7bc..68e3ffdbcb 100644 --- a/tests/test_feature_simple_fps.py +++ b/tests/test_feature_simple_fps.py @@ -13,7 +13,7 @@ def setUp(self): def test_restart(self): """ - This test checks that the model can be restarted with a new number of + Check that the model can be restarted with a new 
number of features and `warm_start` """ selector = FPS(n_to_select=1, initialize=self.idx[0]) @@ -25,11 +25,9 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) def test_initialize(self): + """Check that the model can be initialized in all applicable manners and throws + an error otherwise. """ - This test checks that the model can be initialized in all applicable manners - and throws an error otherwise - """ - for initialize in [self.idx[0], "random"]: with self.subTest(initialize=initialize): selector = FPS(n_to_select=1, initialize=initialize) @@ -48,9 +46,7 @@ def test_initialize(self): self.assertEqual(str(cm.exception), "Invalid value of the initialize parameter") def test_get_distances(self): - """ - This test checks that the hausdorff distances are returnable after fitting - """ + """Check that the hausdorff distances are returnable after fitting.""" selector = FPS(n_to_select=7) selector.fit(self.X) d = selector.get_select_distance() diff --git a/tests/test_kernel_normalizer.py b/tests/test_kernel_normalizer.py index d17ddf9f33..a2297902b8 100644 --- a/tests/test_kernel_normalizer.py +++ b/tests/test_kernel_normalizer.py @@ -13,7 +13,8 @@ def __init__(self, *args, **kwargs): def test_sample_weights(self): """Checks that sample weights of one are equal to the unweighted case and - that nonuniform weights are different from the unweighted case""" + that nonuniform weights are different from the unweighted case. + """ K = self.random_state.uniform(0, 100, size=(3, 3)) equal_wts = np.ones(len(K)) nonequal_wts = self.random_state.uniform(0, 100, size=(len(K),)) @@ -31,7 +32,8 @@ def test_sample_weights(self): def test_invalid_sample_weights(self): """Checks that weights must be 1D array with the same length as the number of - samples""" + samples. + """ K = self.random_state.uniform(0, 100, size=(3, 3)) wts_len = np.ones(len(K) + 1) wts_dim = np.ones((len(K), 2)) @@ -49,8 +51,9 @@ def test_ValueError(self): model.fit(K) def test_reference_ValueError(self): - """Checks that it is impossible to normalize - a matrix with a non-coincident size with the reference.""" + """Checks that it is impossible to normalize a matrix with a non-coincident + size with the reference. + """ K = self.random_state.uniform(0, 100, size=(3, 3)) K_2 = self.random_state.uniform(0, 100, size=(2, 2)) model = KernelNormalizer() @@ -59,9 +62,9 @@ def test_reference_ValueError(self): model.transform(K_2) def test_NotFittedError_transform(self): - """Checks that an error is returned when - trying to use the transform function - before the fit function""" + """Checks that an error is returned when trying to use the transform function + before the fit function. + """ K = self.random_state.uniform(0, 100, size=(3, 3)) model = KernelNormalizer() with self.assertRaises(sklearn.exceptions.NotFittedError): @@ -69,8 +72,8 @@ def test_NotFittedError_transform(self): def test_fit_transform(self): """Checks that the kernel is correctly normalized. - Compare with the value calculated - directly from the equation. + + Compare with the value calculated directly from the equation. 
""" K = self.random_state.uniform(0, 100, size=(3, 3)) model = KernelNormalizer() diff --git a/tests/test_kernel_pcovr.py b/tests/test_kernel_pcovr.py index 8cb7b0297c..e4bbda52e4 100644 --- a/tests/test_kernel_pcovr.py +++ b/tests/test_kernel_pcovr.py @@ -51,7 +51,7 @@ def setUp(self): class KernelPCovRErrorTest(KernelPCovRBaseTest): def test_lr_with_x_errors(self): """ - This test checks that KernelPCovR returns a non-null property prediction + Check that KernelPCovR returns a non-null property prediction and that the prediction error increases with `mixing` """ prev_error = -1.0 @@ -73,11 +73,9 @@ def test_lr_with_x_errors(self): prev_error = error def test_reconstruction_errors(self): + """Check that KernelPCovR returns a non-null reconstructed X and that the + reconstruction error decreases with `mixing`. """ - This test checks that KernelPCovR returns a non-null reconstructed X - and that the reconstruction error decreases with `mixing` - """ - prev_error = 10.0 prev_x_error = 10.0 @@ -139,7 +137,7 @@ def test_kpcovr_error(self): class KernelPCovRInfrastructureTest(KernelPCovRBaseTest): def test_nonfitted_failure(self): """ - This test checks that KernelPCovR will raise a `NonFittedError` if + Check that KernelPCovR will raise a `NonFittedError` if `transform` is called before the model is fitted """ kpcovr = KernelPCovR(mixing=0.5, n_components=2, tol=1e-12) @@ -148,7 +146,7 @@ def test_nonfitted_failure(self): def test_no_arg_predict(self): """ - This test checks that KernelPCovR will raise a `ValueError` if + Check that KernelPCovR will raise a `ValueError` if `predict` is called without arguments """ kpcovr = KernelPCovR(mixing=0.5, n_components=2, tol=1e-12) @@ -158,7 +156,7 @@ def test_no_arg_predict(self): def test_T_shape(self): """ - This test checks that KernelPCovR returns a latent space projection + Check that KernelPCovR returns a latent space projection consistent with the shape of the input matrix """ n_components = 5 @@ -169,9 +167,7 @@ def test_T_shape(self): self.assertTrue(T.shape[-1] == n_components) def test_no_centerer(self): - """ - tests that when center=False, no centerer exists - """ + """Tests that when center=False, no centerer exists.""" kpcovr = self.model(center=False) kpcovr.fit(self.X, self.Y) @@ -179,10 +175,7 @@ def test_no_centerer(self): kpcovr.centerer_ def test_centerer(self): - """ - tests that all functionalities that rely on the centerer work properly - """ - + """Tests that all functionalities that rely on the centerer work properly.""" kpcovr = self.model(center=True) kpcovr.fit(self.X, self.Y) @@ -305,9 +298,8 @@ def test_precomputed_regression(self): class KernelTests(KernelPCovRBaseTest): def test_kernel_types(self): - """ - This test checks that KernelPCovR can handle all kernels passable to - sklearn kernel classes, including callable kernels + """Check that KernelPCovR can handle all kernels passable to sklearn + kernel classes, including callable kernels """ def _linear_kernel(X, Y): @@ -332,11 +324,9 @@ def _linear_kernel(X, Y): kpcovr.fit(self.X, self.Y) def test_linear_matches_pcovr(self): + """Check that KernelPCovR returns the same results as PCovR when using a linear + kernel. 
""" - This test checks that KernelPCovR returns the same results as PCovR when - using a linear kernel - """ - ridge = RidgeCV(fit_intercept=False, alphas=np.logspace(-8, 2)) ridge.fit(self.X, self.Y) @@ -394,7 +384,7 @@ def test_linear_matches_pcovr(self): class KernelPCovRTestSVDSolvers(KernelPCovRBaseTest): def test_svd_solvers(self): """ - This test checks that PCovR works with all svd_solver modes and assigns + Check that PCovR works with all svd_solver modes and assigns the right n_components """ for solver in ["arpack", "full", "randomized", "auto"]: @@ -433,7 +423,7 @@ def test_svd_solvers(self): def test_bad_solver(self): """ - This test checks that PCovR will not work with a solver that isn't in + Check that PCovR will not work with a solver that isn't in ['arpack', 'full', 'randomized', 'auto'] """ with self.assertRaises(ValueError) as cm: @@ -443,11 +433,7 @@ def test_bad_solver(self): self.assertTrue(str(cm.exception), "Unrecognized svd_solver='bad'" "") def test_good_n_components(self): - """ - This test checks that PCovR will work with any allowed values of - n_components. - """ - + """Check that PCovR will work with any allowed values of n_components.""" # this one should pass kpcovr = self.model(n_components=0.5, svd_solver="full") kpcovr.fit(self.X, self.Y) @@ -462,11 +448,7 @@ def test_good_n_components(self): kpcovr.fit(self.X, self.Y) def test_bad_n_components(self): - """ - This test checks that PCovR will not work with any prohibited values of - n_components. - """ - + """Check that PCovR will not work with any prohibited values of n_components.""" with self.subTest(type="negative_ncomponents"): with self.assertRaises(ValueError) as cm: kpcovr = self.model(n_components=-1, svd_solver="auto") diff --git a/tests/test_orthogonalizers.py b/tests/test_orthogonalizers.py index 0578141c85..016fd69882 100644 --- a/tests/test_orthogonalizers.py +++ b/tests/test_orthogonalizers.py @@ -136,7 +136,7 @@ def test_multicolumn(self): str(cm.exception), "You can only orthogonalize a matrix using a vector with the same number " f"of rows. Matrix X has {self.n_samples} rows, whereas the " - f"orthogonalizing matrix has {self.n_samples+4} rows.", + f"orthogonalizing matrix has {self.n_samples + 4} rows.", ) def test_warning(self): diff --git a/tests/test_pcovr.py b/tests/test_pcovr.py index 84a261c860..233f369e02 100644 --- a/tests/test_pcovr.py +++ b/tests/test_pcovr.py @@ -32,10 +32,7 @@ def setUp(self): class PCovRErrorTest(PCovRBaseTest): def test_against_pca(self): - """ - Tests that mixing = 1.0 corresponds to PCA - """ - + """Tests that mixing = 1.0 corresponds to PCA.""" pcovr = PCovR( mixing=1.0, n_components=3, space="sample", svd_solver="full" ).fit(self.X, self.Y) @@ -54,11 +51,9 @@ def test_against_pca(self): ) def test_simple_reconstruction(self): + """Check that PCovR with a full eigendecomposition at mixing=1 can fully + reconstruct the input matrix. """ - This test checks that PCovR with a full eigendecomposition at mixing=1 - can fully reconstruct the input matrix. - """ - for space in ["feature", "sample", "auto"]: with self.subTest(space=space): pcovr = self.model( @@ -73,7 +68,7 @@ def test_simple_reconstruction(self): def test_simple_prediction(self): """ - This test checks that PCovR with a full eigendecomposition at mixing=0 + Check that PCovR with a full eigendecomposition at mixing=0 can fully reconstruct the input properties. 
""" for space in ["feature", "sample", "auto"]: @@ -92,7 +87,7 @@ def test_simple_prediction(self): def test_lr_with_x_errors(self): """ - This test checks that PCovR returns a non-null property prediction + Check that PCovR returns a non-null property prediction and that the prediction error increases with `mixing` """ prev_error = -1.0 @@ -112,12 +107,9 @@ def test_lr_with_x_errors(self): prev_error = error def test_lr_with_t_errors(self): + """Check that PCovR returns a non-null property prediction from the latent space + projection and that the prediction error increases with `mixing`. """ - This test checks that PCovR returns a non-null property prediction - from the latent space projection and that the prediction error - increases with `mixing` - """ - prev_error = -1.0 for mixing in np.linspace(0, 1, 11): @@ -136,11 +128,9 @@ def test_lr_with_t_errors(self): prev_error = error def test_reconstruction_errors(self): + """Check that PCovR returns a non-null reconstructed X and that the + reconstruction error decreases with `mixing`. """ - This test checks that PCovR returns a non-null reconstructed X - and that the reconstruction error decreases with `mixing` - """ - prev_error = 1.0 for mixing in np.linspace(0, 1, 11): @@ -161,7 +151,7 @@ def test_reconstruction_errors(self): class PCovRSpaceTest(PCovRBaseTest): def test_select_feature_space(self): """ - This test checks that PCovR implements the feature space version + Check that PCovR implements the feature space version when :math:`n_{features} < n_{samples}``. """ pcovr = self.model(n_components=2, tol=1e-12) @@ -171,7 +161,7 @@ def test_select_feature_space(self): def test_select_sample_space(self): """ - This test checks that PCovR implements the sample space version + Check that PCovR implements the sample space version when :math:`n_{features} > n_{samples}``. """ pcovr = self.model(n_components=2, tol=1e-12) @@ -183,7 +173,7 @@ def test_select_sample_space(self): def test_bad_space(self): """ - This test checks that PCovR raises a ValueError when a non-valid + Check that PCovR raises a ValueError when a non-valid space is designated. """ with self.assertRaises(ValueError): @@ -192,7 +182,7 @@ def test_bad_space(self): def test_override_spaceselection(self): """ - This test checks that PCovR implements the space provided in the + Check that PCovR implements the space provided in the constructor, overriding that chosen by the input dimensions. """ pcovr = self.model(n_components=2, tol=1e-12, space="sample") @@ -202,7 +192,7 @@ def test_override_spaceselection(self): def test_spaces_equivalent(self): """ - This test checks that the results from PCovR, regardless of the space, + Check that the results from PCovR, regardless of the space, are equivalent. 
""" for alpha in np.linspace(0.01, 0.99, 11): @@ -248,7 +238,7 @@ def test_spaces_equivalent(self): class PCovRTestSVDSolvers(PCovRBaseTest): def test_svd_solvers(self): """ - This test checks that PCovR works with all svd_solver modes and assigns + Check that PCovR works with all svd_solver modes and assigns the right n_components """ for solver in ["arpack", "full", "randomized", "auto"]: @@ -263,7 +253,7 @@ def test_svd_solvers(self): def test_bad_solver(self): """ - This test checks that PCovR will not work with a solver that isn't in + Check that PCovR will not work with a solver that isn't in ['arpack', 'full', 'randomized', 'auto'] """ for space in ["feature", "sample"]: @@ -274,11 +264,7 @@ def test_bad_solver(self): self.assertEqual(str(cm.exception), "Unrecognized svd_solver='bad'" "") def test_good_n_components(self): - """ - This test checks that PCovR will work with any allowed values of - n_components. - """ - + """Check that PCovR will work with any allowed values of n_components.""" # this one should pass pcovr = self.model(n_components=0.5, svd_solver="full") pcovr.fit(self.X, self.Y) @@ -293,11 +279,7 @@ def test_good_n_components(self): pcovr.fit(self.X, self.Y) def test_bad_n_components(self): - """ - This test checks that PCovR will not work with any prohibited values of - n_components. - """ - + """Check that PCovR will not work with any prohibited values of n_components.""" with self.assertRaises(ValueError) as cm: pcovr = self.model(n_components="mle", svd_solver="full") pcovr.fit(self.X[:2], self.Y[:2]) @@ -370,7 +352,7 @@ def test_bad_n_components(self): class PCovRInfrastructureTest(PCovRBaseTest): def test_nonfitted_failure(self): """ - This test checks that PCovR will raise a `NonFittedError` if + Check that PCovR will raise a `NonFittedError` if `transform` is called before the pcovr is fitted """ pcovr = self.model(n_components=2, tol=1e-12) @@ -379,7 +361,7 @@ def test_nonfitted_failure(self): def test_no_arg_predict(self): """ - This test checks that PCovR will raise a `ValueError` if + Check that PCovR will raise a `ValueError` if `predict` is called without arguments """ pcovr = self.model(n_components=2, tol=1e-12) @@ -389,7 +371,7 @@ def test_no_arg_predict(self): def test_centering(self): """ - This test checks that PCovR raises a warning if + Check that PCovR raises a warning if given uncentered data. 
""" pcovr = self.model(n_components=2, tol=1e-12) @@ -404,7 +386,7 @@ def test_centering(self): def test_T_shape(self): """ - This test checks that PCovR returns a latent space projection + Check that PCovR returns a latent space projection consistent with the shape of the input matrix """ n_components = 5 diff --git a/tests/test_progress_bar.py b/tests/test_progress_bar.py index e304cd3d03..d06440e20b 100644 --- a/tests/test_progress_bar.py +++ b/tests/test_progress_bar.py @@ -6,7 +6,7 @@ class PBarTest(unittest.TestCase): def test_no_tqdm(self): """ - This test checks that the model cannot use a progress bar when tqdm + Check that the model cannot use a progress bar when tqdm is not installed """ import sys diff --git a/tests/test_sample_pcov_cur.py b/tests/test_sample_pcov_cur.py index c14c50c486..cb05326aac 100644 --- a/tests/test_sample_pcov_cur.py +++ b/tests/test_sample_pcov_cur.py @@ -16,20 +16,14 @@ def setUp(self): self.idx = [256, 304, 41, 408, 311, 364, 152, 78, 359, 102] def test_known(self): - """ - This test checks that the model returns a known set of indices - """ - + """Check that the model returns a known set of indices.""" selector = PCovCUR(n_to_select=10, mixing=0.5) selector.fit(self.X, self.y) self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) def test_restart(self): - """ - This test checks that the model can be restarted with a new instance - """ - + """Check that the model can be restarted with a new instance.""" selector = PCovCUR(n_to_select=1, mixing=0.5) selector.fit(self.X, self.y) @@ -49,9 +43,7 @@ def test_restart(self): ) def test_non_it(self): - """ - This test checks that the model can be run non-iteratively - """ + """Check that the model can be run non-iteratively.""" self.idx = [256, 32, 138, 290, 362, 141, 359, 428, 254, 9] selector = PCovCUR(n_to_select=10, recompute_every=0) selector.fit(self.X, self.y) @@ -59,10 +51,7 @@ def test_non_it(self): self.assertTrue(np.allclose(selector.selected_idx_, self.idx)) def test_multiple_k(self): - """ - This test checks that the model can be run with multiple k's - """ - + """Check that the model can be run with multiple k's.""" for k in list(set(np.logspace(0, np.log10(min(self.X.shape)), 4, dtype=int))): selector = PCovCUR(n_to_select=10, k=k) selector.fit(self.X, self.y) diff --git a/tests/test_sample_pcov_fps.py b/tests/test_sample_pcov_fps.py index 8a083a776e..9aab7ed5d2 100644 --- a/tests/test_sample_pcov_fps.py +++ b/tests/test_sample_pcov_fps.py @@ -11,11 +11,9 @@ def setUp(self): self.idx = [0, 256, 156, 324, 349, 77, 113, 441, 426, 51] def test_restart(self): + """Check that the model can be restarted with a new number of samples and + `warm_start`. 
""" - This test checks that the model can be restarted with a new number of - samples and `warm_start` - """ - selector = PCovFPS(n_to_select=1, initialize=self.idx[0]) selector.fit(self.X, y=self.y) @@ -25,10 +23,7 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) def test_no_mixing_1(self): - """ - This test checks that the model throws an error when mixing = 1.0 - """ - + """Check that the model throws an error when mixing = 1.0.""" with self.assertRaises(ValueError) as cm: _ = PCovFPS(n_to_select=1, mixing=1.0) self.assertEquals( diff --git a/tests/test_sample_simple_cur.py b/tests/test_sample_simple_cur.py index b3a9437e16..0969074a39 100644 --- a/tests/test_sample_simple_cur.py +++ b/tests/test_sample_simple_cur.py @@ -14,7 +14,7 @@ def setUp(self): def test_sample_transform(self): """ - This test checks that an error is raised when the transform function is used, + Check that an error is raised when the transform function is used, because sklearn does not support well transformers that change the number of samples with other classes like Pipeline """ @@ -29,10 +29,7 @@ def test_sample_transform(self): ) def test_restart(self): - """ - This test checks that the model can be restarted with a new instance - """ - + """Check that the model can be restarted with a new instance""" ref_selector = CUR(n_to_select=self.n_select) ref_idx = ref_selector.fit(self.X).selected_idx_ @@ -45,10 +42,7 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i], ref_idx[i]) def test_non_it(self): - """ - This test checks that the model can be run non-iteratively - """ - + """Check that the model can be run non-iteratively.""" K = self.X @ self.X.T _, UK = np.linalg.eigh(K) ref_idx = np.argsort(-(UK[:, -1] ** 2.0))[: self.n_select] diff --git a/tests/test_sample_simple_fps.py b/tests/test_sample_simple_fps.py index ca7ee4beed..cbf94fcf89 100644 --- a/tests/test_sample_simple_fps.py +++ b/tests/test_sample_simple_fps.py @@ -12,11 +12,9 @@ def setUp(self): self.idx = [0, 123, 441, 187, 117, 276, 261, 281, 251, 193] def test_restart(self): + """Checks that the model can be restarted with a new number of samples and + `warm_start`. """ - This test checks that the model can be restarted with a new number of - samples and `warm_start` - """ - selector = FPS(n_to_select=1, initialize=self.idx[0]) selector.fit(self.X) @@ -26,11 +24,9 @@ def test_restart(self): self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1]) def test_initialize(self): + """Checks that the model can be initialized in all applicable manners and throws + an error otherwise. 
""" - This test checks that the model can be initialized in all applicable manners - and throws an error otherwise - """ - for initialize in [self.idx[0], "random"]: with self.subTest(initialize=initialize): selector = FPS(n_to_select=1, initialize=initialize) @@ -51,9 +47,7 @@ def test_initialize(self): ) def test_get_distances(self): - """ - This test checks that the hausdorff distances are returnable after fitting - """ + """Checks that the hausdorff distances are returnable after fitting.""" selector = FPS(n_to_select=1) selector.fit(self.X) _ = selector.get_select_distance() diff --git a/tests/test_sparse_kernel_centerer.py b/tests/test_sparse_kernel_centerer.py index 619e8e3870..08ab77f52c 100644 --- a/tests/test_sparse_kernel_centerer.py +++ b/tests/test_sparse_kernel_centerer.py @@ -13,7 +13,8 @@ def __init__(self, *args, **kwargs): def test_sample_weights(self): """Checks that sample weights of one are equal to the unweighted case and that - the nonuniform weights are different from the unweighted case""" + the nonuniform weights are different from the unweighted case. + """ X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) @@ -40,7 +41,8 @@ def test_sample_weights(self): def test_invalid_sample_weights(self): """Checks that weights must be 1D array with the same length as the number of - samples""" + samples. + """ X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) @@ -56,8 +58,7 @@ def test_invalid_sample_weights(self): model.fit_transform(Knm, Kmm, sample_weight=wts_dim) def test_Square_Kmm(self): - """Checks that the passed active kernel is square""" - + """Checks that the passed active kernel is square.""" X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) @@ -71,8 +72,8 @@ def test_Square_Kmm(self): def test_LatterDim(self): """Checks that a matrix must have the same latter dimension as its active - counterpart cannot be normalized.""" - + counterpart cannot be normalized. + """ X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) @@ -84,12 +85,13 @@ def test_LatterDim(self): model.fit(Knm, Kmm) self.assertEqual( str(cm.exception), - "The reference kernel is not " "commensurate shape with the active kernel.", + "The reference kernel is not commensurate shape with the active kernel.", ) def test_new_kernel(self): - """Checks that it is impossible to normalize - a matrix with a non-coincident size with the reference.""" + """Checks that it is impossible to normalize a matrix with a non-coincident size + with the reference. + """ X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) @@ -107,9 +109,9 @@ def test_new_kernel(self): ) def test_NotFittedError_transform(self): - """Checks that an error is returned when - trying to use the transform function - before the fit function""" + """Checks that an error is returned when trying to use the transform function + before the fit function + """ K = self.random_state.uniform(0, 100, size=(3, 3)) model = SparseKernelCenterer() with self.assertRaises(sklearn.exceptions.NotFittedError): @@ -117,10 +119,9 @@ def test_NotFittedError_transform(self): def test_fit_transform(self): """Checks that the kernel is correctly normalized. - Compare with the value calculated - directly from the equation. - """ + Compare with the value calculated directly from the equation. 
+ """ X = self.random_state.uniform(-1, 1, size=(4, 5)) X_sparse = self.random_state.uniform(-1, 1, size=(3, 5)) diff --git a/tests/test_standard_flexible_scaler.py b/tests/test_standard_flexible_scaler.py index e1d6cc1f60..7d5de796c1 100644 --- a/tests/test_standard_flexible_scaler.py +++ b/tests/test_standard_flexible_scaler.py @@ -15,7 +15,8 @@ def __init__(self, *args, **kwargs): def test_sample_weights(self): """Checks that sample weights of one are equal to the unweighted case. - Also, that the nonuniform weights are different from the unweighted case""" + Also, that the nonuniform weights are different from the unweighted case + """ X = self.random_state.uniform(0, 100, size=(3, 3)) equal_wts = np.ones(len(X)) nonequal_wts = self.random_state.uniform(0, 100, size=(len(X),)) @@ -33,7 +34,8 @@ def test_sample_weights(self): def test_invalid_sample_weights(self): """Checks that weights must be 1D array with the same length as the number of - samples""" + samples + """ X = self.random_state.uniform(0, 100, size=(3, 3)) wts_len = np.ones(len(X) + 1) wts_dim = np.ones((len(X), 2)) @@ -106,17 +108,18 @@ def test_inverse_transform(self): self.assertTrue((np.isclose(Y, Y_inv, atol=1e-12)).all()) def test_NotFittedError_transform(self): - """Checks that an error is returned when - trying to use the transform function - before the fit function""" + """Checks that an error is returned when trying to use the transform function + before the fit function. + """ X = self.random_state.uniform(0, 100, size=(3, 3)) model = StandardFlexibleScaler(column_wise=True) with self.assertRaises(sklearn.exceptions.NotFittedError): model.transform(X) def test_shape_inconsistent_transform(self): - """Checks that an error is returned when attempting - to use the transform function with mismatched matrix sizes.""" + """Checks that an error is returned when attempting to use the transform + function with mismatched matrix sizes. + """ X = self.random_state.uniform(0, 100, size=(3, 3)) X_test = self.random_state.uniform(0, 100, size=(4, 4)) model = StandardFlexibleScaler(column_wise=True) @@ -125,8 +128,9 @@ def test_shape_inconsistent_transform(self): model.transform(X_test) def test_shape_inconsistent_inverse(self): - """Checks that an error is returned when attempting - to use the inverse transform function with mismatched matrix sizes.""" + """Checks that an error is returned when attempting to use the inverse transform + function with mismatched matrix sizes. + """ X = self.random_state.uniform(0, 100, size=(3, 3)) X_test = self.random_state.uniform(0, 100, size=(4, 4)) model = StandardFlexibleScaler(column_wise=True) @@ -135,17 +139,18 @@ def test_shape_inconsistent_inverse(self): model.inverse_transform(X_test) def test_NotFittedError_inverse(self): - """Checks that an error is returned when - trying to use the inverse transform function - before the fit function""" + """Checks that an error is returned when trying to use the inverse transform + function before the fit function. + """ X = self.random_state.uniform(0, 100, size=(3, 3)) model = StandardFlexibleScaler() with self.assertRaises(sklearn.exceptions.NotFittedError): model.inverse_transform(X) def test_ValueError_column_wise(self): - """Checks that the matrix cannot be normalized - across columns if there is a zero variation column.""" + """Checks that the matrix cannot be normalized across columns if there is a zero + variation column. 
+        """
        X = self.random_state.uniform(0, 100, size=(3, 3))
         X[0][0] = X[1][0] = X[2][0] = 2
         model = StandardFlexibleScaler(column_wise=True)
         with self.assertRaises(ValueError):
             model.fit(X)
 
     def test_atol(self):
         """Checks that we can define absolute tolerance and it control the
-        minimal variance of columns ot the whole matrix"""
+        minimal variance of columns or the whole matrix.
+        """
         X = self.random_state.uniform(0, 100, size=(3, 3))
         atol = ((X[:, 0] - X[:, 0].mean(axis=0)) ** 2).mean(axis=0) + 1e-8
         model = StandardFlexibleScaler(column_wise=True, atol=atol, rtol=0)
@@ -167,7 +173,8 @@
 
     def test_rtol(self):
         """Checks that we can define relative tolerance and it control the
-        minimal variance of columns or the whole matrix"""
+        minimal variance of columns or the whole matrix.
+        """
         X = self.random_state.uniform(0, 100, size=(3, 3))
         mean = X[:, 0].mean(axis=0)
         rtol = ((X[:, 0] - mean) ** 2).mean(axis=0) / mean + 1e-8
@@ -181,16 +188,16 @@
             model.fit(X)
 
     def test_ValueError_full(self):
-        """Checks that the matrix cannot be normalized
-        if there is a zero variation matrix."""
+        """Checks that the matrix cannot be normalized if there is a zero variation
+        matrix.
+        """
         X = np.array([2, 2, 2]).reshape(-1, 1)
         model = StandardFlexibleScaler(column_wise=False)
         with self.assertRaises(ValueError):
             model.fit(X)
 
     def test_not_w_mean(self):
-        """Checks that the matrix normalized `with_mean=False`
-        does not have a mean."""
+        """Checks that the matrix normalized `with_mean=False` does not have a mean."""
         X = np.array([2, 2, 3]).reshape(-1, 1)
         model = StandardFlexibleScaler(with_mean=False)
         model.fit(X)
diff --git a/tests/test_voronoi_fps.py b/tests/test_voronoi_fps.py
index 1a2c6b3140..55cb194fbb 100644
--- a/tests/test_voronoi_fps.py
+++ b/tests/test_voronoi_fps.py
@@ -12,11 +12,9 @@ def setUp(self):
         super().setUp()
 
     def test_restart(self):
-        """
-        This test checks that the model can be restarted with a new number of
+        """Checks that the model can be restarted with a new number of
         features and `warm_start`
         """
-
         selector = VoronoiFPS(n_to_select=1, initialize=self.idx[0])
         selector.fit(self.X)
 
@@ -26,11 +24,9 @@
             self.assertEqual(selector.selected_idx_[i - 1], self.idx[i - 1])
 
     def test_initialize(self):
-        """
-        This test checks that the model can be initialized in all applicable manners
+        """Checks that the model can be initialized in all applicable manners
         and throws an error otherwise
         """
-
         for initialize in [self.idx[0], "random"]:
             with self.subTest(initialize=initialize):
                 selector = VoronoiFPS(n_to_select=1, initialize=initialize)
@@ -44,8 +40,7 @@
         )
 
     def test_switching_point(self):
-        """
-        This test check work of the switching point calculator into the
+        """Check work of the switching point calculator into the
         _init_greedy_search function
         """
         selector = VoronoiFPS(n_to_select=1)
@@ -94,9 +89,7 @@
         )
 
     def test_get_distances(self):
-        """
-        This test checks that the hausdorff distances are returnable after fitting
-        """
+        """Checks that the hausdorff distances are returnable after fitting"""
         selector = VoronoiFPS(n_to_select=1)
         selector.fit(self.X)
         _ = selector.get_select_distance()
@@ -106,8 +99,7 @@
             _ = selector.get_select_distance()
 
     def test_comparison(self):
-        """
-        This test checks that the voronoi FPS strictly computes less distances
+        """Checks that the voronoi FPS strictly computes less distances
""" vselector = VoronoiFPS(n_to_select=self.X.shape[0] - 1) @@ -119,9 +111,8 @@ def test_comparison(self): self.assertTrue(np.allclose(vselector.selected_idx_, selector.selected_idx_)) def test_nothing_updated_points(self): - """ - This test checks that in the case where we have no points to update, - the code still works fine + """Checks that in the case where we have no points to update, the code + still works fine """ X = np.array([[1, 1], [4, 4], [10, 10], [100, 100]]) selector = VoronoiFPS(n_to_select=3, initialize=0) @@ -165,7 +156,7 @@ def test_calculate_dSL(self): ) def test_score(self): - """This test check that function score return hausdorff distance""" + """Check that function score return hausdorff distance""" selector = VoronoiFPS(n_to_select=3, initialize=0) selector.fit(self.X) diff --git a/tox.ini b/tox.ini index 5920c6a90c..dee237d9ab 100644 --- a/tox.ini +++ b/tox.ini @@ -57,6 +57,7 @@ deps = blackdoc flake8 flake8-bugbear + flake8-docstrings flake8-sphinx-links isort sphinx-lint @@ -96,7 +97,16 @@ commands = max_line_length = 88 exclude = docs/src/examples/ +docstring-convention = numpy per-file-ignores = # D205 and D400 are incompatible with the requirements of sphinx-gallery examples/**:D205, D400 -extend-ignore = E203 +ignore = + E203 + D100 + D101 + D102 + D205 + D400 + D401 + W503