From 98457b0ce690a9d819b09cbba1abc18bcacd04fa Mon Sep 17 00:00:00 2001 From: Doug Date: Mon, 5 Aug 2024 17:38:31 +0100 Subject: [PATCH 01/15] Add checks for threshold to be in range of variables. Write fit method. --- feature_engine/discretisation/binarizer.py | 183 ++++++++++++++++++ .../tests_binarizer/test_binarizer.py | 11 ++ 2 files changed, 194 insertions(+) create mode 100644 feature_engine/discretisation/binarizer.py create mode 100644 feature_engine/discretisation/tests_binarizer/test_binarizer.py diff --git a/feature_engine/discretisation/binarizer.py b/feature_engine/discretisation/binarizer.py new file mode 100644 index 000000000..d9d70dd0a --- /dev/null +++ b/feature_engine/discretisation/binarizer.py @@ -0,0 +1,183 @@ +from typing import List, Optional, Union + +import pandas as pd + +from feature_engine._check_init_parameters.check_variables import ( + _check_variables_input_value, +) +from feature_engine._docstrings.fit_attributes import ( + _binner_dict_docstring, + _feature_names_in_docstring, + _n_features_in_docstring, + _variables_attribute_docstring, +) +from feature_engine._docstrings.init_parameters.all_trasnformers import ( + _variables_numerical_docstring, +) +from feature_engine._docstrings.init_parameters.discretisers import ( + _precision_docstring, + _return_boundaries_docstring, + _return_object_docstring, +) +from feature_engine._docstrings.methods import ( + _fit_discretiser_docstring, + _fit_transform_docstring, + _transform_discretiser_docstring, +) +from feature_engine._docstrings.substitute import Substitution +from feature_engine.discretisation.base_discretiser import BaseDiscretiser + + +@Substitution( + return_object=_return_object_docstring, + return_boundaries=_return_boundaries_docstring, + precision=_precision_docstring, + binner_dict_=_binner_dict_docstring, + fit=_fit_discretiser_docstring, + transform=_transform_discretiser_docstring, + variables=_variables_numerical_docstring, + 
variables_=_variables_attribute_docstring, + feature_names_in_=_feature_names_in_docstring, + n_features_in_=_n_features_in_docstring, + fit_transform=_fit_transform_docstring, +) +class Binarizer(BaseDiscretiser): + """ + TODO: FIX THE DOCSTRING. SEE BELOW FOR EXAMPLE + The EqualWidthDiscretiser() divides continuous numerical variables into + intervals of the same width, that is, equidistant intervals. Note that the + proportion of observations per interval may vary. + + The size of the interval is calculated as: + + .. math:: + + ( max(X) - min(X) ) / bins + + where bins, which is the number of intervals, is determined by the user. + + The EqualWidthDiscretiser() works only with numerical variables. + A list of variables can be passed as argument. Alternatively, the discretiser + will automatically select all numerical variables. + + The EqualWidthDiscretiser() first finds the boundaries for the intervals for + each variable. Then, it transforms the variables, that is, sorts the values into + the intervals. + + More details in the :ref:`User Guide `. + + Parameters + ---------- + {variables} + + bins: int, default=10 + Desired number of equal width intervals / bins. + + {return_object} + + {return_boundaries} + + {precision} + + Attributes + ---------- + {binner_dict_} + + {variables_} + + {feature_names_in_} + + {n_features_in_} + + Methods + ------- + {fit} + + {fit_transform} + + {transform} + + See Also + -------- + pandas.cut + sklearn.preprocessing.KBinsDiscretizer + + References + ---------- + .. [1] Kotsiantis and Pintelas, "Data preprocessing for supervised leaning," + International Journal of Computer Science, vol. 1, pp. 111 117, 2006. + + .. [2] Dong. "Beating Kaggle the easy way". Master Thesis. 
+ https://www.ke.tu-darmstadt.de/lehre/arbeiten/studien/2015/Dong_Ying.pdf + + Examples + -------- + + >>> import pandas as pd + >>> import numpy as np + >>> from feature_engine.discretisation import EqualWidthDiscretiser + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.randint(1,100, 100))) + >>> ewd = EqualWidthDiscretiser() + >>> ewd.fit(X) + >>> ewd.transform(X)["x"].value_counts() + 9 15 + 6 15 + 0 13 + 5 11 + 8 9 + 7 8 + 2 8 + 1 7 + 3 7 + 4 7 + Name: x, dtype: int64 + """ + + def __init__( + self, + variables: Union[None, int, str, List[Union[str, int]]] = None, + threshold = None, + return_object: bool = False, + return_boundaries: bool = False, + precision: int = 3, + ) -> None: + + if not threshold: + raise ValueError(f"threshold not supplied. Please provide a threshold of type float or int.") + + if not isinstance(threshold, (int, float)): + raise TypeError(f"threshold must be an integer or a float. Got type {type(threshold)} instead.") + + super().__init__(return_object, return_boundaries, precision) + + self.variables = _check_variables_input_value(variables) + self.threshold = threshold + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + Learn the boundaries of the equal width intervals / bins for each + variable. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training dataset. Can be the entire dataframe, not just the variables + to be transformed. + y: None + y is not needed in this encoder. You can pass y or None. + """ + + # check input dataframe + X = super().fit(X) + + # Check threshold is in between max and min of all features in self.variables. + thresh_checks = all([self.threshold > min(X[col]) and self.threshold < max(X[col]) for col in self.variables]) + + if not thresh_checks: + print(f"threshold outside of range for one or more variables {self.variables}. 
Features {self.variables} will not be transformed.") + + return self + + def transform(): + pass \ No newline at end of file diff --git a/feature_engine/discretisation/tests_binarizer/test_binarizer.py b/feature_engine/discretisation/tests_binarizer/test_binarizer.py new file mode 100644 index 000000000..e1f09bb5a --- /dev/null +++ b/feature_engine/discretisation/tests_binarizer/test_binarizer.py @@ -0,0 +1,11 @@ +import numpy as np +import pandas as pd + +from feature_engine.discretisation.binarizer import Binarizer + +np.random.seed(42) +X = pd.DataFrame(dict(x = np.random.randint(1, 100, 100))) + +b = Binarizer(threshold=200, variables=['x']) + +b.fit(X) From 6cdf958d110cd7be6c0065594a63fe1215659c1a Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 16:10:10 +0100 Subject: [PATCH 02/15] Removing files with -z spelling --- feature_engine/discretisation/binarizer.py | 183 ------------------ .../tests_binarizer/test_binarizer.py | 11 -- 2 files changed, 194 deletions(-) delete mode 100644 feature_engine/discretisation/binarizer.py delete mode 100644 feature_engine/discretisation/tests_binarizer/test_binarizer.py diff --git a/feature_engine/discretisation/binarizer.py b/feature_engine/discretisation/binarizer.py deleted file mode 100644 index d9d70dd0a..000000000 --- a/feature_engine/discretisation/binarizer.py +++ /dev/null @@ -1,183 +0,0 @@ -from typing import List, Optional, Union - -import pandas as pd - -from feature_engine._check_init_parameters.check_variables import ( - _check_variables_input_value, -) -from feature_engine._docstrings.fit_attributes import ( - _binner_dict_docstring, - _feature_names_in_docstring, - _n_features_in_docstring, - _variables_attribute_docstring, -) -from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _variables_numerical_docstring, -) -from feature_engine._docstrings.init_parameters.discretisers import ( - _precision_docstring, - _return_boundaries_docstring, - _return_object_docstring, -) -from 
feature_engine._docstrings.methods import ( - _fit_discretiser_docstring, - _fit_transform_docstring, - _transform_discretiser_docstring, -) -from feature_engine._docstrings.substitute import Substitution -from feature_engine.discretisation.base_discretiser import BaseDiscretiser - - -@Substitution( - return_object=_return_object_docstring, - return_boundaries=_return_boundaries_docstring, - precision=_precision_docstring, - binner_dict_=_binner_dict_docstring, - fit=_fit_discretiser_docstring, - transform=_transform_discretiser_docstring, - variables=_variables_numerical_docstring, - variables_=_variables_attribute_docstring, - feature_names_in_=_feature_names_in_docstring, - n_features_in_=_n_features_in_docstring, - fit_transform=_fit_transform_docstring, -) -class Binarizer(BaseDiscretiser): - """ - TODO: FIX THE DOCSTRING. SEE BELOW FOR EXAMPLE - The EqualWidthDiscretiser() divides continuous numerical variables into - intervals of the same width, that is, equidistant intervals. Note that the - proportion of observations per interval may vary. - - The size of the interval is calculated as: - - .. math:: - - ( max(X) - min(X) ) / bins - - where bins, which is the number of intervals, is determined by the user. - - The EqualWidthDiscretiser() works only with numerical variables. - A list of variables can be passed as argument. Alternatively, the discretiser - will automatically select all numerical variables. - - The EqualWidthDiscretiser() first finds the boundaries for the intervals for - each variable. Then, it transforms the variables, that is, sorts the values into - the intervals. - - More details in the :ref:`User Guide `. - - Parameters - ---------- - {variables} - - bins: int, default=10 - Desired number of equal width intervals / bins. 
- - {return_object} - - {return_boundaries} - - {precision} - - Attributes - ---------- - {binner_dict_} - - {variables_} - - {feature_names_in_} - - {n_features_in_} - - Methods - ------- - {fit} - - {fit_transform} - - {transform} - - See Also - -------- - pandas.cut - sklearn.preprocessing.KBinsDiscretizer - - References - ---------- - .. [1] Kotsiantis and Pintelas, "Data preprocessing for supervised leaning," - International Journal of Computer Science, vol. 1, pp. 111 117, 2006. - - .. [2] Dong. "Beating Kaggle the easy way". Master Thesis. - https://www.ke.tu-darmstadt.de/lehre/arbeiten/studien/2015/Dong_Ying.pdf - - Examples - -------- - - >>> import pandas as pd - >>> import numpy as np - >>> from feature_engine.discretisation import EqualWidthDiscretiser - >>> np.random.seed(42) - >>> X = pd.DataFrame(dict(x = np.random.randint(1,100, 100))) - >>> ewd = EqualWidthDiscretiser() - >>> ewd.fit(X) - >>> ewd.transform(X)["x"].value_counts() - 9 15 - 6 15 - 0 13 - 5 11 - 8 9 - 7 8 - 2 8 - 1 7 - 3 7 - 4 7 - Name: x, dtype: int64 - """ - - def __init__( - self, - variables: Union[None, int, str, List[Union[str, int]]] = None, - threshold = None, - return_object: bool = False, - return_boundaries: bool = False, - precision: int = 3, - ) -> None: - - if not threshold: - raise ValueError(f"threshold not supplied. Please provide a threshold of type float or int.") - - if not isinstance(threshold, (int, float)): - raise TypeError(f"threshold must be an integer or a float. Got type {type(threshold)} instead.") - - super().__init__(return_object, return_boundaries, precision) - - self.variables = _check_variables_input_value(variables) - self.threshold = threshold - - def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): - """ - Learn the boundaries of the equal width intervals / bins for each - variable. - - Parameters - ---------- - X: pandas dataframe of shape = [n_samples, n_features] - The training dataset. 
Can be the entire dataframe, not just the variables - to be transformed. - y: None - y is not needed in this encoder. You can pass y or None. - """ - - # check input dataframe - X = super().fit(X) - - # Check threshold is in between max and min of all features in self.variables. - thresh_checks = all([self.threshold > min(X[col]) and self.threshold < max(X[col]) for col in self.variables]) - - if not thresh_checks: - print(f"threshold outside of range for one or more variables {self.variables}. Features {self.variables} will not be transformed.") - - return self - - def transform(): - pass \ No newline at end of file diff --git a/feature_engine/discretisation/tests_binarizer/test_binarizer.py b/feature_engine/discretisation/tests_binarizer/test_binarizer.py deleted file mode 100644 index e1f09bb5a..000000000 --- a/feature_engine/discretisation/tests_binarizer/test_binarizer.py +++ /dev/null @@ -1,11 +0,0 @@ -import numpy as np -import pandas as pd - -from feature_engine.discretisation.binarizer import Binarizer - -np.random.seed(42) -X = pd.DataFrame(dict(x = np.random.randint(1, 100, 100))) - -b = Binarizer(threshold=200, variables=['x']) - -b.fit(X) From 5b1927d3b829ff5a38b90a74451cdc3d5cd09cc4 Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 16:10:50 +0100 Subject: [PATCH 03/15] Initial commit of Binariser class --- feature_engine/discretisation/binariser.py | 233 +++++++++++++++++++++ 1 file changed, 233 insertions(+) create mode 100644 feature_engine/discretisation/binariser.py diff --git a/feature_engine/discretisation/binariser.py b/feature_engine/discretisation/binariser.py new file mode 100644 index 000000000..cf8f98373 --- /dev/null +++ b/feature_engine/discretisation/binariser.py @@ -0,0 +1,233 @@ +from typing import List, Optional, Union + +import numpy as np +import pandas as pd + +from feature_engine._check_init_parameters.check_variables import ( + _check_variables_input_value, +) +from feature_engine._docstrings.fit_attributes import ( + 
_binner_dict_docstring, + _feature_names_in_docstring, + _n_features_in_docstring, + _variables_attribute_docstring, +) +from feature_engine._docstrings.init_parameters.all_trasnformers import ( + _variables_numerical_docstring, +) +from feature_engine._docstrings.init_parameters.discretisers import ( + _precision_docstring, + _return_boundaries_docstring, + _return_object_docstring, +) +from feature_engine._docstrings.methods import ( + _fit_discretiser_docstring, + _fit_transform_docstring, + _transform_discretiser_docstring, +) +from feature_engine._docstrings.substitute import Substitution +from feature_engine.discretisation.base_discretiser import BaseDiscretiser + + +@Substitution( + return_object=_return_object_docstring, + return_boundaries=_return_boundaries_docstring, + precision=_precision_docstring, + binner_dict_=_binner_dict_docstring, + fit=_fit_discretiser_docstring, + transform=_transform_discretiser_docstring, + variables=_variables_numerical_docstring, + variables_=_variables_attribute_docstring, + feature_names_in_=_feature_names_in_docstring, + n_features_in_=_n_features_in_docstring, + fit_transform=_fit_transform_docstring, +) +class Binariser(BaseDiscretiser): + """ + The Binariser() divides continuous numerical variables into two intervals, where + the value `threshold`, the point at which the interval is divided, is determined + by the user. + + The Binariser() works only with numerical variables. + A list of variables can be passed as argument. Alternatively, the discretiser + will automatically select all numerical variables. + + The Binariser() first finds the boundaries for the intervals for + each variable. Then, it transforms the variables, that is, sorts the values into + the intervals. + + More details in the :ref:`User Guide `. + + Parameters + ---------- + {variables} + + threshold: int, float, default=None + Desired value at which to divide the interval. 
+ + {return_object} + + {return_boundaries} + + {precision} + + Attributes + ---------- + {binner_dict_} + + {variables_} + + {feature_names_in_} + + {n_features_in_} + + Methods + ------- + {fit} + + {fit_transform} + + {transform} + + See Also + -------- + pandas.cut + sklearn.preprocessing.KBinsDiscretizer + + References + ---------- + .. [1] Kotsiantis and Pintelas, "Data preprocessing for supervised leaning," + International Journal of Computer Science, vol. 1, pp. 111 117, 2006. + + .. [2] Dong. "Beating Kaggle the easy way". Master Thesis. + https://www.ke.tu-darmstadt.de/lehre/arbeiten/studien/2015/Dong_Ying.pdf + + Examples + -------- + + >>> import pandas as pd + >>> import numpy as np + >>> from feature_engine.discretisation import EqualWidthDiscretiser + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.randint(1,100, 100))) + >>> transformer = Binariser(threshold=50) + >>> transformer.fit(X) + >>> transformer.transform(X)['x'].value_counts() + x + 1 56 + 0 44 + Name: count, dtype: int64 + """ + + def __init__( + self, + threshold: Union[None, int, float] = None, + variables: Union[None, int, str, List[Union[str, int]]] = None, + return_object: bool = False, + return_boundaries: bool = False, + precision: int = 3, + ) -> None: + + if threshold is None: + raise TypeError( + "threshold not supplied." + " Please provide a threshold of type float or int." + ) + + if not isinstance(threshold, (int, float)): + raise TypeError( + "threshold must be an integer or a float." + f" Got type '{type(threshold).__name__}' instead." + ) + + super().__init__(return_object, return_boundaries, precision) + + self.variables = _check_variables_input_value(variables) + self.threshold = threshold + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + Learn the boundaries of the bins for each + variable. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training dataset. 
Can be the entire dataframe, not just the variables + to be transformed. + y: None + y is not needed in this encoder. You can pass y or None. + """ + + # check input dataframe + X = super().fit(X) + + failed_threshold_check = [] + self.binner_dict_ = {} + for var in self.variables_: + # Check that threshold is within range + if (self.threshold < min(X[var])) or (self.threshold > max(X[var])): + # Omit these features from transformation step + failed_threshold_check.append(var) + else: + self.binner_dict_[var] = [ + float("-inf"), + np.float64(self.threshold), + float("inf"), + ] + + if failed_threshold_check: + print( + "threshold outside of range for one or more variables." + f" Features {failed_threshold_check} have not been transformed." + ) + + # A list of features that satisfy threshold check and will be transformed + self.variables_trans_ = [ + var for var in self.variables_ if var not in failed_threshold_check + ] + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """Sort the variable values into the intervals. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: pandas dataframe of shape = [n_samples, n_features] + The transformed data with the discrete variables. 
+ """ + + # check input dataframe and if class was fitted + X = self._check_transform_input_and_state(X) + + # transform variables + if self.return_boundaries is True: + for feature in self.variables_trans_: + X[feature] = pd.cut( + X[feature], + self.binner_dict_[feature], + precision=self.precision, + include_lowest=True, + ) + X[self.variables_trans_] = X[self.variables_trans_].astype(str) + + else: + for feature in self.variables_trans_: + X[feature] = pd.cut( + X[feature], + self.binner_dict_[feature], + labels=False, + include_lowest=True, + ) + + # return object + if self.return_object: + X[self.variables_trans_] = X[self.variables_trans_].astype("O") + + return X From c2b138aa597088fd6a44382d4c489d8c29af7790 Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 16:11:43 +0100 Subject: [PATCH 04/15] Committing tests --- tests/test_discretisation/test_binariser.py | 84 +++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tests/test_discretisation/test_binariser.py diff --git a/tests/test_discretisation/test_binariser.py b/tests/test_discretisation/test_binariser.py new file mode 100644 index 000000000..b84c862ad --- /dev/null +++ b/tests/test_discretisation/test_binariser.py @@ -0,0 +1,84 @@ +import numpy as np +import pytest +from sklearn.exceptions import NotFittedError + +from feature_engine.discretisation.binariser import Binariser + + +def test_automatically_find_variables_and_return_as_numeric(df_normal_dist): + # test case 1: automatically select variables, return_object=False + transformer = Binariser(threshold=0, variables=None, return_object=False) + X = transformer.fit_transform(df_normal_dist) + + # transform input + Xt = np.where(df_normal_dist["var"] > 0, 1, 0) + bins = [float("-inf"), np.float64(0), float("inf")] + + # init params + assert transformer.threshold == 0 + assert transformer.variables is None + assert transformer.return_object is False + # fit params + assert transformer.variables_ == ["var"] + assert 
transformer.n_features_in_ == 1 + assert transformer.binner_dict_["var"] == bins + # check transformed output against Xt + assert all(x == y for x, y in zip(X["var"].values, Xt)) + + +def test_automatically_find_variables_and_return_as_object(df_normal_dist): + transformer = Binariser(threshold=0, variables=None, return_object=True) + X = transformer.fit_transform(df_normal_dist) + assert X["var"].dtypes == "O" + + +def test_error_when_threshold_not_int_or_float(): + with pytest.raises(TypeError): + Binariser(threshold="other") + + +def test_error_when_threshold_not_supplied(): + with pytest.raises(TypeError): + Binariser() + + +def test_error_if_return_object_not_bool(): + with pytest.raises(ValueError): + Binariser(threshold=0, return_object="other") + + +def test_error_if_input_df_contains_na_in_fit(df_na): + # test case 3: when dataset contains na, fit method + with pytest.raises(ValueError): + transformer = Binariser(threshold=0) + transformer.fit(df_na) + + +def test_error_if_input_df_contains_na_in_transform(df_vartypes, df_na): + # test case 4: when dataset contains na, transform method + with pytest.raises(ValueError): + transformer = Binariser(threshold=0) + transformer.fit(df_vartypes) + transformer.transform(df_na[["Name", "City", "Age", "Marks", "dob"]]) + + +def test_non_fitted_error(df_vartypes): + with pytest.raises(NotFittedError): + transformer = Binariser(threshold=0) + transformer.transform(df_vartypes) + + +def test_stout_threshold_out_of_range(df_vartypes, capsys): + transformer = Binariser(threshold=20, variables=None, return_object=False) + _ = transformer.fit_transform(df_vartypes[["Age", "Marks"]]) + captured = capsys.readouterr() + assert ( + captured.out + == "threshold outside of range for one or more variables. 
Features ['Marks'] have not been transformed.\n" + ) + + +def test_return_boundaries(df_normal_dist): + transformer = Binariser(threshold=0, return_boundaries=True) + Xt = transformer.fit_transform(df_normal_dist) + assert all(x for x in df_normal_dist["var"].unique() if x not in Xt) From 008d495f4599806690fdea6cf8798675382db04c Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 16:13:13 +0100 Subject: [PATCH 05/15] typo: fixing typo in DecisionTreeDiscretiser --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8f01bfd1c..afbceb101 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ Please share your story by answering 1 quick question * EqualWidthDiscretiser * GeometricWidthDiscretiser * DecisionTreeDiscretiser -* ArbitraryDiscreriser +* ArbitraryDiscretiser ### Outlier Handling methods * Winsorizer From 994f119036c2abb05f7078b4c62a00906702f8b2 Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 16:14:00 +0100 Subject: [PATCH 06/15] Adding Binariser to list of transformers --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index afbceb101..52d448bf1 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ Please share your story by answering 1 quick question * GeometricWidthDiscretiser * DecisionTreeDiscretiser * ArbitraryDiscretiser +* Binariser ### Outlier Handling methods * Winsorizer From 3a85d33702d95311b717264b3dfc8e64cbed8151 Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 16:18:21 +0100 Subject: [PATCH 07/15] Adding Binariser to the index --- docs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.rst b/docs/index.rst index e30eb7eb0..29962c2a6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -183,6 +183,7 @@ discretization with decision trees: - :doc:`api_doc/discretisation/EqualWidthDiscretiser`: sorts variable into equal width intervals - :doc:`api_doc/discretisation/DecisionTreeDiscretiser`: uses decision 
trees to create finite variables - :doc:`api_doc/discretisation/GeometricWidthDiscretiser`: sorts variable into geometrical intervals +- :doc:`api_doc/discretisation/Binariser`: two intervals determined by a threshold Outlier Capping or Removal ~~~~~~~~~~~~~~~~~~~~~~~~~~ From 40bd686f73ed331e108f6a21e54861b3b13d761f Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 16:22:57 +0100 Subject: [PATCH 08/15] Adding Binariser to the API index --- docs/api_doc/discretisation/Binariser.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/api_doc/discretisation/Binariser.rst diff --git a/docs/api_doc/discretisation/Binariser.rst b/docs/api_doc/discretisation/Binariser.rst new file mode 100644 index 000000000..e2445c04b --- /dev/null +++ b/docs/api_doc/discretisation/Binariser.rst @@ -0,0 +1,5 @@ +Binariser +========= + +.. autoclass:: feature_engine.discretisation.Binariser + :members: From 76750585bb963ddaf380ea24a7b67905dfff16fc Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 16:58:49 +0100 Subject: [PATCH 09/15] Renaming Binariser to BinaryDiscretiser to avoid naming conflicts and for consistency with other discretisers --- feature_engine/discretisation/__init__.py | 2 ++ feature_engine/discretisation/binariser.py | 10 +++++----- tests/test_discretisation/test_binariser.py | 22 ++++++++++----------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/feature_engine/discretisation/__init__.py b/feature_engine/discretisation/__init__.py index 5016d1aa9..9d1e602e6 100644 --- a/feature_engine/discretisation/__init__.py +++ b/feature_engine/discretisation/__init__.py @@ -8,6 +8,7 @@ from .equal_frequency import EqualFrequencyDiscretiser from .equal_width import EqualWidthDiscretiser from .geometric_width import GeometricWidthDiscretiser +from .binariser import BinaryDiscretiser __all__ = [ "DecisionTreeDiscretiser", @@ -15,4 +16,5 @@ "EqualWidthDiscretiser", "ArbitraryDiscretiser", "GeometricWidthDiscretiser", + 
"BinaryDiscretiser", ] diff --git a/feature_engine/discretisation/binariser.py b/feature_engine/discretisation/binariser.py index cf8f98373..a78223ddd 100644 --- a/feature_engine/discretisation/binariser.py +++ b/feature_engine/discretisation/binariser.py @@ -42,17 +42,17 @@ n_features_in_=_n_features_in_docstring, fit_transform=_fit_transform_docstring, ) -class Binariser(BaseDiscretiser): +class BinaryDiscretiser(BaseDiscretiser): """ - The Binariser() divides continuous numerical variables into two intervals, where + The BinaryDiscretiser() divides continuous numerical variables into two intervals, where the value `threshold`, the point at which the interval is divided, is determined by the user. - The Binariser() works only with numerical variables. + The BinaryDiscretiser() works only with numerical variables. A list of variables can be passed as argument. Alternatively, the discretiser will automatically select all numerical variables. - The Binariser() first finds the boundaries for the intervals for + The BinaryDiscretiser() first finds the boundaries for the intervals for each variable. Then, it transforms the variables, that is, sorts the values into the intervals. 
@@ -110,7 +110,7 @@ class Binariser(BaseDiscretiser): - >>> from feature_engine.discretisation import EqualWidthDiscretiser + >>> from feature_engine.discretisation import BinaryDiscretiser >>> np.random.seed(42) >>> X = pd.DataFrame(dict(x = np.random.randint(1,100, 100))) - >>> transformer = Binariser(threshold=50) + >>> transformer = BinaryDiscretiser(threshold=50) >>> transformer.fit(X) >>> transformer.transform(X)['x'].value_counts() x diff --git a/tests/test_discretisation/test_binariser.py b/tests/test_discretisation/test_binariser.py index b84c862ad..bc0dda892 100644 --- a/tests/test_discretisation/test_binariser.py +++ b/tests/test_discretisation/test_binariser.py @@ -2,12 +2,12 @@ import pytest from sklearn.exceptions import NotFittedError -from feature_engine.discretisation.binariser import Binariser +from feature_engine.discretisation import BinaryDiscretiser def test_automatically_find_variables_and_return_as_numeric(df_normal_dist): # test case 1: automatically select variables, return_object=False - transformer = Binariser(threshold=0, variables=None, return_object=False) + transformer = BinaryDiscretiser(threshold=0, variables=None, return_object=False) X = transformer.fit_transform(df_normal_dist) # transform input @@ -27,49 +27,49 @@ def test_automatically_find_variables_and_return_as_numeric(df_normal_dist): def test_automatically_find_variables_and_return_as_object(df_normal_dist): - transformer = Binariser(threshold=0, variables=None, return_object=True) + transformer = BinaryDiscretiser(threshold=0, variables=None, return_object=True) X = transformer.fit_transform(df_normal_dist) assert X["var"].dtypes == "O" def test_error_when_threshold_not_int_or_float(): with pytest.raises(TypeError): - Binariser(threshold="other") + BinaryDiscretiser(threshold="other") def test_error_when_threshold_not_supplied(): with pytest.raises(TypeError): - Binariser() + BinaryDiscretiser() def test_error_if_return_object_not_bool(): with pytest.raises(ValueError): - Binariser(threshold=0, return_object="other") +
BinaryDiscretiser(threshold=0, return_object="other") def test_error_if_input_df_contains_na_in_fit(df_na): # test case 3: when dataset contains na, fit method with pytest.raises(ValueError): - transformer = Binariser(threshold=0) + transformer = BinaryDiscretiser(threshold=0) transformer.fit(df_na) def test_error_if_input_df_contains_na_in_transform(df_vartypes, df_na): # test case 4: when dataset contains na, transform method with pytest.raises(ValueError): - transformer = Binariser(threshold=0) + transformer = BinaryDiscretiser(threshold=0) transformer.fit(df_vartypes) transformer.transform(df_na[["Name", "City", "Age", "Marks", "dob"]]) def test_non_fitted_error(df_vartypes): with pytest.raises(NotFittedError): - transformer = Binariser(threshold=0) + transformer = BinaryDiscretiser(threshold=0) transformer.transform(df_vartypes) def test_stout_threshold_out_of_range(df_vartypes, capsys): - transformer = Binariser(threshold=20, variables=None, return_object=False) + transformer = BinaryDiscretiser(threshold=20, variables=None, return_object=False) _ = transformer.fit_transform(df_vartypes[["Age", "Marks"]]) captured = capsys.readouterr() assert ( @@ -79,6 +79,6 @@ def test_stout_threshold_out_of_range(df_vartypes, capsys): def test_return_boundaries(df_normal_dist): - transformer = Binariser(threshold=0, return_boundaries=True) + transformer = BinaryDiscretiser(threshold=0, return_boundaries=True) Xt = transformer.fit_transform(df_normal_dist) assert all(x for x in df_normal_dist["var"].unique() if x not in Xt) From 9d3efab428a00e6660a229a9b46a49193303d067 Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 17:00:58 +0100 Subject: [PATCH 10/15] Updating to BinaryDiscretiser --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 52d448bf1..594566859 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ Please share your story by answering 1 quick question * GeometricWidthDiscretiser * 
DecisionTreeDiscretiser * ArbitraryDiscretiser -* Binariser +* BinaryDiscretiser ### Outlier Handling methods * Winsorizer From 85c7831d0cb57b87fc33a33d6613324f461cf6be Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 17:02:28 +0100 Subject: [PATCH 11/15] Updating to BinaryDiscretiser --- docs/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 29962c2a6..fa8e15668 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -183,7 +183,7 @@ discretization with decision trees: - :doc:`api_doc/discretisation/EqualWidthDiscretiser`: sorts variable into equal width intervals - :doc:`api_doc/discretisation/DecisionTreeDiscretiser`: uses decision trees to create finite variables - :doc:`api_doc/discretisation/GeometricWidthDiscretiser`: sorts variable into geometrical intervals -- :doc:`api_doc/discretisation/Binariser`: two intervals determined by a threshold +- :doc:`api_doc/discretisation/BinaryDiscretiser`: sorts variable into two intervals determined by a threshold Outlier Capping or Removal ~~~~~~~~~~~~~~~~~~~~~~~~~~ From 0436a69e3b38304529b4c8ebe7040b7ef569cad2 Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 17:04:37 +0100 Subject: [PATCH 12/15] Removing renamed file --- docs/api_doc/discretisation/Binariser.rst | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 docs/api_doc/discretisation/Binariser.rst diff --git a/docs/api_doc/discretisation/Binariser.rst b/docs/api_doc/discretisation/Binariser.rst deleted file mode 100644 index e2445c04b..000000000 --- a/docs/api_doc/discretisation/Binariser.rst +++ /dev/null @@ -1,5 +0,0 @@ -Binariser -========= - -.. 
autoclass:: feature_engine.discretisation.Binariser - :members: From bf85e5510cd7ea0b92b491e393a1426193f37729 Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 17:05:05 +0100 Subject: [PATCH 13/15] Adding BinaryDiscretiser to api docs --- docs/api_doc/discretisation/BinaryDiscretiser.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/api_doc/discretisation/BinaryDiscretiser.rst diff --git a/docs/api_doc/discretisation/BinaryDiscretiser.rst b/docs/api_doc/discretisation/BinaryDiscretiser.rst new file mode 100644 index 000000000..53d0d14e8 --- /dev/null +++ b/docs/api_doc/discretisation/BinaryDiscretiser.rst @@ -0,0 +1,5 @@ +BinaryDiscretiser +================= + +.. autoclass:: feature_engine.discretisation.BinaryDiscretiser + :members: From 2ee9f2265abe39604d96fa9bbfb2f949517f9062 Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 17:08:02 +0100 Subject: [PATCH 14/15] Adding BinaryDiscretiser --- docs/api_doc/discretisation/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/api_doc/discretisation/index.rst b/docs/api_doc/discretisation/index.rst index 75c484e9d..9fad0bd14 100644 --- a/docs/api_doc/discretisation/index.rst +++ b/docs/api_doc/discretisation/index.rst @@ -18,6 +18,7 @@ into continuous intervals. :class:`ArbitraryDiscretiser()` Sorts values into intervals predefined by the user. :class:`DecisionTreeDiscretiser()` Replaces values by predictions of a decision tree, which are discrete. :class:`GeometricWidthDiscretiser()` Sorts variable into geometrical intervals. +:class:`BinaryDiscretiser()` Sorts variable into two intervals determined by a threshold. ===================================== ======================================================================== From ba71711cd4a253f8a8c5baafe213a72cf640de21 Mon Sep 17 00:00:00 2001 From: Doug Date: Thu, 15 Aug 2024 17:26:36 +0100 Subject: [PATCH 15/15] typo: Fixing typo. 
--- feature_engine/discretisation/binariser.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/feature_engine/discretisation/binariser.py b/feature_engine/discretisation/binariser.py index a78223ddd..d6e41570f 100644 --- a/feature_engine/discretisation/binariser.py +++ b/feature_engine/discretisation/binariser.py @@ -44,9 +44,9 @@ ) class BinaryDiscretiser(BaseDiscretiser): """ - The BinaryDiscretiser() divides continuous numerical variables into two intervals, where - the value `threshold`, the point at which the interval is divided, is determined - by the user. + The BinaryDiscretiser() divides continuous numerical variables into two intervals, + where the value `threshold`, the point at which the interval is divided, is + determined by the user. The BinaryDiscretiser() works only with numerical variables. A list of variables can be passed as argument. Alternatively, the discretiser @@ -56,8 +56,6 @@ class BinaryDiscretiser(BaseDiscretiser): each variable. Then, it transforms the variables, that is, sorts the values into the intervals. - More details in the :ref:`User Guide `. - Parameters ---------- {variables}