From 2db667f845ff8e91bce926c0336b254bb471f2a7 Mon Sep 17 00:00:00 2001 From: joaopfonseca Date: Sat, 18 Dec 2021 14:44:18 +0000 Subject: [PATCH] MNT add sparse input support and complete documentation (#881) --- imblearn/over_sampling/_smote/geometric.py | 59 ++++++++++++++++++---- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/imblearn/over_sampling/_smote/geometric.py b/imblearn/over_sampling/_smote/geometric.py index a7601a01e..b13d6745b 100644 --- a/imblearn/over_sampling/_smote/geometric.py +++ b/imblearn/over_sampling/_smote/geometric.py @@ -1,12 +1,14 @@ """Class to perform over-sampling using Geometric SMOTE.""" # Author: Georgios Douzas +# Joao Fonseca # License: BSD 3 clause import numpy as np from numpy.linalg import norm +from scipy import sparse from sklearn.utils import check_random_state -from imblearn.over_sampling.base import BaseOverSampler +from ..base import BaseOverSampler from imblearn.utils import check_neighbors_object, Substitution from imblearn.utils._docstring import _random_state_docstring @@ -119,6 +121,33 @@ class GeometricSMOTE(BaseOverSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. + Attributes + ---------- + + sampling_strategy_ : dict + Dictionary containing the information to sample the dataset. The keys + correspond to the class labels from which to sample and the values + are the number of samples to sample. + + n_features_in_ : int + Number of features in the input dataset. + + nns_pos_ : estimator object + Validated k-nearest neighbours created from the `k_neighbors` parameter. It is + used to find the nearest neighbors of the same class of a selected + observation. + + nn_neg_ : estimator object + Validated k-nearest neighbours created from the `k_neighbors` parameter. It is + used to find the nearest neighbor of the remaining classes (k=1) of a selected + observation. 
+ + random_state_ : instance of RandomState + If the `random_state` parameter is None, it is a RandomState singleton used by + np.random. If `random_state` is an int, it is a RandomState instance seeded with + seed. If `random_state` is already a RandomState instance, it is the same + object. + Notes ----- See the original paper: [1]_ for more details. @@ -142,7 +171,8 @@ class GeometricSMOTE(BaseOverSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification - >>> from gsmote import GeometricSMOTE # doctest: +NORMALIZE_WHITESPACE + >>> from imblearn.over_sampling import \ +GeometricSMOTE # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -237,7 +267,7 @@ def _make_geometric_samples(self, X, y, pos_class_label, n_samples): # Force minority strategy if no negative class samples are present self.selection_strategy_ = ( - 'minority' if len(X) == len(X_pos) else self.selection_strategy + 'minority' if X.shape[0] == X_pos.shape[0] else self.selection_strategy ) # Minority or combined strategy @@ -306,19 +336,28 @@ def _fit_resample(self, X, y): # Validate estimator's parameters self._validate_estimator() + # Ensure the input data is dense + X_dense = X.toarray() if sparse.issparse(X) else X + # Copy data - X_resampled, y_resampled = X.copy(), y.copy() + X_resampled, y_resampled = [X_dense.copy()], [y.copy()] # Resample data for class_label, n_samples in self.sampling_strategy_.items(): # Apply gsmote mechanism - X_new, y_new = self._make_geometric_samples(X, y, class_label, n_samples) - - # Append new data - X_resampled, y_resampled = ( - np.vstack((X_resampled, X_new)), - np.hstack((y_resampled, y_new)), + X_new, y_new = self._make_geometric_samples( + X_dense, y, class_label, n_samples ) + X_resampled.append(X_new) + y_resampled.append(y_new) + + # 
Append new data + if sparse.issparse(X): + X_resampled = sparse.vstack(X_resampled, format=X.format) + else: + X_resampled = np.vstack(X_resampled).astype(X.dtype) + y_resampled = np.hstack(y_resampled).astype(y.dtype) + return X_resampled, y_resampled