Skip to content

Commit

Permalink
Merge pull request #6534 from noahnovsak/dask-pca-fix
Browse files: browse the repository at this point in the history
PCA: distinguish between PCA and IncrementalPCA
  • Loading branch information
markotoplak committed Sep 14, 2023
2 parents 7540a74 + 645e51e commit 99bde1b
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 13 deletions.
28 changes: 18 additions & 10 deletions Orange/projection/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
from sklearn.utils.extmath import svd_flip, safe_sparse_dot
from sklearn.utils.validation import check_is_fitted

try:
import dask_ml.decomposition as dask_decomposition
except ImportError:
dask_decomposition = skl_decomposition

import Orange.data
from Orange.statistics import util as ut
from Orange.data import Variable
Expand Down Expand Up @@ -275,18 +280,13 @@ def _initialize_wrapped(self, X=None, Y=None):
params["n_components"] = min(*X.shape, params["n_components"])

if isinstance(X, da.Array) or isinstance(Y, da.Array):
try:
import dask_ml.decomposition as dask_decomposition

if dask_decomposition is skl_decomposition:
warnings.warn("dask_ml is not installed. Using sklearn instead.")
else:
if params["iterated_power"] == "auto":
params["iterated_power"] = 0
del params["tol"]

# use IPCA instead of PCA due to memory issues
return dask_decomposition.IncrementalPCA(**params)

except ImportError:
warnings.warn("dask_ml is not installed. Using sklearn instead.")
return dask_decomposition.PCA(**params)

return self.__wraps__(**params)

Expand Down Expand Up @@ -340,8 +340,16 @@ def __init__(self, n_components=None, whiten=False, copy=True,
super().__init__(preprocessors=preprocessors)
self.params = vars()

def _initialize_wrapped(self, X=None, Y=None):
    """Instantiate the estimator that should be fitted on ``X`` / ``Y``.

    When either input is a ``da.Array`` and ``dask_decomposition`` is not
    merely the ``skl_decomposition`` fallback, an ``IncrementalPCA`` from
    ``dask_decomposition`` is returned. When the input is a ``da.Array`` but
    only the fallback is available, a warning is issued and the estimator
    from ``__wraps__`` is used. In all other cases ``__wraps__`` is used
    silently. The estimator is always built from ``self.params``.
    """
    has_dask_input = any(isinstance(arr, da.Array) for arr in (X, Y))
    dask_backend_missing = dask_decomposition is skl_decomposition

    if has_dask_input and not dask_backend_missing:
        return dask_decomposition.IncrementalPCA(**self.params)

    if has_dask_input:
        # Dask data but no dask backend: fall through to the wrapped class.
        warnings.warn("dask_ml is not installed. Using sklearn instead.")

    return self.__wraps__(**self.params)

def fit(self, X, Y=None):
    """Fit the incremental PCA estimator and wrap the result in a model.

    The estimator is obtained solely through ``_initialize_wrapped`` so the
    backend selected there for the given ``X`` / ``Y`` is the one that is
    fitted; no separate ``__wraps__`` instance is created and discarded.

    Parameters
    ----------
    X : array-like
        Data handed to ``_initialize_wrapped`` and to the estimator's ``fit``.
    Y : array-like, optional
        Passed through unchanged alongside ``X``.

    Returns
    -------
    IncrementalPCAModel
        Built from the fitted estimator, ``self.domain`` and the number of
        fitted components (``len(proj.components_)``).
    """
    proj = self._initialize_wrapped(X, Y)
    proj = proj.fit(X, Y)
    return IncrementalPCAModel(proj, self.domain, len(proj.components_))

Expand Down
7 changes: 4 additions & 3 deletions Orange/tests/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,9 @@ def test_improved_randomized_pca_sparse_data(self):

@unittest.skipIf(sklearn_version.startswith('0.20'),
                 "https://github.com/scikit-learn/scikit-learn/issues/12234")
@with_dasktable
def test_incremental_pca(self, prepare_table):
    """Run the incremental-PCA helper checks on the ionosphere data.

    A single definition carries both decorators, so the version-based skip
    applies to the test that actually runs rather than to a shadowed
    duplicate.
    """
    # NOTE(review): presumably `with_dasktable` supplies `prepare_table` to
    # convert the table into each variant under test -- confirm against its
    # definition.
    data = prepare_table(self.ionosphere)
    self.__ipca_test_helper(data, n_com=3, min_xpl_var=0.49)
    self.__ipca_test_helper(data, n_com=32, min_xpl_var=1)

Expand All @@ -172,7 +173,7 @@ def __ipca_test_helper(self, data, n_com, min_xpl_var):
self.assertEqual(n_com, pca_model.n_components)
self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
np.testing.assert_almost_equal(pca_model(data).X, proj)
np.testing.assert_almost_equal(pca_model(data).X, np.asarray(proj))
pc1_ipca = pca_model.components_[0]
self.assertAlmostEqual(np.linalg.norm(pc1_ipca), 1)
pc1_pca = PCA(n_components=n_com)(data).components_[0]
Expand Down

0 comments on commit 99bde1b

Please sign in to comment.