From fbc2e497ffa01949983ad2ca7c5ca127c1f9cc04 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Fri, 18 Oct 2024 10:59:48 +0200 Subject: [PATCH 1/5] lets umap run in parallel --- src/scanpy/tools/_umap.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/scanpy/tools/_umap.py b/src/scanpy/tools/_umap.py index 4f225da2a1..29fe0db69f 100644 --- a/src/scanpy/tools/_umap.py +++ b/src/scanpy/tools/_umap.py @@ -56,6 +56,7 @@ def umap( key_added: str | None = None, neighbors_key: str = "neighbors", copy: bool = False, + parallel: bool = False, ) -> AnnData | None: """\ Embed the neighborhood graph using UMAP :cite:p:`McInnes2018`. @@ -146,6 +147,8 @@ def umap( :attr:`~anndata.AnnData.obsp`\\ ``[.uns[neighbors_key]['connectivities_key']]`` for connectivities. copy Return a copy instead of writing to adata. + parallel + Whether to run the computation using numba parallel. Running in parallel is non-deterministic, and is not used if a random seed has been set, to ensure reproducibility. Returns ------- @@ -232,6 +235,7 @@ def umap( densmap_kwds={}, output_dens=False, verbose=settings.verbosity > 3, + parallel=parallel, ) elif method == "rapids": msg = ( From 812e63037e85a4e9e2dc4d12e4a7a48d10b62637 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Fri, 18 Oct 2024 12:30:15 +0200 Subject: [PATCH 2/5] add a test to check if parallel with randomstate errors --- tests/test_embedding.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_embedding.py b/tests/test_embedding.py index 692157a084..e58036ff0a 100644 --- a/tests/test_embedding.py +++ b/tests/test_embedding.py @@ -88,3 +88,9 @@ def test_diffmap(): sc.tl.diffmap(pbmc, random_state=1234) d3 = pbmc.obsm["X_diffmap"].copy() assert_raises(AssertionError, assert_array_equal, d1, d3) + + +def test_umap_parallel_randomstate(): + pbmc = pbmc68k_reduced()[:100, :].copy() + sc.tl.umap(pbmc, parallel=True, random_state=42) + sc.tl.umap(pbmc, parallel=True, random_state=np.random.RandomState(42)) From ba6538de6a3b9effb384d1b961c139503bac3ba3 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Fri, 18 Oct 2024 12:35:37 +0200 Subject: [PATCH 3/5] updated docstring --- src/scanpy/tools/_umap.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scanpy/tools/_umap.py b/src/scanpy/tools/_umap.py index 29fe0db69f..e058250eb8 100644 --- a/src/scanpy/tools/_umap.py +++ b/src/scanpy/tools/_umap.py @@ -148,8 +148,7 @@ def umap( copy Return a copy instead of writing to adata. parallel - Whether to run the computation using numba parallel. Running in parallel is non-deterministic, and is not used if a random seed has been set, to ensure reproducibility. - + Whether to run the computation using numba parallel. Running in parallel is non-deterministic. Returns ------- Returns `None` if `copy=False`, else returns an `AnnData` object. Sets the following fields: From d2ab85d308c97a10f1682dfbf5a4732a8bb8ac09 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Fri, 18 Oct 2024 12:48:03 +0200 Subject: [PATCH 4/5] added warning --- src/scanpy/tools/_umap.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/scanpy/tools/_umap.py b/src/scanpy/tools/_umap.py index e058250eb8..2d4dc9c2be 100644 --- a/src/scanpy/tools/_umap.py +++ b/src/scanpy/tools/_umap.py @@ -216,6 +216,12 @@ def umap( # for the init condition in the UMAP embedding default_epochs = 500 if neighbors["connectivities"].shape[0] <= 10000 else 200 n_epochs = default_epochs if maxiter is None else maxiter + if parallel and random_state is not None: + warnings.warn( + "Parallel execution was expected to be disabled when both `parallel=True` and `random_state` are set, " + "to ensure reproducibility. However, parallel execution still seems to occur, which may lead to " + "non-deterministic results." + ) X_umap, _ = simplicial_set_embedding( data=X, graph=neighbors["connectivities"].tocoo(), From 4eba4a84d5a6726a01ef7ca07d4397220ca59c33 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Fri, 18 Oct 2024 14:11:08 +0200 Subject: [PATCH 5/5] update test --- tests/test_embedding.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/tests/test_embedding.py b/tests/test_embedding.py index e58036ff0a..c4377535c2 100644 --- a/tests/test_embedding.py +++ b/tests/test_embedding.py @@ -1,5 +1,7 @@ from __future__ import annotations +import warnings + import numpy as np import pytest from numpy.testing import assert_array_almost_equal, assert_array_equal, assert_raises @@ -90,7 +92,26 @@ def test_diffmap(): assert_raises(AssertionError, assert_array_equal, d1, d3) -def test_umap_parallel_randomstate(): +@pytest.mark.parametrize( + ("random_state", "expect_warning"), + [ + pytest.param(42, True, id="random_state_int"), + pytest.param(np.random.RandomState(42), True, id="random_state_RandomState"), + pytest.param(None, True, id="random_state_None"), + ], +) +def test_umap_parallel_randomstate(random_state, expect_warning): pbmc = pbmc68k_reduced()[:100, :].copy() - sc.tl.umap(pbmc, parallel=True, random_state=42) - sc.tl.umap(pbmc, parallel=True, random_state=np.random.RandomState(42)) + + if expect_warning: + with pytest.warns( + UserWarning, match="Parallel execution was expected to be disabled" + ): + sc.tl.umap(pbmc, parallel=True, random_state=random_state) + else: + # This is case is currently not in use because of lmcinnes/umap#1155 + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + sc.tl.umap(pbmc, parallel=True, random_state=random_state) + + assert len(record) == 0