From 49066afbdcb1112fd430085dcfb06dc36941300d Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:05:53 +0100 Subject: [PATCH] Provide clean error message when running scrublet on an empty array (#929) --- CHANGELOG.md | 2 + .../filter_with_scrublet/config.vsh.yaml | 6 +- src/filter/filter_with_scrublet/script.py | 6 + src/filter/filter_with_scrublet/test.py | 187 +++++++++++------- 4 files changed, 127 insertions(+), 74 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c7811a1f5d..508a6c2c9a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ ## MINOR CHANGES +* `filter/filter_with_scrublet`: provide cleaner error message when running scrublet on an empty modality (PR #929). + * Several component (cleanup): remove workaround for using being able to use shared utility functions with Nextflow Fusion (PR #920). * Several annotation (`src/annotate/`) components (`onclass`, `celltypist`, `random_forest_annotation`, `scanvi`, `svm_annotation`): Updated input parameteres to ensure uniformity across components, implemented functionality to cross-check the overlap of genes between query and reference (model) datasets and implemented logic to allow for subsetting of genes (PR #919). diff --git a/src/filter/filter_with_scrublet/config.vsh.yaml b/src/filter/filter_with_scrublet/config.vsh.yaml index d4d091460d0..9b0004cf6c1 100644 --- a/src/filter/filter_with_scrublet/config.vsh.yaml +++ b/src/filter/filter_with_scrublet/config.vsh.yaml @@ -131,10 +131,8 @@ engines: __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] packages: - scrublet - - annoy==1.16.3 - test_setup: - - type: python - __merge__: [ /src/base/requirements/viashpy.yaml, .] + - annoy==1.17.3 + __merge__: [/src/base/requirements/python_test_setup.yaml, .] runners: - type: executable diff --git a/src/filter/filter_with_scrublet/script.py b/src/filter/filter_with_scrublet/script.py index 6900135dbe8..8aba6db53cf 100644 --- a/src/filter/filter_with_scrublet/script.py +++ b/src/filter/filter_with_scrublet/script.py @@ -44,6 +44,12 @@ logger.info("Using layer '%s'.", "X" if not par["layer"] else par["layer"]) input_layer = data.X if not par["layer"] else data.layers[par["layer"]] +if 0 in input_layer.shape: + raise ValueError( + f"Modality {mod} of input Mudata {par['input']} appears " + f"to be empty (shape: {input_layer.shape})." + ) + logger.info("\tRunning scrublet") scrub = scr.Scrublet(input_layer) diff --git a/src/filter/filter_with_scrublet/test.py b/src/filter/filter_with_scrublet/test.py index 8d3899e3abe..fe9f3c0e287 100644 --- a/src/filter/filter_with_scrublet/test.py +++ b/src/filter/filter_with_scrublet/test.py @@ -7,40 +7,74 @@ import numpy as np import pandas as pd import anndata as ad -from scipy.sparse import csr_matrix +from scipy.sparse import csr_matrix, csr_array ## VIASH START meta = { "name": "foo", "resources_dir": "resources_test/", "executable": "target/executable/filter/filter_with_scrublet/filter_with_scrublet", + "config": "./src/filter/filter_with_scrublet/config.vsh.yaml", } -# def run_component(args_as_list): -# try: -# subprocess_args = [meta['executable']] + args_as_list -# print(" ".join(subprocess_args), flush=True) -# subprocess.check_output(subprocess_args, stderr=subprocess.STDOUT) -# except subprocess.CalledProcessError as e: -# print(e.stdout.decode("utf-8"), flush=True) -# raise e ## VIASH END # read input file -input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" -input_mu = mu.read_h5mu(input_path) -orig_obs = input_mu.mod["rna"].n_obs -orig_vars = input_mu.mod["rna"].n_vars -orig_prot_obs = input_mu.mod["prot"].n_obs -orig_prot_vars = input_mu.mod["prot"].n_vars -def test_filter_a_little_bit(run_component): - output_mu = "output-1.h5mu" +@pytest.fixture +def input_mudata_path(): + return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +@pytest.fixture +def input_mudata(input_mudata_path): + return mu.read_h5mu(input_mudata_path) + + +@pytest.fixture +def input_with_failed_run(random_h5mu_path, input_mudata_path): + new_mudata_path = random_h5mu_path() + + mudata_in = mu.read_h5mu(input_mudata_path) + + # Make test reproducable + np.random.seed(4) + + # Simulate a failed scrublet run by passing very little cells + mudata = mudata_in[152].copy() + nobs = 100 + x_data = np.repeat(mudata.mod["rna"].X.todense(), nobs, axis=0) + + # Random perturbations because otherwise the detection fails in other ways (PCA cannot be run) + replace_rate = 0.000001 + mask = np.random.choice( + [0, 1], size=x_data.shape, p=((1 - replace_rate), replace_rate) + ).astype("bool") + r = np.random.rand(*x_data.shape) * np.max(x_data) + x_data[mask] = r[mask] + + # create obs + obs_name = mudata.mod["rna"].obs.index.to_list()[0] + obs_data = pd.DataFrame([], index=[f"{obs_name}_{i}" for i in range(nobs)]) + + # create resulting mudata + mod = ad.AnnData(X=csr_matrix(x_data), obs=obs_data, var=mudata.mod["rna"].var) + new_mudata = mu.MuData({"rna": mod}) + new_mudata.update() + new_mudata.write(new_mudata_path) + + return new_mudata_path + + +def test_filter_a_little_bit( + run_component, random_h5mu_path, input_mudata_path, input_mudata +): + output_mu = random_h5mu_path() run_component( [ "--input", - input_path, + input_mudata_path, "--output", output_mu, "--min_counts", @@ -56,13 +90,17 @@ def test_filter_a_little_bit(run_component): new_obs = mu_out.mod["rna"].n_obs new_vars = mu_out.mod["rna"].n_vars - assert new_obs == orig_obs, "No RNA obs should have been filtered" - assert new_vars == orig_vars, "No RNA vars should have been filtered" assert ( - mu_out.mod["prot"].n_obs == orig_prot_obs + new_obs == input_mudata.mod["rna"].n_obs + ), "No RNA obs should have been filtered" + assert ( + new_vars == input_mudata.mod["rna"].n_vars + ), "No RNA vars should have been filtered" + assert ( + mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs ), "No prot obs should have been filtered" assert ( - mu_out.mod["prot"].n_vars == orig_prot_vars + mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars ), "No prot vars should have been filtered" assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ "Gene Expression" @@ -72,13 +110,15 @@ def test_filter_a_little_bit(run_component): ], "Feature types of prot modality should be Antibody Capture" -def test_filtering_a_lot(run_component): - output_mu = "output-2.h5mu" +def test_filtering_a_lot( + run_component, random_h5mu_path, input_mudata_path, input_mudata +): + output_mu = random_h5mu_path() run_component( [ "--input", - input_path, + input_mudata_path, "--output", output_mu, "--modality", @@ -95,11 +135,17 @@ def test_filtering_a_lot(run_component): mu_out = mu.read_h5mu(output_mu) new_obs = mu_out.mod["rna"].n_obs new_vars = mu_out.mod["rna"].n_vars - assert new_obs < orig_obs, "Some cells should have been filtered" - assert new_vars == orig_vars, "No genes should have been filtered" - assert mu_out.mod["prot"].n_obs == orig_obs, "No prot obs should have been filtered" assert ( - mu_out.mod["prot"].n_vars == orig_prot_vars + new_obs < input_mudata.mod["rna"].n_obs + ), "Some cells should have been filtered" + assert ( + new_vars == input_mudata.mod["rna"].n_vars + ), "No genes should have been filtered" + assert ( + mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs + ), "No prot obs should have been filtered" + assert ( + mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars ), "No prot vars should have been filtered" assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ "Gene Expression" @@ -109,49 +155,42 @@ def test_filtering_a_lot(run_component): ], "Feature types of prot modality should be Antibody Capture" -@pytest.fixture(scope="module") -def input_with_failed_run(): - new_mudata_path = "pbmc-perturbed.h5mu" - - mudata_in = mu.read_h5mu(input_path) - - # Make test reproducable - np.random.seed(4) - - # Simulate a failed scrublet run by passing very little cells - mudata = mudata_in[152].copy() - nobs = 100 - x_data = np.repeat(mudata.mod["rna"].X.todense(), nobs, axis=0) - - # Random perturbations because otherwise the detection fails in other ways (PCA cannot be run) - replace_rate = 0.000001 - mask = np.random.choice( - [0, 1], size=x_data.shape, p=((1 - replace_rate), replace_rate) - ).astype("bool") - r = np.random.rand(*x_data.shape) * np.max(x_data) - x_data[mask] = r[mask] - - # create obs - obs_name = mudata.mod["rna"].obs.index.to_list()[0] - obs_data = pd.DataFrame([], index=[f"{obs_name}_{i}" for i in range(nobs)]) - - # create resulting mudata - mod = ad.AnnData(X=csr_matrix(x_data), obs=obs_data, var=mudata.mod["rna"].var) - new_mudata = mu.MuData({"rna": mod}) - new_mudata.update() - new_mudata.write(new_mudata_path) +def test_empty_mudata(run_component, random_h5mu_path): + output_mu = random_h5mu_path() + empty_mudata_path = random_h5mu_path() + empty_mudata = mu.MuData( + { + modality: ad.AnnData(csr_array((5, 0), dtype=np.int8)) + for modality in ("rna",) + } + ) - return new_mudata_path + empty_mudata.write(empty_mudata_path) + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + empty_mudata_path, + "--output", + output_mu, + "--output_compression", + "gzip", + ] + ) + assert re.search( + "ValueError: Modality rna of input Mudata .* appears to be empty", + err.value.stdout.decode("utf-8"), + ) @pytest.mark.xfail(strict=False) def test_doublet_automatic_threshold_detection_fails( - run_component, input_with_failed_run + run_component, input_with_failed_run, random_h5mu_path ): """ Test if the component fails if doublet score threshold could not automatically be set """ - output_mu = "output-4.h5mu" + output_mu = random_h5mu_path() with pytest.raises(subprocess.CalledProcessError) as e_info: run_component( @@ -217,9 +256,11 @@ def test_doublet_automatic_threshold_detection_fails_recovery( assert mu_out.mod["rna"].obs["filter_with_scrublet"].isna().all() -def test_selecting_input_layer(run_component, tmp_path): - output_mu = "output-2.h5mu" - input_data = mu.read_h5mu(input_path) +def test_selecting_input_layer( + run_component, tmp_path, random_h5mu_path, input_mudata, input_mudata_path +): + output_mu = random_h5mu_path() + input_data = mu.read_h5mu(input_mudata_path) input_data.mod["rna"].layers["test_layer"] = input_data.mod["rna"].X input_data.mod["rna"].X = None @@ -248,11 +289,17 @@ def test_selecting_input_layer(run_component, tmp_path): mu_out = mu.read_h5mu(output_mu) new_obs = mu_out.mod["rna"].n_obs new_vars = mu_out.mod["rna"].n_vars - assert new_obs < orig_obs, "Some cells should have been filtered" - assert new_vars == orig_vars, "No genes should have been filtered" - assert mu_out.mod["prot"].n_obs == orig_obs, "No prot obs should have been filtered" assert ( - mu_out.mod["prot"].n_vars == orig_prot_vars + new_obs < input_mudata.mod["rna"].n_obs + ), "Some cells should have been filtered" + assert ( + new_vars == input_mudata.mod["rna"].n_vars + ), "No genes should have been filtered" + assert ( + mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs + ), "No prot obs should have been filtered" + assert ( + mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars ), "No prot vars should have been filtered" assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ "Gene Expression"