From 49066afbdcb1112fd430085dcfb06dc36941300d Mon Sep 17 00:00:00 2001
From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Wed, 11 Dec 2024 14:05:53 +0100
Subject: [PATCH] Provide clean error message when running scrublet on an empty
 array (#929)

---
 CHANGELOG.md                                  |   2 +
 .../filter_with_scrublet/config.vsh.yaml      |   6 +-
 src/filter/filter_with_scrublet/script.py     |   6 +
 src/filter/filter_with_scrublet/test.py       | 187 +++++++++++-------
 4 files changed, 127 insertions(+), 74 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9c7811a1f5d..508a6c2c9a8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,8 @@
 
 ## MINOR CHANGES
 
+* `filter/filter_with_scrublet`: provide cleaner error message when running scrublet on an empty modality (PR #929).
+
 * Several component (cleanup): remove workaround for using being able to use shared utility functions with Nextflow Fusion (PR #920).
 
 * Several annotation (`src/annotate/`) components (`onclass`, `celltypist`, `random_forest_annotation`, `scanvi`, `svm_annotation`): Updated input parameteres to ensure uniformity across components, implemented functionality to cross-check the overlap of genes between query and reference (model) datasets and implemented logic to allow for subsetting of genes (PR #919). 
diff --git a/src/filter/filter_with_scrublet/config.vsh.yaml b/src/filter/filter_with_scrublet/config.vsh.yaml
index d4d091460d0..9b0004cf6c1 100644
--- a/src/filter/filter_with_scrublet/config.vsh.yaml
+++ b/src/filter/filter_with_scrublet/config.vsh.yaml
@@ -131,10 +131,8 @@ engines:
       __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
       packages:
         - scrublet
-        - annoy==1.16.3
-  test_setup:
-    - type: python
-      __merge__: [ /src/base/requirements/viashpy.yaml, .]
+        - annoy==1.17.3
+  __merge__: [/src/base/requirements/python_test_setup.yaml, .]
 
 runners:
 - type: executable
diff --git a/src/filter/filter_with_scrublet/script.py b/src/filter/filter_with_scrublet/script.py
index 6900135dbe8..8aba6db53cf 100644
--- a/src/filter/filter_with_scrublet/script.py
+++ b/src/filter/filter_with_scrublet/script.py
@@ -44,6 +44,12 @@
 logger.info("Using layer '%s'.", "X" if not par["layer"] else par["layer"])
 input_layer = data.X if not par["layer"] else data.layers[par["layer"]]
 
+if 0 in input_layer.shape:
+    raise ValueError(
+        f"Modality {mod} of input Mudata {par['input']} appears "
+        f"to be empty (shape: {input_layer.shape})."
+    )
+
 logger.info("\tRunning scrublet")
 scrub = scr.Scrublet(input_layer)
 
diff --git a/src/filter/filter_with_scrublet/test.py b/src/filter/filter_with_scrublet/test.py
index 8d3899e3abe..fe9f3c0e287 100644
--- a/src/filter/filter_with_scrublet/test.py
+++ b/src/filter/filter_with_scrublet/test.py
@@ -7,40 +7,74 @@
 import numpy as np
 import pandas as pd
 import anndata as ad
-from scipy.sparse import csr_matrix
+from scipy.sparse import csr_matrix, csr_array
 
 ## VIASH START
 meta = {
     "name": "foo",
     "resources_dir": "resources_test/",
     "executable": "target/executable/filter/filter_with_scrublet/filter_with_scrublet",
+    "config": "./src/filter/filter_with_scrublet/config.vsh.yaml",
 }
-# def run_component(args_as_list):
-#     try:
-#         subprocess_args = [meta['executable']] + args_as_list
-#         print(" ".join(subprocess_args), flush=True)
-#         subprocess.check_output(subprocess_args, stderr=subprocess.STDOUT)
-#     except subprocess.CalledProcessError as e:
-#         print(e.stdout.decode("utf-8"), flush=True)
-#         raise e
 ## VIASH END
 
 # read input file
-input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
-input_mu = mu.read_h5mu(input_path)
-orig_obs = input_mu.mod["rna"].n_obs
-orig_vars = input_mu.mod["rna"].n_vars
-orig_prot_obs = input_mu.mod["prot"].n_obs
-orig_prot_vars = input_mu.mod["prot"].n_vars
 
 
-def test_filter_a_little_bit(run_component):
-    output_mu = "output-1.h5mu"
+@pytest.fixture
+def input_mudata_path():
+    return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+
+
+@pytest.fixture
+def input_mudata(input_mudata_path):
+    return mu.read_h5mu(input_mudata_path)
+
+
+@pytest.fixture
+def input_with_failed_run(random_h5mu_path, input_mudata_path):
+    new_mudata_path = random_h5mu_path()
+
+    mudata_in = mu.read_h5mu(input_mudata_path)
+
+    # Make test reproducible
+    np.random.seed(4)
+
+    # Simulate a failed scrublet run by passing very few cells
+    mudata = mudata_in[152].copy()
+    nobs = 100
+    x_data = np.repeat(mudata.mod["rna"].X.todense(), nobs, axis=0)
+
+    # Random perturbations because otherwise the detection fails in other ways (PCA cannot be run)
+    replace_rate = 0.000001
+    mask = np.random.choice(
+        [0, 1], size=x_data.shape, p=((1 - replace_rate), replace_rate)
+    ).astype("bool")
+    r = np.random.rand(*x_data.shape) * np.max(x_data)
+    x_data[mask] = r[mask]
+
+    # create obs
+    obs_name = mudata.mod["rna"].obs.index.to_list()[0]
+    obs_data = pd.DataFrame([], index=[f"{obs_name}_{i}" for i in range(nobs)])
+
+    # create resulting mudata
+    mod = ad.AnnData(X=csr_matrix(x_data), obs=obs_data, var=mudata.mod["rna"].var)
+    new_mudata = mu.MuData({"rna": mod})
+    new_mudata.update()
+    new_mudata.write(new_mudata_path)
+
+    return new_mudata_path
+
+
+def test_filter_a_little_bit(
+    run_component, random_h5mu_path, input_mudata_path, input_mudata
+):
+    output_mu = random_h5mu_path()
 
     run_component(
         [
             "--input",
-            input_path,
+            input_mudata_path,
             "--output",
             output_mu,
             "--min_counts",
@@ -56,13 +90,17 @@ def test_filter_a_little_bit(run_component):
 
     new_obs = mu_out.mod["rna"].n_obs
     new_vars = mu_out.mod["rna"].n_vars
-    assert new_obs == orig_obs, "No RNA obs should have been filtered"
-    assert new_vars == orig_vars, "No RNA vars should have been filtered"
     assert (
-        mu_out.mod["prot"].n_obs == orig_prot_obs
+        new_obs == input_mudata.mod["rna"].n_obs
+    ), "No RNA obs should have been filtered"
+    assert (
+        new_vars == input_mudata.mod["rna"].n_vars
+    ), "No RNA vars should have been filtered"
+    assert (
+        mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs
     ), "No prot obs should have been filtered"
     assert (
-        mu_out.mod["prot"].n_vars == orig_prot_vars
+        mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars
     ), "No prot vars should have been filtered"
     assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [
         "Gene Expression"
@@ -72,13 +110,15 @@ def test_filter_a_little_bit(run_component):
     ], "Feature types of prot modality should be Antibody Capture"
 
 
-def test_filtering_a_lot(run_component):
-    output_mu = "output-2.h5mu"
+def test_filtering_a_lot(
+    run_component, random_h5mu_path, input_mudata_path, input_mudata
+):
+    output_mu = random_h5mu_path()
 
     run_component(
         [
             "--input",
-            input_path,
+            input_mudata_path,
             "--output",
             output_mu,
             "--modality",
@@ -95,11 +135,17 @@ def test_filtering_a_lot(run_component):
     mu_out = mu.read_h5mu(output_mu)
     new_obs = mu_out.mod["rna"].n_obs
     new_vars = mu_out.mod["rna"].n_vars
-    assert new_obs < orig_obs, "Some cells should have been filtered"
-    assert new_vars == orig_vars, "No genes should have been filtered"
-    assert mu_out.mod["prot"].n_obs == orig_obs, "No prot obs should have been filtered"
     assert (
-        mu_out.mod["prot"].n_vars == orig_prot_vars
+        new_obs < input_mudata.mod["rna"].n_obs
+    ), "Some cells should have been filtered"
+    assert (
+        new_vars == input_mudata.mod["rna"].n_vars
+    ), "No genes should have been filtered"
+    assert (
+        mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs
+    ), "No prot obs should have been filtered"
+    assert (
+        mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars
     ), "No prot vars should have been filtered"
     assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [
         "Gene Expression"
@@ -109,49 +155,42 @@ def test_filtering_a_lot(run_component):
     ], "Feature types of prot modality should be Antibody Capture"
 
 
-@pytest.fixture(scope="module")
-def input_with_failed_run():
-    new_mudata_path = "pbmc-perturbed.h5mu"
-
-    mudata_in = mu.read_h5mu(input_path)
-
-    # Make test reproducable
-    np.random.seed(4)
-
-    # Simulate a failed scrublet run by passing very little cells
-    mudata = mudata_in[152].copy()
-    nobs = 100
-    x_data = np.repeat(mudata.mod["rna"].X.todense(), nobs, axis=0)
-
-    # Random perturbations because otherwise the detection fails in other ways (PCA cannot be run)
-    replace_rate = 0.000001
-    mask = np.random.choice(
-        [0, 1], size=x_data.shape, p=((1 - replace_rate), replace_rate)
-    ).astype("bool")
-    r = np.random.rand(*x_data.shape) * np.max(x_data)
-    x_data[mask] = r[mask]
-
-    # create obs
-    obs_name = mudata.mod["rna"].obs.index.to_list()[0]
-    obs_data = pd.DataFrame([], index=[f"{obs_name}_{i}" for i in range(nobs)])
-
-    # create resulting mudata
-    mod = ad.AnnData(X=csr_matrix(x_data), obs=obs_data, var=mudata.mod["rna"].var)
-    new_mudata = mu.MuData({"rna": mod})
-    new_mudata.update()
-    new_mudata.write(new_mudata_path)
+def test_empty_mudata(run_component, random_h5mu_path):
+    output_mu = random_h5mu_path()
+    empty_mudata_path = random_h5mu_path()
+    empty_mudata = mu.MuData(
+        {
+            modality: ad.AnnData(csr_array((5, 0), dtype=np.int8))
+            for modality in ("rna",)
+        }
+    )
 
-    return new_mudata_path
+    empty_mudata.write(empty_mudata_path)
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(
+            [
+                "--input",
+                empty_mudata_path,
+                "--output",
+                output_mu,
+                "--output_compression",
+                "gzip",
+            ]
+        )
+    assert re.search(
+        "ValueError: Modality rna of input Mudata .* appears to be empty",
+        err.value.stdout.decode("utf-8"),
+    )
 
 
 @pytest.mark.xfail(strict=False)
 def test_doublet_automatic_threshold_detection_fails(
-    run_component, input_with_failed_run
+    run_component, input_with_failed_run, random_h5mu_path
 ):
     """
     Test if the component fails if doublet score threshold could not automatically be set
     """
-    output_mu = "output-4.h5mu"
+    output_mu = random_h5mu_path()
 
     with pytest.raises(subprocess.CalledProcessError) as e_info:
         run_component(
@@ -217,9 +256,11 @@ def test_doublet_automatic_threshold_detection_fails_recovery(
     assert mu_out.mod["rna"].obs["filter_with_scrublet"].isna().all()
 
 
-def test_selecting_input_layer(run_component, tmp_path):
-    output_mu = "output-2.h5mu"
-    input_data = mu.read_h5mu(input_path)
+def test_selecting_input_layer(
+    run_component, tmp_path, random_h5mu_path, input_mudata, input_mudata_path
+):
+    output_mu = random_h5mu_path()
+    input_data = mu.read_h5mu(input_mudata_path)
     input_data.mod["rna"].layers["test_layer"] = input_data.mod["rna"].X
     input_data.mod["rna"].X = None
 
@@ -248,11 +289,17 @@ def test_selecting_input_layer(run_component, tmp_path):
     mu_out = mu.read_h5mu(output_mu)
     new_obs = mu_out.mod["rna"].n_obs
     new_vars = mu_out.mod["rna"].n_vars
-    assert new_obs < orig_obs, "Some cells should have been filtered"
-    assert new_vars == orig_vars, "No genes should have been filtered"
-    assert mu_out.mod["prot"].n_obs == orig_obs, "No prot obs should have been filtered"
     assert (
-        mu_out.mod["prot"].n_vars == orig_prot_vars
+        new_obs < input_mudata.mod["rna"].n_obs
+    ), "Some cells should have been filtered"
+    assert (
+        new_vars == input_mudata.mod["rna"].n_vars
+    ), "No genes should have been filtered"
+    assert (
+        mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs
+    ), "No prot obs should have been filtered"
+    assert (
+        mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars
     ), "No prot vars should have been filtered"
     assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [
         "Gene Expression"