Skip to content

Commit

Permalink
Provide clean error message when running scrublet on an empty array (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
DriesSchaumont authored Dec 11, 2024
1 parent aa4cd62 commit 49066af
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 74 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

## MINOR CHANGES

* `filter/filter_with_scrublet`: provide cleaner error message when running scrublet on an empty modality (PR #929).

* Several component (cleanup): remove workaround for using being able to use shared utility functions with Nextflow Fusion (PR #920).

* Several annotation (`src/annotate/`) components (`onclass`, `celltypist`, `random_forest_annotation`, `scanvi`, `svm_annotation`): Updated input parameteres to ensure uniformity across components, implemented functionality to cross-check the overlap of genes between query and reference (model) datasets and implemented logic to allow for subsetting of genes (PR #919).
Expand Down
6 changes: 2 additions & 4 deletions src/filter/filter_with_scrublet/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,8 @@ engines:
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
packages:
- scrublet
- annoy==1.16.3
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
- annoy==1.17.3
__merge__: [/src/base/requirements/python_test_setup.yaml, .]

runners:
- type: executable
Expand Down
6 changes: 6 additions & 0 deletions src/filter/filter_with_scrublet/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@
logger.info("Using layer '%s'.", "X" if not par["layer"] else par["layer"])
input_layer = data.X if not par["layer"] else data.layers[par["layer"]]

if 0 in input_layer.shape:
raise ValueError(
f"Modality {mod} of input Mudata {par['input']} appears "
f"to be empty (shape: {input_layer.shape})."
)

logger.info("\tRunning scrublet")
scrub = scr.Scrublet(input_layer)

Expand Down
187 changes: 117 additions & 70 deletions src/filter/filter_with_scrublet/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,40 +7,74 @@
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
from scipy.sparse import csr_matrix, csr_array

## VIASH START
meta = {
"name": "foo",
"resources_dir": "resources_test/",
"executable": "target/executable/filter/filter_with_scrublet/filter_with_scrublet",
"config": "./src/filter/filter_with_scrublet/config.vsh.yaml",
}
# def run_component(args_as_list):
# try:
# subprocess_args = [meta['executable']] + args_as_list
# print(" ".join(subprocess_args), flush=True)
# subprocess.check_output(subprocess_args, stderr=subprocess.STDOUT)
# except subprocess.CalledProcessError as e:
# print(e.stdout.decode("utf-8"), flush=True)
# raise e
## VIASH END

# read input file
input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
input_mu = mu.read_h5mu(input_path)
orig_obs = input_mu.mod["rna"].n_obs
orig_vars = input_mu.mod["rna"].n_vars
orig_prot_obs = input_mu.mod["prot"].n_obs
orig_prot_vars = input_mu.mod["prot"].n_vars


def test_filter_a_little_bit(run_component):
output_mu = "output-1.h5mu"
@pytest.fixture
def input_mudata_path():
return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"


@pytest.fixture
def input_mudata(input_mudata_path):
return mu.read_h5mu(input_mudata_path)


@pytest.fixture
def input_with_failed_run(random_h5mu_path, input_mudata_path):
new_mudata_path = random_h5mu_path()

mudata_in = mu.read_h5mu(input_mudata_path)

# Make test reproducable
np.random.seed(4)

# Simulate a failed scrublet run by passing very little cells
mudata = mudata_in[152].copy()
nobs = 100
x_data = np.repeat(mudata.mod["rna"].X.todense(), nobs, axis=0)

# Random perturbations because otherwise the detection fails in other ways (PCA cannot be run)
replace_rate = 0.000001
mask = np.random.choice(
[0, 1], size=x_data.shape, p=((1 - replace_rate), replace_rate)
).astype("bool")
r = np.random.rand(*x_data.shape) * np.max(x_data)
x_data[mask] = r[mask]

# create obs
obs_name = mudata.mod["rna"].obs.index.to_list()[0]
obs_data = pd.DataFrame([], index=[f"{obs_name}_{i}" for i in range(nobs)])

# create resulting mudata
mod = ad.AnnData(X=csr_matrix(x_data), obs=obs_data, var=mudata.mod["rna"].var)
new_mudata = mu.MuData({"rna": mod})
new_mudata.update()
new_mudata.write(new_mudata_path)

return new_mudata_path


def test_filter_a_little_bit(
run_component, random_h5mu_path, input_mudata_path, input_mudata
):
output_mu = random_h5mu_path()

run_component(
[
"--input",
input_path,
input_mudata_path,
"--output",
output_mu,
"--min_counts",
Expand All @@ -56,13 +90,17 @@ def test_filter_a_little_bit(run_component):

new_obs = mu_out.mod["rna"].n_obs
new_vars = mu_out.mod["rna"].n_vars
assert new_obs == orig_obs, "No RNA obs should have been filtered"
assert new_vars == orig_vars, "No RNA vars should have been filtered"
assert (
mu_out.mod["prot"].n_obs == orig_prot_obs
new_obs == input_mudata.mod["rna"].n_obs
), "No RNA obs should have been filtered"
assert (
new_vars == input_mudata.mod["rna"].n_vars
), "No RNA vars should have been filtered"
assert (
mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs
), "No prot obs should have been filtered"
assert (
mu_out.mod["prot"].n_vars == orig_prot_vars
mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars
), "No prot vars should have been filtered"
assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [
"Gene Expression"
Expand All @@ -72,13 +110,15 @@ def test_filter_a_little_bit(run_component):
], "Feature types of prot modality should be Antibody Capture"


def test_filtering_a_lot(run_component):
output_mu = "output-2.h5mu"
def test_filtering_a_lot(
run_component, random_h5mu_path, input_mudata_path, input_mudata
):
output_mu = random_h5mu_path()

run_component(
[
"--input",
input_path,
input_mudata_path,
"--output",
output_mu,
"--modality",
Expand All @@ -95,11 +135,17 @@ def test_filtering_a_lot(run_component):
mu_out = mu.read_h5mu(output_mu)
new_obs = mu_out.mod["rna"].n_obs
new_vars = mu_out.mod["rna"].n_vars
assert new_obs < orig_obs, "Some cells should have been filtered"
assert new_vars == orig_vars, "No genes should have been filtered"
assert mu_out.mod["prot"].n_obs == orig_obs, "No prot obs should have been filtered"
assert (
mu_out.mod["prot"].n_vars == orig_prot_vars
new_obs < input_mudata.mod["rna"].n_obs
), "Some cells should have been filtered"
assert (
new_vars == input_mudata.mod["rna"].n_vars
), "No genes should have been filtered"
assert (
mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs
), "No prot obs should have been filtered"
assert (
mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars
), "No prot vars should have been filtered"
assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [
"Gene Expression"
Expand All @@ -109,49 +155,42 @@ def test_filtering_a_lot(run_component):
], "Feature types of prot modality should be Antibody Capture"


@pytest.fixture(scope="module")
def input_with_failed_run():
new_mudata_path = "pbmc-perturbed.h5mu"

mudata_in = mu.read_h5mu(input_path)

# Make test reproducable
np.random.seed(4)

# Simulate a failed scrublet run by passing very little cells
mudata = mudata_in[152].copy()
nobs = 100
x_data = np.repeat(mudata.mod["rna"].X.todense(), nobs, axis=0)

# Random perturbations because otherwise the detection fails in other ways (PCA cannot be run)
replace_rate = 0.000001
mask = np.random.choice(
[0, 1], size=x_data.shape, p=((1 - replace_rate), replace_rate)
).astype("bool")
r = np.random.rand(*x_data.shape) * np.max(x_data)
x_data[mask] = r[mask]

# create obs
obs_name = mudata.mod["rna"].obs.index.to_list()[0]
obs_data = pd.DataFrame([], index=[f"{obs_name}_{i}" for i in range(nobs)])

# create resulting mudata
mod = ad.AnnData(X=csr_matrix(x_data), obs=obs_data, var=mudata.mod["rna"].var)
new_mudata = mu.MuData({"rna": mod})
new_mudata.update()
new_mudata.write(new_mudata_path)
def test_empty_mudata(run_component, random_h5mu_path):
output_mu = random_h5mu_path()
empty_mudata_path = random_h5mu_path()
empty_mudata = mu.MuData(
{
modality: ad.AnnData(csr_array((5, 0), dtype=np.int8))
for modality in ("rna",)
}
)

return new_mudata_path
empty_mudata.write(empty_mudata_path)
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
empty_mudata_path,
"--output",
output_mu,
"--output_compression",
"gzip",
]
)
assert re.search(
"ValueError: Modality rna of input Mudata .* appears to be empty",
err.value.stdout.decode("utf-8"),
)


@pytest.mark.xfail(strict=False)
def test_doublet_automatic_threshold_detection_fails(
run_component, input_with_failed_run
run_component, input_with_failed_run, random_h5mu_path
):
"""
Test if the component fails if doublet score threshold could not automatically be set
"""
output_mu = "output-4.h5mu"
output_mu = random_h5mu_path()

with pytest.raises(subprocess.CalledProcessError) as e_info:
run_component(
Expand Down Expand Up @@ -217,9 +256,11 @@ def test_doublet_automatic_threshold_detection_fails_recovery(
assert mu_out.mod["rna"].obs["filter_with_scrublet"].isna().all()


def test_selecting_input_layer(run_component, tmp_path):
output_mu = "output-2.h5mu"
input_data = mu.read_h5mu(input_path)
def test_selecting_input_layer(
run_component, tmp_path, random_h5mu_path, input_mudata, input_mudata_path
):
output_mu = random_h5mu_path()
input_data = mu.read_h5mu(input_mudata_path)
input_data.mod["rna"].layers["test_layer"] = input_data.mod["rna"].X
input_data.mod["rna"].X = None

Expand Down Expand Up @@ -248,11 +289,17 @@ def test_selecting_input_layer(run_component, tmp_path):
mu_out = mu.read_h5mu(output_mu)
new_obs = mu_out.mod["rna"].n_obs
new_vars = mu_out.mod["rna"].n_vars
assert new_obs < orig_obs, "Some cells should have been filtered"
assert new_vars == orig_vars, "No genes should have been filtered"
assert mu_out.mod["prot"].n_obs == orig_obs, "No prot obs should have been filtered"
assert (
mu_out.mod["prot"].n_vars == orig_prot_vars
new_obs < input_mudata.mod["rna"].n_obs
), "Some cells should have been filtered"
assert (
new_vars == input_mudata.mod["rna"].n_vars
), "No genes should have been filtered"
assert (
mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs
), "No prot obs should have been filtered"
assert (
mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars
), "No prot vars should have been filtered"
assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [
"Gene Expression"
Expand Down

0 comments on commit 49066af

Please sign in to comment.