diff --git a/CHANGELOG.md b/CHANGELOG.md index a2e1f09955c..7faa308b741 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -117,6 +117,8 @@ * `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863). +* Added `dimred/lsi` component (PR #552). + * `metadata/copy_obs` component: Added a component to copy an .obs column from a MuData object to another (PR #874). * `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832). diff --git a/src/authors/sarah_ouologuem.yaml b/src/authors/sarah_ouologuem.yaml new file mode 100644 index 00000000000..5ed8795a5d2 --- /dev/null +++ b/src/authors/sarah_ouologuem.yaml @@ -0,0 +1,10 @@ +name: Sarah Ouologuem +info: + role: Contributor + links: + github: SarahOuologuem + orcid: 0009-0005-3398-1700 + organizations: + - name: Helmholtz Munich + href: https://www.helmholtz-munich.de + role: Student Assistant \ No newline at end of file diff --git a/src/dimred/lsi/config.vsh.yaml b/src/dimred/lsi/config.vsh.yaml new file mode 100644 index 00000000000..fd2ac8df0b1 --- /dev/null +++ b/src/dimred/lsi/config.vsh.yaml @@ -0,0 +1,124 @@ +name: lsi +namespace: "dimred" +description: | + Runs Latent Semantic Indexing. Computes cell embeddings, feature loadings and singular values. Uses the implementation of scipy. +authors: + - __merge__: /src/authors/sarah_ouologuem.yaml + roles: [ contributor ] + - __merge__: /src/authors/vladimir_shitov.yaml + roles: [ contributor ] +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Path to input h5mu file + direction: input + required: true + example: input.h5mu + + - name: "--modality" + type: string + default: "atac" + description: Modality on which to run LSI. + required: false + + - name: "--layer" + type: string + description: Use specified layer for expression values. If not specified, uses adata.X. 
+ required: false + + - name: "--var_input" + type: string + description: Column name in .var matrix that will be used to select which genes to run the LSI on. If not specified, uses all features. + required: false + + - name: LSI options + arguments: + - name: "--num_components" + type: integer + default: 50 + description: Number of components to compute. + required: false + min: 2 + + - name: "--scale_embeddings" + type: boolean + default: true + description: Scale embeddings to zero mean and unit variance. + + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output h5mu file. + direction: output + required: true + example: output.h5mu + + - name: "--output_compression" + type: string + default: "gzip" + description: The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + + - name: "--obsm_output" + type: string + default: "X_lsi" + description: In which .obsm slot to store the resulting embedding. + required: false + + - name: "--varm_output" + type: string + default: "lsi" + description: In which .varm slot to store the resulting loadings matrix. + required: false + + - name: "--uns_output" + type: string + default: "lsi" + description: In which .uns slot to store the stdev. + required: false + + - name: "--overwrite" + type: boolean_true + description: Allow overwriting .obsm, .varm and .uns slots. + + +resources: + - type: python_script + path: script.py + - path: ../../utils/subset_vars.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: ../../utils/subset_vars.py + - path: ../../../resources_test/concat_test_data + + +engines: + - type: docker + image: python:3.11-slim + setup: + - type: apt + packages: + - procps + - pkg-config # Otherwise h5py installation fails, which is required for scanpy + - libhdf5-dev + - gcc + - type: python + __merge__: [../../../src/base/requirements/anndata_mudata.yaml, .] 
+ packages: + - muon~=0.1.6 + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: + - highcpu + - highmem diff --git a/src/dimred/lsi/script.py b/src/dimred/lsi/script.py new file mode 100644 index 00000000000..8a1f5328823 --- /dev/null +++ b/src/dimred/lsi/script.py @@ -0,0 +1,108 @@ +import muon as mu +import mudata as md +from anndata import AnnData +import numpy as np +import sys + + +## VIASH START +par = { + "num_components": 50, # number of components to calculate with SVD + "scale_embeddings": True, # scale embeddings to zero mean and unit variance + "modality": "atac", # on which modality the LSI should be run + "layer": None, # on which layer to run the LSI, if None, will run it on anndata.X + "var_input": None, # column in anndata.var of the highly variable features + + "overwrite": True, + "obsm_output": "X_lsi", + "varm_output": "LSI", + "uns_output": "lsi", + "output": "output.h5mu", + "output_compression": "gzip" +} +## VIASH END + + +sys.path.append(meta["resources_dir"]) +from subset_vars import subset_vars + + +# START TEMPORARY WORKAROUND setup_logger +# reason: resources aren't available when using Nextflow fusion +# from setup_logger import setup_logger +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger +# END TEMPORARY WORKAROUND setup_logger +logger = setup_logger() + + +#1.read in mudata +logger.info("Reading %s.", par["input"]) +mdata = md.read_h5mu(par["input"]) + +#2. 
# subset on modality (body of the step-2 heading)
modality_name = par["modality"]
if modality_name not in mdata.mod:
    raise ValueError(f"Modality '{par['modality']}' was not found in mudata {par['input']}.")
adata = mdata.mod[modality_name]


# 3. Specify layer
# An unset/empty --layer means "use adata.X"; a named layer must exist.
layer_name = par["layer"]
if layer_name and layer_name not in adata.layers:
    raise ValueError(f"Layer '{par['layer']}' was not found in modality '{par['modality']}'.")
if layer_name:
    layer = adata.layers[layer_name]
else:
    layer = adata.X
# LSI runs on a separate AnnData that carries only the chosen matrix and .var,
# so the modality itself is not modified until the results are copied back.
adata_input_layer = AnnData(layer, var=adata.var)


if layer_name:
    logger.info("Using modality '%s' and layer '%s' for LSI computation", modality_name, layer_name)
else:
    logger.info("Using modality '%s' and adata.X for LSI computation", modality_name)


# 4. Subset on highly variable features if applicable
var_column = par["var_input"]
if var_column:
    adata_input_layer = subset_vars(adata_input_layer, var_column)


# 5. Run LSI
n_features = adata_input_layer.X.shape[1]
logger.info("Computing %s LSI components on %s features", par["num_components"], n_features)
mu.atac.tl.lsi(
    adata_input_layer,
    n_comps=par["num_components"],
    scale_embeddings=par["scale_embeddings"],
)


#6.
Store output in object +check_exist_dict = { + "obsm_output": ("obsm"), + "varm_output": ("varm"), + "uns_output": ("uns") +} +for parameter_name, field in check_exist_dict.items(): + if par[parameter_name] in getattr(adata, field): + if not par["overwrite"]: + raise ValueError(f"Requested to create field {par[parameter_name]} in .{field} " + f"for modality {par['modality']}, but field already exists.") + del getattr(adata, field)[par[parameter_name]] + +adata.obsm[par["obsm_output"]] = adata_input_layer.obsm['X_lsi'] +adata.uns[par["uns_output"]] = adata_input_layer.uns['lsi'] +if par["var_input"]: + adata.varm[par["varm_output"]] = np.zeros(shape=(adata.n_vars, adata_input_layer.varm["LSI"].shape[1])) + adata.varm[par["varm_output"]][adata.var[par["var_input"]]] = adata_input_layer.varm['LSI'] +else: + adata.varm[par["varm_output"]] = adata_input_layer.varm['LSI'] + +logger.info("Writing to %s.", par["output"]) +mdata.write(filename = par["output"], compression=par["output_compression"]) + +logger.info("Finished") diff --git a/src/dimred/lsi/test.py b/src/dimred/lsi/test.py new file mode 100644 index 00000000000..f6d293c6535 --- /dev/null +++ b/src/dimred/lsi/test.py @@ -0,0 +1,201 @@ +import sys +import pytest +import subprocess +import mudata as mu +import numpy as np + +## VIASH START +meta = { + 'resources_dir': 'resources_test', + 'executable': './target/docker/dimred/lsi/lsi', + 'config': './src/dimred/lsi/config.vsh.yaml' +} +## VIASH END + +input_path = f"{meta['resources_dir']}/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" + + +''' +Tests: +1. general test +2. test HVF +3. test modality +4. test layer +5. 
# Test overview (sections below): 1. general test, 2. test HVF, 3. test modality,
# 4. test layer, 5. test overwrite


@pytest.fixture
def atac_mudata(tmp_path):
    """Write a variant of the test MuData with a "counts" layer and a feature mask.

    The "atac" modality gets a "counts" layer that refers to its .X and a
    boolean "highly_variable" .var column. The mask is drawn from a seeded
    generator so that every test run selects the same features.

    Returns
    -------
    pathlib.Path
        Location of the written h5mu file inside ``tmp_path``.
    """
    mdata = mu.read_h5mu(input_path)
    atac = mdata.mod["atac"]
    atac.layers["counts"] = atac.X
    rng = np.random.default_rng(seed=0)
    atac.var["highly_variable"] = rng.choice([True, False], size=atac.n_vars)

    mudata_path = tmp_path / "atac_mudata.h5mu"
    mdata.write(mudata_path)
    return mudata_path


@pytest.fixture
def mudata_with_lsi_slots(tmp_path):
    """Write a variant of the test MuData whose default LSI output slots are taken.

    .varm["lsi"], .obsm["X_lsi"] and .uns["lsi"] of the "atac" modality are
    filled with placeholder values (50 zero columns / the string "test").

    Returns
    -------
    pathlib.Path
        Location of the written h5mu file inside ``tmp_path``.
    """
    input_data = mu.read_h5mu(input_path)
    atac = input_data.mod["atac"]
    atac.varm["lsi"] = np.zeros(shape=(atac.n_vars, 50))
    atac.obsm["X_lsi"] = np.zeros(shape=(atac.n_obs, 50))
    atac.uns["lsi"] = "test"

    tmp_file = tmp_path / "input_data_adjusted.h5mu"
    input_data.write_h5mu(tmp_file)
    return tmp_file


# 1. general test
def test_lsi(run_component, tmp_path):
    """Default run with a custom .obsm key and 30 components."""
    output_path = tmp_path / "output_lsi.h5mu"

    run_component([
        "--input", input_path,
        "--output", str(output_path),
        "--obsm_output", "X_test",
        "--num_components", "30",
    ])

    assert output_path.is_file()
    atac = mu.read_h5mu(output_path).mod["atac"]
    assert "X_test" in atac.obsm
    assert atac.obsm["X_test"].shape == (atac.n_obs, 30)
    assert "lsi" in atac.uns
    assert "lsi" in atac.varm


# 2. test HVF
def test_select_highly_variable_column(run_component, random_h5mu_path, atac_mudata):
    """Restricting LSI to a .var mask still yields loadings with one row per feature."""
    output_path = random_h5mu_path()

    run_component([
        "--input", str(atac_mudata),
        "--output", str(output_path),
        "--var_input", "highly_variable",
    ])

    assert output_path.is_file()
    atac = mu.read_h5mu(output_path).mod["atac"]
    assert "X_lsi" in atac.obsm
    assert atac.obsm["X_lsi"].shape == (atac.n_obs, 50)
    assert "highly_variable" in atac.var.columns
    assert "lsi" in atac.uns
    assert "lsi" in atac.varm
    assert atac.varm["lsi"].shape == (atac.n_vars, 50)


def test_highly_variable_column_does_not_exist_raises(run_component, tmp_path):
    """An unknown --var_input column makes the component fail with a ValueError."""
    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component([
            "--input", input_path,
            # Kept inside tmp_path so nothing lands in the working directory
            # should the component unexpectedly succeed.
            "--output", str(tmp_path / "output_lsi.h5mu"),
            "--var_input", "does_not_exist",
        ])

    expected = (
        "ValueError: Requested to use .var column 'does_not_exist' "
        "as a selection of genes, but the column is not available."
    )
    assert expected in err.value.stdout.decode('utf-8')


# 3. test modality
def test_modality_does_not_exist_raises(run_component, tmp_path):
    """An unknown --modality makes the component fail with a ValueError."""
    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component([
            "--input", input_path,
            "--output", str(tmp_path / "output_lsi.h5mu"),
            "--modality", "does_not_exist",
        ])

    expected = f"ValueError: Modality 'does_not_exist' was not found in mudata {input_path}."
    assert expected in err.value.stdout.decode('utf-8')


# 4. test layer
def test_selecting_input_layer(run_component, atac_mudata, tmp_path):
    """Running on a named layer with 20 components."""
    output_path = tmp_path / "output_lsi.h5mu"

    run_component([
        "--input", str(atac_mudata),
        "--output", str(output_path),
        "--num_components", "20",
        "--layer", "counts",
    ])

    assert output_path.is_file()
    atac = mu.read_h5mu(output_path).mod["atac"]
    assert "counts" in atac.layers
    assert "X_lsi" in atac.obsm
    assert atac.obsm["X_lsi"].shape == (atac.n_obs, 20)
    assert "lsi" in atac.uns
    assert "lsi" in atac.varm


def test_raise_if_input_layer_is_missing(run_component, tmp_path):
    """An unknown --layer makes the component fail with a ValueError."""
    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component([
            "--input", input_path,
            "--output", str(tmp_path / "output.h5mu"),
            "--layer", "does_not_exist",
        ])

    expected = "ValueError: Layer 'does_not_exist' was not found in modality 'atac'."
    assert expected in err.value.stdout.decode('utf-8')


# 5. test overwrite
def test_output_field_already_present_raises(run_component, mudata_with_lsi_slots, tmp_path):
    """Without --overwrite, occupied output slots make the component fail."""
    output_path = tmp_path / "output_lsi.h5mu"

    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component([
            "--input", str(mudata_with_lsi_slots),
            "--output", str(output_path),
            "--output_compression", "gzip",
        ])

    expected = (
        "ValueError: Requested to create field X_lsi in .obsm for "
        "modality atac, but field already exists."
    )
    assert expected in err.value.stdout.decode('utf-8')


def test_output_field_already_present_overwrite(run_component, mudata_with_lsi_slots, tmp_path):
    """With --overwrite, occupied slots are replaced (30 columns instead of 50)."""
    output_path = tmp_path / "output_lsi.h5mu"

    run_component([
        "--input", str(mudata_with_lsi_slots),
        "--output", str(output_path),
        "--output_compression", "gzip",
        "--overwrite",
        "--num_components", "30",
    ])

    assert output_path.is_file()
    atac = mu.read_h5mu(output_path).mod["atac"]
    assert "X_lsi" in atac.obsm
    assert atac.obsm["X_lsi"].shape == (atac.n_obs, 30)
    assert "lsi" in atac.uns
    assert "lsi" in atac.varm


if __name__ == '__main__':
    sys.exit(pytest.main([__file__]))
def subset_vars(adata, subset_col):
    """Subset AnnData object on highly variable genes

    Parameters
    ----------
    adata : AnnData
        Object to subset; its ``.var`` must contain `subset_col`.
    subset_col : str
        Name of the ``.var`` column that is passed as the feature-axis
        indexer (presumably a boolean mask -- confirm with callers).

    Returns
    -------
    AnnData
        Copy of `adata` with subsetted features

    Raises
    ------
    ValueError
        If `subset_col` is not a column of ``adata.var``.
    """
    # Fail early with an explicit message instead of a bare KeyError from the
    # .var lookup below.
    if subset_col not in adata.var.columns:
        raise ValueError(f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available.")

    return adata[:, adata.var[subset_col]].copy()