Add LSI (#552)

SarahOuologuem · VladimirShitov · dorien-er · commit dc38e1560ed8 · 2024-11-18T15:18:00.000+01:00
Co-authored-by: Vladimir Shitov <35199218+VladimirShitov@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -117,6 +117,8 @@
 
 * `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863).
 
+* Added `dimred/lsi` component (PR #552).
+
 * `metadata/copy_obs` component: Added a component to copy an .obs column from a MuData object to another (PR #874).
 
 * `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832).
diff --git a/src/authors/sarah_ouologuem.yaml b/src/authors/sarah_ouologuem.yaml
@@ -0,0 +1,10 @@
+name: Sarah Ouologuem
+info:
+  role: Contributor
+  links:
+    github: SarahOuologuem
+    orcid: 0009-0005-3398-1700
+  organizations:
+    - name: Helmholtz Munich
+      href: https://www.helmholtz-munich.de
+      role: Student Assistant
diff --git a/src/dimred/lsi/config.vsh.yaml b/src/dimred/lsi/config.vsh.yaml
@@ -0,0 +1,124 @@
+name: lsi
+namespace: "dimred"
+description: |
+  Runs Latent Semantic Indexing (LSI). Computes cell embeddings, feature loadings and singular values. Uses muon's `atac.tl.lsi` implementation, which relies on scipy.
+authors:
+  - __merge__: /src/authors/sarah_ouologuem.yaml
+    roles: [ contributor ]
+  - __merge__: /src/authors/vladimir_shitov.yaml
+    roles: [ contributor ]
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input"
+        alternatives: ["-i"]
+        type: file
+        description: Path to input h5mu file
+        direction: input
+        required: true
+        example: input.h5mu
+
+      - name: "--modality"
+        type: string
+        default: "atac"
+        description: Modality on which to run LSI.
+        required: false
+
+      - name: "--layer"
+        type: string
+        description: Use specified layer for expression values. If not specified, uses adata.X.
+        required: false
+
+      - name: "--var_input"
+        type: string
+        description: Name of the column in .var used to select the features on which to run LSI. If not specified, uses all features.
+        required: false
+
+  - name: LSI options
+    arguments:
+      - name: "--num_components"
+        type: integer
+        default: 50
+        description: Number of components to compute.
+        required: false
+        min: 2
+
+      - name: "--scale_embeddings"
+        type: boolean
+        default: true
+        description: Scale embeddings to zero mean and unit variance.
+
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        alternatives: ["-o"]
+        type: file
+        description: Output h5mu file.
+        direction: output
+        required: true
+        example: output.h5mu
+
+      - name: "--output_compression"
+        type: string
+        default: "gzip"
+        description: The compression format to be used on the output h5mu object.
+        choices: ["gzip", "lzf"]
+        required: false
+
+      - name: "--obsm_output"
+        type: string
+        default: "X_lsi"
+        description: In which .obsm slot to store the resulting embedding.
+        required: false
+
+      - name: "--varm_output"
+        type: string
+        default: "lsi"
+        description: In which .varm slot to store the resulting loadings matrix.
+        required: false
+
+      - name: "--uns_output"
+        type: string
+        default: "lsi"
+        description: In which .uns slot to store the stdev.
+        required: false
+
+      - name: "--overwrite"
+        type: boolean_true
+        description: Allow overwriting .obsm, .varm and .uns slots.
+
+    
+resources:
+  - type: python_script
+    path: script.py
+  - path: ../../utils/subset_vars.py
+  - path: /src/utils/setup_logger.py
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: ../../utils/subset_vars.py
+  - path: ../../../resources_test/concat_test_data
+  
+
+engines:
+  - type: docker
+    image: python:3.11-slim
+    setup:
+      - type: apt
+        packages:
+          - procps
+          - pkg-config  # Otherwise h5py installation fails, which is required for scanpy
+          - libhdf5-dev
+          - gcc
+      - type: python
+        __merge__: [../../../src/base/requirements/anndata_mudata.yaml, .]
+        packages:
+          - muon~=0.1.6
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: 
+        - highcpu
+        - highmem
diff --git a/src/dimred/lsi/script.py b/src/dimred/lsi/script.py
@@ -0,0 +1,108 @@
+import muon as mu
+import mudata as md
+from anndata import AnnData
+import numpy as np
+import sys
+
+
+## VIASH START
+par = {
+    "num_components": 50, # number of components to calculate with SVD
+    "scale_embeddings": True, # scale embeddings to zero mean and unit variance
+    "modality": "atac", # on which modality the LSI should be run 
+    "layer": None, # on which layer to run the LSI, if None, will run it on anndata.X 
+    "var_input": None, # column in anndata.var of the highly variable features
+
+    "overwrite": True, 
+    "obsm_output": "X_lsi",
+    "varm_output": "LSI",
+    "uns_output": "lsi",
+    "output": "output.h5mu",
+    "output_compression": "gzip"
+}
+## VIASH END
+
+
+sys.path.append(meta["resources_dir"])
+from subset_vars import subset_vars
+
+
+# START TEMPORARY WORKAROUND setup_logger
+# reason: resources aren't available when using Nextflow fusion
+# from setup_logger import setup_logger
+def setup_logger():
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    logger.addHandler(console_handler)
+
+    return logger
+# END TEMPORARY WORKAROUND setup_logger
+logger = setup_logger()
+
+
+#1.read in mudata
+logger.info("Reading %s.", par["input"])
+mdata = md.read_h5mu(par["input"])
+
+#2. subset on modality
+if par["modality"] not in mdata.mod:
+    raise ValueError(f"Modality '{par['modality']}' was not found in mudata {par['input']}.")
+adata = mdata.mod[par['modality']]
+
+
+#3. Specify layer
+if par['layer'] and par["layer"] not in adata.layers:
+    raise ValueError(f"Layer '{par['layer']}' was not found in modality '{par['modality']}'.")
+layer = adata.X if not par['layer'] else adata.layers[par['layer']]
+adata_input_layer = AnnData(layer, var=adata.var)
+
+
+if not par["layer"]:
+    logger.info("Using modality '%s' and adata.X for LSI computation", par['modality'])
+else:
+    logger.info("Using modality '%s' and layer '%s' for LSI computation", par['modality'], par["layer"])
+
+
+#4. Subset on highly variable features if applicable
+if par["var_input"]:
+    adata_input_layer = subset_vars(adata_input_layer, par["var_input"])
+
+
+
+#5. Run LSI
+logger.info("Computing %s LSI components on %s features", par["num_components"], adata_input_layer.X.shape[1])
+mu.atac.tl.lsi(adata_input_layer, scale_embeddings = par["scale_embeddings"], n_comps = par["num_components"])
+
+
+
+#6. Store output in object
+check_exist_dict = {
+    "obsm_output": ("obsm"),
+    "varm_output": ("varm"),
+    "uns_output": ("uns")
+}
+for parameter_name, field in check_exist_dict.items():
+    if par[parameter_name] in getattr(adata, field):
+        if not par["overwrite"]:
+            raise ValueError(f"Requested to create field {par[parameter_name]} in .{field} "
+                            f"for modality {par['modality']}, but field already exists.")
+        del getattr(adata, field)[par[parameter_name]]
+
+adata.obsm[par["obsm_output"]] = adata_input_layer.obsm['X_lsi']
+adata.uns[par["uns_output"]] = adata_input_layer.uns['lsi']
+if par["var_input"]:
+    adata.varm[par["varm_output"]] = np.zeros(shape=(adata.n_vars, adata_input_layer.varm["LSI"].shape[1]))
+    adata.varm[par["varm_output"]][adata.var[par["var_input"]]] = adata_input_layer.varm['LSI']
+else:
+    adata.varm[par["varm_output"]] = adata_input_layer.varm['LSI']
+
+logger.info("Writing to %s.", par["output"])
+mdata.write(filename = par["output"], compression=par["output_compression"])
+
+logger.info("Finished")
diff --git a/src/dimred/lsi/test.py b/src/dimred/lsi/test.py
diff --git a/src/utils/subset_vars.py b/src/utils/subset_vars.py