Skip to content

Commit

Permalink
Add LSI (#552)
Browse files Browse the repository at this point in the history
Co-authored-by: Vladimir Shitov <[email protected]>
  • Loading branch information
2 people authored and dorien-er committed Nov 18, 2024
1 parent aa865cb commit dc38e15
Show file tree
Hide file tree
Showing 6 changed files with 449 additions and 1 deletion.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@

* `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863).

* Added `dimred/lsi` component (PR #552).

* `metadata/copy_obs` component: Added a component to copy an .obs column from a MuData object to another (PR #874).

* `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832).
Expand Down
10 changes: 10 additions & 0 deletions src/authors/sarah_ouologuem.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
name: Sarah Ouologuem
info:
role: Contributor
links:
github: SarahOuologuem
orcid: 0009-0005-3398-1700
organizations:
- name: Helmholtz Munich
href: https://www.helmholtz-munich.de
role: Student Assistant
124 changes: 124 additions & 0 deletions src/dimred/lsi/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
name: lsi
namespace: "dimred"
description: |
Runs Latent Semantic Indexing. Computes cell embeddings, feature loadings and singular values, using the scipy implementation.
authors:
- __merge__: /src/authors/sarah_ouologuem.yaml
roles: [ contributor ]
- __merge__: /src/authors/vladimir_shitov.yaml
roles: [ contributor ]
argument_groups:
- name: Inputs
arguments:
- name: "--input"
alternatives: ["-i"]
type: file
description: Path to input h5mu file
direction: input
required: true
example: input.h5mu

- name: "--modality"
type: string
default: "atac"
description: Which modality to run LSI on.
required: false

- name: "--layer"
type: string
description: Use specified layer for expression values. If not specified, uses adata.X.
required: false

- name: "--var_input"
type: string
description: Column name in .var matrix that will be used to select which genes to run the LSI on. If not specified, uses all features.
required: false

- name: LSI options
arguments:
- name: "--num_components"
type: integer
default: 50
description: Number of components to compute.
required: false
min: 2

- name: "--scale_embeddings"
type: boolean
default: true
description: Scale embeddings to zero mean and unit variance.

- name: Outputs
arguments:
- name: "--output"
alternatives: ["-o"]
type: file
description: Output h5mu file.
direction: output
required: true
example: output.h5mu

- name: "--output_compression"
type: string
default: "gzip"
description: The compression format to be used on the output h5mu object.
choices: ["gzip", "lzf"]
required: false

- name: "--obsm_output"
type: string
default: "X_lsi"
description: In which .obsm slot to store the resulting embedding.
required: false

- name: "--varm_output"
type: string
default: "lsi"
description: In which .varm slot to store the resulting loadings matrix.
required: false

- name: "--uns_output"
type: string
default: "lsi"
description: In which .uns slot to store the stdev.
required: false

- name: "--overwrite"
type: boolean_true
description: Allow overwriting .obsm, .varm and .uns slots.


resources:
- type: python_script
path: script.py
- path: ../../utils/subset_vars.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: ../../utils/subset_vars.py
- path: ../../../resources_test/concat_test_data


engines:
- type: docker
image: python:3.11-slim
setup:
- type: apt
packages:
- procps
- pkg-config # Otherwise h5py installation fails, which is required for scanpy
- libhdf5-dev
- gcc
- type: python
__merge__: [../../../src/base/requirements/anndata_mudata.yaml, .]
packages:
- muon~=0.1.6
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label:
- highcpu
- highmem
108 changes: 108 additions & 0 deletions src/dimred/lsi/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import muon as mu
import mudata as md
from anndata import AnnData
import numpy as np
import sys


## VIASH START
# Fallback parameter values for interactive development only; at build time
# Viash replaces everything between the VIASH START/END markers with the
# parsed command-line arguments declared in config.vsh.yaml.
par = {
    "num_components": 50,         # number of components to calculate with SVD
    "scale_embeddings": True,     # scale embeddings to zero mean and unit variance
    "modality": "atac",           # on which modality the LSI should be run
    "layer": None,                # on which layer to run the LSI, if None, will run it on anndata.X
    "var_input": None,            # column in anndata.var of the highly variable features

    "overwrite": True,            # allow overwriting existing .obsm/.varm/.uns slots
    "obsm_output": "X_lsi",       # .obsm slot that will hold the cell embeddings
    "varm_output": "LSI",         # .varm slot that will hold the feature loadings
    "uns_output": "lsi",          # .uns slot that will hold the stdev
    "output": "output.h5mu",      # path of the output h5mu file
    "output_compression": "gzip"  # compression format for the output file
}
## VIASH END


# Make Viash-staged resources (subset_vars.py) importable at runtime
sys.path.append(meta["resources_dir"])
from subset_vars import subset_vars


# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
def setup_logger():
    """Configure the root logger to emit INFO-level records to stdout and return it."""
    import logging
    from sys import stdout

    root = logging.getLogger()
    root.setLevel(logging.INFO)

    handler = logging.StreamHandler(stdout)
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    )
    root.addHandler(handler)

    return root
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()


# 1. Read the input MuData object
logger.info("Reading %s.", par["input"])
mdata = md.read_h5mu(par["input"])

# 2. Select the requested modality, failing early if it is absent
if par["modality"] not in mdata.mod:
    raise ValueError(f"Modality '{par['modality']}' was not found in mudata {par['input']}.")
adata = mdata.mod[par['modality']]

# 3. Pick the expression matrix: a named layer if given, otherwise .X
if par['layer'] and par["layer"] not in adata.layers:
    raise ValueError(f"Layer '{par['layer']}' was not found in modality '{par['modality']}'.")
layer = adata.layers[par['layer']] if par['layer'] else adata.X
# Work on a lightweight copy so the original modality is untouched until step 6
adata_input_layer = AnnData(layer, var=adata.var)

if par["layer"]:
    logger.info("Using modality '%s' and layer '%s' for LSI computation", par['modality'], par["layer"])
else:
    logger.info("Using modality '%s' and adata.X for LSI computation", par['modality'])

# 4. Restrict to highly variable features when a .var column is provided
if par["var_input"]:
    adata_input_layer = subset_vars(adata_input_layer, par["var_input"])

# 5. Run LSI (muon stores results in .obsm['X_lsi'], .varm['LSI'] and .uns['lsi'])
logger.info("Computing %s LSI components on %s features", par["num_components"], adata_input_layer.X.shape[1])
mu.atac.tl.lsi(adata_input_layer, scale_embeddings=par["scale_embeddings"], n_comps=par["num_components"])



# 6. Copy results back into the original modality, guarding existing slots
slot_by_param = {
    "obsm_output": "obsm",
    "varm_output": "varm",
    "uns_output": "uns",
}
for param_key, slot in slot_by_param.items():
    target = getattr(adata, slot)
    if par[param_key] in target:
        if not par["overwrite"]:
            raise ValueError(f"Requested to create field {par[param_key]} in .{slot} "
                             f"for modality {par['modality']}, but field already exists.")
        # --overwrite was given: drop the existing entry before re-assigning
        del target[par[param_key]]

adata.obsm[par["obsm_output"]] = adata_input_layer.obsm['X_lsi']
adata.uns[par["uns_output"]] = adata_input_layer.uns['lsi']
if par["var_input"]:
    # LSI ran on a feature subset: scatter the loadings back into the full
    # feature space, leaving non-selected rows at zero.
    full_loadings = np.zeros(shape=(adata.n_vars, adata_input_layer.varm["LSI"].shape[1]))
    full_loadings[adata.var[par["var_input"]]] = adata_input_layer.varm['LSI']
    adata.varm[par["varm_output"]] = full_loadings
else:
    adata.varm[par["varm_output"]] = adata_input_layer.varm['LSI']

logger.info("Writing to %s.", par["output"])
mdata.write(filename=par["output"], compression=par["output_compression"])

logger.info("Finished")
Loading

0 comments on commit dc38e15

Please sign in to comment.