Skip to content

Commit dc38e15

Browse files
SarahOuologuemVladimirShitov
authored andcommitted
Add LSI (#552)
Co-authored-by: Vladimir Shitov <[email protected]>
1 parent aa865cb commit dc38e15

File tree

6 files changed

+449
-1
lines changed

6 files changed

+449
-1
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@
117117

118118
* `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863).
119119

120+
* Added `dimred/lsi` component (PR #552).
121+
120122
* `metadata/copy_obs` component: Added a component to copy an .obs column from a MuData object to another (PR #874).
121123

122124
* `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832).

src/authors/sarah_ouologuem.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
name: Sarah Ouologuem
2+
info:
3+
role: Contributor
4+
links:
5+
github: SarahOuologuem
6+
orcid: 0009-0005-3398-1700
7+
organizations:
8+
- name: Helmholtz Munich
9+
href: https://www.helmholtz-munich.de
10+
role: Student Assistant

src/dimred/lsi/config.vsh.yaml

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
name: lsi
2+
namespace: "dimred"
3+
description: |
4+
Runs Latent Semantic Indexing (LSI). Computes cell embeddings, feature loadings and singular values. Uses the implementation from muon (`muon.atac.tl.lsi`), which relies on scipy.
5+
authors:
6+
- __merge__: /src/authors/sarah_ouologuem.yaml
7+
roles: [ contributor ]
8+
- __merge__: /src/authors/vladimir_shitov.yaml
9+
roles: [ contributor ]
10+
argument_groups:
11+
- name: Inputs
12+
arguments:
13+
- name: "--input"
14+
alternatives: ["-i"]
15+
type: file
16+
description: Path to input h5mu file
17+
direction: input
18+
required: true
19+
example: input.h5mu
20+
21+
- name: "--modality"
22+
type: string
23+
default: "atac"
24+
description: Modality on which to run LSI.
25+
required: false
26+
27+
- name: "--layer"
28+
type: string
29+
description: Use specified layer for expression values. If not specified, uses adata.X.
30+
required: false
31+
32+
- name: "--var_input"
33+
type: string
34+
description: Column name in .var matrix that will be used to select which features to run the LSI on. If not specified, uses all features.
35+
required: false
36+
37+
- name: LSI options
38+
arguments:
39+
- name: "--num_components"
40+
type: integer
41+
default: 50
42+
description: Number of components to compute.
43+
required: false
44+
min: 2
45+
46+
- name: "--scale_embeddings"
47+
type: boolean
48+
default: true
49+
description: Scale embeddings to zero mean and unit variance.
50+
51+
- name: Outputs
52+
arguments:
53+
- name: "--output"
54+
alternatives: ["-o"]
55+
type: file
56+
description: Output h5mu file.
57+
direction: output
58+
required: true
59+
example: output.h5mu
60+
61+
- name: "--output_compression"
62+
type: string
63+
default: "gzip"
64+
description: The compression format to be used on the output h5mu object.
65+
choices: ["gzip", "lzf"]
66+
required: false
67+
68+
- name: "--obsm_output"
69+
type: string
70+
default: "X_lsi"
71+
description: In which .obsm slot to store the resulting embedding.
72+
required: false
73+
74+
- name: "--varm_output"
75+
type: string
76+
default: "lsi"
77+
description: In which .varm slot to store the resulting loadings matrix.
78+
required: false
79+
80+
- name: "--uns_output"
81+
type: string
82+
default: "lsi"
83+
description: In which .uns slot to store the stdev.
84+
required: false
85+
86+
- name: "--overwrite"
87+
type: boolean_true
88+
description: Allow overwriting .obsm, .varm and .uns slots.
89+
90+
91+
resources:
92+
- type: python_script
93+
path: script.py
94+
- path: ../../utils/subset_vars.py
95+
- path: /src/utils/setup_logger.py
96+
test_resources:
97+
- type: python_script
98+
path: test.py
99+
- path: ../../utils/subset_vars.py
100+
- path: ../../../resources_test/concat_test_data
101+
102+
103+
engines:
104+
- type: docker
105+
image: python:3.11-slim
106+
setup:
107+
- type: apt
108+
packages:
109+
- procps
110+
- pkg-config # Otherwise h5py installation fails, which is required for scanpy
111+
- libhdf5-dev
112+
- gcc
113+
- type: python
114+
__merge__: [../../../src/base/requirements/anndata_mudata.yaml, .]
115+
packages:
116+
- muon~=0.1.6
117+
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
118+
runners:
119+
- type: executable
120+
- type: nextflow
121+
directives:
122+
label:
123+
- highcpu
124+
- highmem

src/dimred/lsi/script.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import muon as mu
2+
import mudata as md
3+
from anndata import AnnData
4+
import numpy as np
5+
import sys
6+
7+
8+
## VIASH START
# Development defaults for running this script directly. Viash replaces
# everything between the VIASH START/END markers with the real arguments
# when the component is built, so these values are placeholders only.
par = {
    # Inputs
    "input": "input.h5mu",  # path to the input h5mu file
    "modality": "atac",  # on which modality the LSI should be run
    "layer": None,  # on which layer to run the LSI, if None, will run it on anndata.X
    "var_input": None,  # column in anndata.var of the highly variable features
    # LSI options
    "num_components": 50,  # number of components to calculate with SVD
    "scale_embeddings": True,  # scale embeddings to zero mean and unit variance
    # Outputs
    "output": "output.h5mu",
    "output_compression": "gzip",
    "obsm_output": "X_lsi",
    "varm_output": "lsi",  # kept in sync with the default in config.vsh.yaml
    "uns_output": "lsi",
    "overwrite": True,
}
# The script reads meta["resources_dir"] to locate subset_vars.py, which the
# component config lists under src/utils.
meta = {"resources_dir": "src/utils"}
## VIASH END
24+
25+
26+
sys.path.append(meta["resources_dir"])
27+
from subset_vars import subset_vars
28+
29+
30+
# START TEMPORARY WORKAROUND setup_logger
31+
# reason: resources aren't available when using Nextflow fusion
32+
# from setup_logger import setup_logger
33+
def setup_logger():
    """Configure the root logger to emit INFO-level records to stdout.

    Safe to call more than once: a stdout handler is only attached when the
    root logger does not already have one, so log lines are not duplicated
    by repeated calls.

    Returns:
        logging.Logger: the configured root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Look for a handler that already writes to the current stdout stream.
    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        console_handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        )
        logger.addHandler(console_handler)

    return logger
45+
# END TEMPORARY WORKAROUND setup_logger
46+
logger = setup_logger()
47+
48+
49+
# 1. Load the MuData object.
logger.info("Reading %s.", par["input"])
mdata = md.read_h5mu(par["input"])

# 2. Pick the requested modality; fail early when it is absent.
modality_name = par["modality"]
if modality_name not in mdata.mod:
    raise ValueError(f"Modality '{par['modality']}' was not found in mudata {par['input']}.")
adata = mdata.mod[modality_name]

# 3. Resolve the input matrix: the named layer when given, otherwise .X.
layer_name = par["layer"]
if layer_name and layer_name not in adata.layers:
    raise ValueError(f"Layer '{par['layer']}' was not found in modality '{par['modality']}'.")
input_matrix = adata.layers[layer_name] if layer_name else adata.X
# LSI runs on this temporary AnnData; its results are copied into the
# modality further down in the script.
adata_input_layer = AnnData(input_matrix, var=adata.var)

if layer_name:
    logger.info("Using modality '%s' and layer '%s' for LSI computation", par['modality'], par["layer"])
else:
    logger.info("Using modality '%s' and adata.X for LSI computation", par['modality'])

# 4. Optionally restrict the features via subset_vars and the given .var column.
if par["var_input"]:
    adata_input_layer = subset_vars(adata_input_layer, par["var_input"])

# 5. Run LSI on the (possibly subsetted) temporary AnnData.
logger.info("Computing %s LSI components on %s features", par["num_components"], adata_input_layer.X.shape[1])
mu.atac.tl.lsi(
    adata_input_layer,
    scale_embeddings=par["scale_embeddings"],
    n_comps=par["num_components"],
)
81+
82+
83+
84+
# 6. Copy the LSI results into the modality.
# An existing slot is only replaced when overwriting is allowed; otherwise
# the script stops before touching the object.
for par_key, slot_name in (
    ("obsm_output", "obsm"),
    ("varm_output", "varm"),
    ("uns_output", "uns"),
):
    slot = getattr(adata, slot_name)
    if par[par_key] not in slot:
        continue
    if not par["overwrite"]:
        raise ValueError(f"Requested to create field {par[par_key]} in .{slot_name} "
                         f"for modality {par['modality']}, but field already exists.")
    del slot[par[par_key]]

adata.obsm[par["obsm_output"]] = adata_input_layer.obsm['X_lsi']
adata.uns[par["uns_output"]] = adata_input_layer.uns['lsi']

loadings = adata_input_layer.varm['LSI']
if par["var_input"]:
    # LSI only saw the selected features, so expand the loadings to the full
    # feature axis; rows of features that were not selected stay zero.
    full_loadings = np.zeros(shape=(adata.n_vars, loadings.shape[1]))
    full_loadings[adata.var[par["var_input"]]] = loadings
    loadings = full_loadings
adata.varm[par["varm_output"]] = loadings

logger.info("Writing to %s.", par["output"])
mdata.write(filename=par["output"], compression=par["output_compression"])

logger.info("Finished")

0 commit comments

Comments
 (0)