Skip to content

Commit

Permalink
Update scvelo (#932)
Browse files Browse the repository at this point in the history
  • Loading branch information
DriesSchaumont authored Dec 13, 2024
1 parent 8669cb1 commit cd4674f
Show file tree
Hide file tree
Showing 6 changed files with 173 additions and 44 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@

## BREAKING CHANGES

* `velocity/scvelo`: update `scvelo` to `0.3.3`, which also removes support for using `loom` input files. The component now uses a `MuData` object as input. Several arguments were added to support selecting different inputs from the MuData file: `counts_layer`, `modality`, `layer_spliced`, `layer_unspliced`, `layer_ambiguous`. An `output_h5mu` argument has been added (PR #932).

* `src/annotate/onclass` and `src/annotate/celltypist`: The input parameters for gene name layers of input datasets have been updated to `--input_var_gene_names` and `--reference_var_gene_names` (PR #919).

* Several components under `src/scgpt` (`cross_check_genes`, `tokenize_pad`, `binning`) now process the input (query) datasets differently. Instead of subsetting datasets based on genes in the model vocabulary and/or highly variable genes, these components require an input .var column with a boolean mask specifying this information. The results are written back to the original input data, preserving the dataset structure (PR #832).

## NEW FUNCTIONALITY

* `velocyto_to_h5mu`: now writes counts to `.X` (PR #932).

* `qc/calculate_atac_qc_metrics`: new component for calculating ATAC QC metrics (PR #868).

* `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832).
Expand Down
12 changes: 9 additions & 3 deletions resources_test_scripts/rna_velocity.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#!/bin/bash

set -eo pipefail

# ensure that the command below is run from the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"
Expand All @@ -19,7 +17,7 @@ mkdir -p "$velocyto_dir"
# Create a compatible BAM file from BD Rhapsody Output #
########################################################

bd_rhap_wta_bam="resources_test/bdrhap_5kjrt/processed/WTA.bd_rhapsody.output_raw/sample_final.BAM"
bd_rhap_wta_bam="resources_test/bdrhap_5kjrt/processed/output_raw/Combined_sample_Bioproduct.bam"

if [[ ! -f "$bd_rhap_wta_bam" ]]; then
echo "$bd_rhap_wta_bam does not exist. Please generate BD Rhapsody test data first."
Expand Down Expand Up @@ -52,3 +50,11 @@ viash run src/velocity/velocyto/config.vsh.yaml -- \
-i "$bam" \
-o "$OUT/velocyto_processed/cellranger_tiny.loom" \
--transcriptome "$gtf"

echo "> Converting loom file to MuData object"
viash run src/velocity/velocyto_to_h5mu/config.vsh.yaml -- \
--input_loom "$OUT/velocyto_processed/cellranger_tiny.loom" \
--input_h5mu "resources_test/cellranger_tiny_fastq/raw_dataset.h5mu" \
--modality velocyto \
--output_compression "gzip" \
--output "$OUT/velocyto_processed/velocyto.h5mu"
40 changes: 31 additions & 9 deletions src/velocity/scvelo/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,42 @@ argument_groups:
- name: Inputs
arguments:
- name: "--input"
description: "Input MuData file"
type: file
direction: input
description: "Velocyto loom file."
required: true
- name: "--counts_layer"
type: string
description: "Name of the counts layer, if not specified, X is used."
required: false
- name: "--modality"
description: Input modality
required: true
type: string
- name: "--layer_spliced"
type: string
required: false
default: "spliced"
- name: "--layer_unspliced"
type: string
required: false
default: "unspliced"
- name: "--layer_ambiguous"
type: string
required: false
default: "ambiguous"
- name: Outputs
arguments:
- name: "--output"
required: true
type: file
direction: output
description: "Output directory. If it does not exist, will be created."
- name: "--output_h5mu"
required: true
type: file
direction: output
description: "Output mudata file."
- name: "--output_compression"
type: string
description: The compression format to be used on the output h5mu object.
Expand Down Expand Up @@ -69,10 +94,11 @@ resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
- path: /src/utils/compress_h5mu.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/rna_velocity/velocyto_processed/cellranger_tiny.loom
- path: /resources_test/rna_velocity/velocyto_processed/velocyto.h5mu
engines:
- type: docker
image: python:3.12-slim
Expand All @@ -82,15 +108,11 @@ engines:
- procps
- git
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, .]
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
packages:
- scvelo[vi]~=0.3.2
- scvelo~=0.3.3
- scipy~=1.14.1
- scanpy~=1.9.8

test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
Expand Down
107 changes: 83 additions & 24 deletions src/velocity/scvelo/script.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,11 @@
import sys
import mudata
import anndata
import tempfile
import shutil
from contextlib import redirect_stdout
from pathlib import Path
import matplotlib as mpl

# Backwards compatibility for numpy 2.0
import numpy

numpy_module = sys.modules["numpy"]
numpy_module.float_ = numpy.float64
sys.modules["numpy"] = numpy_module

# Backwards compatibility for scipy
import scipy # noqa: F401

scipy_module = sys.modules["scipy"]
scipy_module.sparse._base._spbase.A = property(lambda self: self.toarray())

sys.modules["scipy"] = scipy_module

import scvelo

## VIASH START
Expand All @@ -32,16 +19,24 @@ def none_factory():
par = defaultdict(
none_factory,
{
"input": "./resources_test/rna_velocity/velocyto_processed/cellranger_tiny.loom",
"input": "resources_test/rna_velocity/velocyto_processed/velocyto.h5mu",
"modality": "velocyto",
"output": "./foo",
"output_h5mu": "output.h5mu",
"log_transform": True,
"n_neighbors": 30,
"layer_spliced": "velo_spliced",
"layer_unspliced": "velo_unspliced",
"layer_ambiguous": "velo_ambiguous",
},
)

meta = {"resources_dir": "src/utils", "temp_dir": "/tmp/"}
## VIASH END

sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
from compress_h5mu import compress_h5mu

logger = setup_logger()

Expand All @@ -56,12 +51,32 @@ def main():
output_dir.mkdir(parents=True, exist_ok=True)
scvelo.settings.figdir = str(output_dir)

# Load the input data
adata_in = mudata.read_h5ad(par["input"], mod=par["modality"])

# Create a copy of the data as input
# as many scvelo functions do not take input layer arguments
layers_mapping = {
"spliced": par["layer_spliced"],
"unspliced": par["layer_unspliced"],
"ambiguous": par["layer_ambiguous"],
}
layer_data = {
default: (adata_in.layers.get(arg_val) if arg_val else adata_in.layers[default])
for default, arg_val in layers_mapping.items()
}
adata = anndata.AnnData(
X=adata_in.X
if not par["counts_layer"]
else adata_in.layers[par["counts_layer"]],
layers=layer_data,
)

# Calculate the sample name
sample_name = par["output"].removesuffix(".loom")
sample_name = par["output"].removesuffix(".h5mu")
sample_name = Path(sample_name).name

# Read the input data
adata = scvelo.read(par["input"])

# Save spliced vs unspliced proportions to file
with (output_dir / "proportions.txt").open("w") as target:
Expand Down Expand Up @@ -107,11 +122,55 @@ def main():
adata, save=str(output_dir / "scvelo_embedding.pdf"), show=False
)

# Create output
ouput_data = mudata.MuData({"rna_velocity": adata})
ouput_data.write_h5mu(
output_dir / f"{sample_name}.h5mu", compression=par["output_compression"]
)
# Copy over slots to output
for slot in ("obs", "var"):
setattr(
adata_in,
slot,
getattr(adata_in, slot)
.assign(**getattr(adata, slot).to_dict())
.convert_dtypes(),
)
items_per_slot = {
"uns": (
"recover_dynamics",
"velocity_params",
"velocity_graph",
"velocity_graph_neg",
),
"varm": ("loss",),
"obsm": ("velocity_pca",),
"layers": (
"Ms",
"Mu",
"fit_t",
"fit_tau",
"fit_tau_",
"velocity",
"velocity_u",
),
}
for dict_slot, dict_items in items_per_slot.items():
setattr(
adata_in,
dict_slot,
dict(
getattr(adata_in, dict_slot),
**{key_: getattr(adata, dict_slot)[key_] for key_ in dict_items},
),
)
with tempfile.NamedTemporaryFile(
suffix=".h5mu", delete_on_close=False
) as temp_h5mu:
shutil.copyfile(par["input"], temp_h5mu.name)
# Create output
mudata.write_h5ad(temp_h5mu.name, mod=par["modality"], data=adata_in)
compression = par["output_compression"]

if compression:
compress_h5mu(temp_h5mu.name, par["output_h5mu"], compression=compression)
else:
shutil.move(temp_h5mu.name, par["output_h5mu"])


if __name__ == "__main__":
Expand Down
51 changes: 44 additions & 7 deletions src/velocity/scvelo/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,34 @@

## VIASH START
meta = {
"name": "./target/executable/projection/scvelo/scvelo",
"resources_dir": "./resources_test/",
"config": "src/velocity/scvelo/config.vsh.yaml",
"executable": "./target/executable/velocity/scvelo/scvelo",
"resources_dir": "resources_test/rna_velocity/velocyto_processed/",
}
## VIASH END

input_loom = f"{meta['resources_dir']}/cellranger_tiny.loom"
input_h5mu = f"{meta['resources_dir']}/velocyto.h5mu"


def test_scvelo(run_component, tmp_path):
output_dir = tmp_path / "foo"
output_h5mu = tmp_path / "output.h5mu"
run_component(
[
"--input",
input_loom,
input_h5mu,
"--modality",
"velocyto",
"--output",
str(output_dir),
"--output_h5mu",
output_h5mu,
"--layer_spliced",
"velo_spliced",
"--layer_unspliced",
"velo_unspliced",
"--layer_ambiguous",
"velo_ambiguous",
"--output_compression",
"gzip",
]
Expand All @@ -30,10 +42,35 @@ def test_scvelo(run_component, tmp_path):
assert (output_dir / "scvelo_embedding.pdf").is_file()
assert (output_dir / "scvelo_graph.pdf").is_file()
assert (output_dir / "proportions.txt").is_file()
assert (output_dir / "foo.h5mu").is_file()
assert output_h5mu.is_file()

output_data = read_h5mu(output_dir / "foo.h5mu")
assert "rna_velocity" in output_data.mod.keys()
output_data = read_h5mu(output_h5mu)
items_per_slot = {
"uns": (
"recover_dynamics",
"velocity_params",
"velocity_graph",
"velocity_graph_neg",
),
"varm": ("loss",),
"obsm": ("velocity_pca",),
"layers": (
"Ms",
"Mu",
"fit_t",
"fit_tau",
"fit_tau_",
"velocity",
"velocity_u",
),
}
for dict_slot, dict_items in items_per_slot.items():
for dict_item in dict_items:
assert (
getattr(output_data.mod["velocyto"], dict_slot).get(dict_item)
is not None
), f"Expected {dict_item} to be present in {dict_slot}"
del getattr(output_data.mod["velocyto"], dict_slot)[dict_item]


if __name__ == "__main__":
Expand Down
3 changes: 2 additions & 1 deletion src/velocity/velocyto_to_h5mu/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
## VIASH START
par = {
"input_loom": "resources_test/rna_velocity/velocyto_processed/cellranger_tiny.loom",
"input_h5mu": "/home/rcannood/workspace/openpipelines-bio/openpipeline/resources_test/cellranger_tiny_fastq/raw_dataset.h5mu",
"input_h5mu": "resources_test/cellranger_tiny_fastq/raw_dataset.h5mu",
"modality": "rna_velocity",
"output": "output.h5mu",
"layer_spliced": "velo_spliced",
Expand All @@ -28,6 +28,7 @@

print("Creating clean AnnData", flush=True)
adata = ad.AnnData(
X=adata_in.X,
obs=adata_in.obs[[]],
var=adata_in.var[[]],
layers={
Expand Down

0 comments on commit cd4674f

Please sign in to comment.