-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d2ef117
commit 8669cb1
Showing
4 changed files
with
360 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
name: calculate_atac_qc_metrics | ||
namespace: "qc" | ||
description: | | ||
Add basic ATAC quality control metrics to an .h5mu file. | ||
The metrics are comparable to what scanpy.pp.calculate_qc_metrics output, | ||
although they have slightly different names: | ||
Obs metrics (name in this component -> name in scanpy): | ||
- n_features_per_cell -> n_genes_by_counts | ||
- total_fragment_counts -> total_counts | ||
authors: | ||
- __merge__: /src/authors/vladimir_shitov.yaml | ||
roles: [ author ] | ||
argument_groups: | ||
- name: Inputs | ||
arguments: | ||
- name: "--input" | ||
type: file | ||
description: Input h5mu file | ||
direction: input | ||
required: true | ||
example: input.h5mu | ||
- name: "--fragments_path" | ||
type: file | ||
description: | | ||
Path to the fragments file. If not provided and not present in the input h5mu file, | ||
the nucleosome signal and TSS enrichment score will not be calculated. | ||
direction: input | ||
required: false | ||
example: fragments.tsv.gz | ||
- name: "--modality" | ||
type: string | ||
default: "atac" | ||
required: false | ||
- name: "--layer" | ||
description: | | ||
Layer at `.layers` to use for the calculation. If not provided, `.X` is used. | ||
type: string | ||
example: "raw_counts" | ||
required: false | ||
- name: "--n_fragments_for_nucleosome_signal" | ||
type: integer | ||
description: | | ||
Number of fragments to use per cell for nucleosome signal calculation. | ||
Takes very long to calculate, for a test run lower value (e.g. 10e3) is recommended. | ||
See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#nucleosome-signal | ||
for more information | ||
default: 10e4 | ||
required: false | ||
- name: "--nuc_signal_threshold" | ||
type: double | ||
description: | | ||
Threshold for nucleosome signal. Cells with nucleosome signal above this threshold | ||
will be marked as low quality ("NS_FAIL"), otherwise they will be marked "NS_PASS". | ||
default: 2 | ||
required: false | ||
- name: "--n_tss" | ||
type: integer | ||
description: | | ||
Number of the transcription start sites to calculate TSS enrichment score. | ||
See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#tss-enrichment | ||
for more information | ||
default: 3000 | ||
required: false | ||
- name: "--tss_threshold" | ||
type: double | ||
description: | | ||
Threshold for TSS enrichment score. Cells with TSS enrichment score below this threshold | ||
will be marked as low quality ("TSS_FAIL") otherwise they will be marked as "TSS_PASS". | ||
default: 1.5 | ||
required: false | ||
- name: Outputs | ||
arguments: | ||
- name: "--output" | ||
type: file | ||
description: Output h5mu file. | ||
direction: output | ||
example: output.h5mu | ||
- name: "--output_compression" | ||
type: string | ||
description: The compression format to be used on the output h5mu object. | ||
choices: ["gzip", "lzf"] | ||
required: false | ||
example: "gzip" | ||
resources: | ||
- type: python_script | ||
path: script.py | ||
- path: /src/utils/setup_logger.py | ||
test_resources: | ||
- type: python_script | ||
path: test.py | ||
- path: /resources_test/cellranger_atac_tiny_bcl/counts/ | ||
engines: | ||
- type: docker | ||
image: python:3.11-slim | ||
setup: | ||
- type: apt | ||
packages: | ||
- procps | ||
- pkg-config # Otherwise h5py installation fails, which is required for scanpy | ||
- libhdf5-dev | ||
- gcc | ||
- type: python | ||
__merge__: [/src/base/requirements/anndata_mudata.yaml, .] | ||
__merge__: [ /src/base/requirements/scanpy.yaml, .] | ||
packages: | ||
- muon~=0.1.5 | ||
- pysam~=0.22.0 | ||
test_setup: | ||
- type: python | ||
__merge__: [ /src/base/requirements/viashpy.yaml, .] | ||
runners: | ||
- type: executable | ||
- type: nextflow | ||
directives: | ||
label: [singlecpu, midmem] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
import sys | ||
|
||
import scanpy as sc | ||
import muon as mu | ||
from muon import atac as ac # the module containing function for scATAC data processing | ||
import numpy as np | ||
|
||
## VIASH START | ||
par = { | ||
"input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", | ||
"fragments_path": None, | ||
"output": "foo.h5mu", | ||
"modality": "atac", | ||
"layer": None, | ||
"n_fragments_for_nucleosome_signal": 10e4, | ||
"n_tss": 3000, | ||
"nuc_signal_threshold": 2, | ||
"tss_threshold": 1.5, | ||
} | ||
## VIASH END | ||
|
||
sys.path.append(meta["resources_dir"]) | ||
# START TEMPORARY WORKAROUND setup_logger | ||
# reason: resources aren't available when using Nextflow fusion | ||
# from setup_logger import setup_logger | ||
def setup_logger(): | ||
import logging | ||
from sys import stdout | ||
|
||
logger = logging.getLogger() | ||
logger.setLevel(logging.INFO) | ||
console_handler = logging.StreamHandler(stdout) | ||
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") | ||
console_handler.setFormatter(logFormatter) | ||
logger.addHandler(console_handler) | ||
|
||
return logger | ||
# END TEMPORARY WORKAROUND setup_logger | ||
logger = setup_logger() | ||
|
||
def main(): | ||
logger.info("Reading input data") | ||
mdata = mu.read(par["input"]) | ||
|
||
atac = mdata.mod[par["modality"]] | ||
|
||
logger.info("Checking if QC columns are already calculated") | ||
for col in ("n_features_per_cell", "total_fragment_counts"): | ||
if col in atac.obs: | ||
logger.warning(f"{col} is already in atac.obs, dropping") | ||
atac.obs.drop(col, axis=1, inplace=True) | ||
|
||
logger.info("Calculating QC metrics") | ||
sc.pp.calculate_qc_metrics(atac, percent_top=None, log1p=False, inplace=True, layer=par["layer"]) | ||
|
||
logger.debug("Putting QC columns to ATAC adata") | ||
atac.obs.rename(columns={"n_genes_by_counts": "n_features_per_cell", "total_counts": "total_fragment_counts"}, inplace=True) | ||
|
||
logger.debug("Adding log-transformed total fragment counts") | ||
# log-transform total counts and add as column | ||
atac.obs["log_total_fragment_counts"] = np.log10(atac.obs["total_fragment_counts"]) | ||
|
||
if par["fragments_path"] is not None: | ||
logger.info("Trying to locate frafments") | ||
ac.tl.locate_fragments(atac, fragments=par["fragments_path"]) | ||
else: | ||
logger.info("Skipping fragment location: `fragments_path` is not set") | ||
|
||
# Calculate the nucleosome signal across cells | ||
if "files" in atac.uns and "fragments" in atac.uns["files"]: | ||
logger.info("Trying to calculate nucleosome signal") | ||
ac.tl.nucleosome_signal(atac, n=par["n_fragments_for_nucleosome_signal"] * atac.n_obs) | ||
atac.obs["nuc_signal_filter"] = [ | ||
"NS_FAIL" if ns > par["nuc_signal_threshold"] else "NS_PASS" | ||
for ns in atac.obs["nucleosome_signal"] | ||
] | ||
else: | ||
logger.info("Skipping nucleosome signal calculation: fragments information is not found") | ||
|
||
# If interval information is available, calculate TSS enrichment | ||
if "peak_annotation" in mdata.mod["atac"].uns.keys(): | ||
tss = ac.tl.tss_enrichment(mdata, features=mdata.mod["atac"].uns["peak_annotation"],n_tss=par["n_tss"], random_state=666) | ||
|
||
tss.obs["tss_filter"] = [ | ||
"TSS_FAIL" if score < par["tss_threshold"] else "TSS_PASS" | ||
for score in atac.obs["tss_score"] | ||
] | ||
else: | ||
logger.info("Skipping TSS enrichment calculation: genes intervals are not found") | ||
|
||
logger.info("Writing output") | ||
mdata.write(par["output"], compression=par["output_compression"]) | ||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
import sys | ||
from pathlib import Path | ||
import pytest | ||
|
||
import mudata as md | ||
import numpy as np | ||
import scanpy as sc | ||
import muon as mu | ||
import pandas as pd | ||
|
||
## VIASH START | ||
meta = { | ||
'executable': './target/docker/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics', | ||
'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/counts/", | ||
'config': './src/qc/calculate_atac_qc_metrics/config.vsh.yaml', | ||
'cpus': 2 | ||
} | ||
## VIASH END | ||
|
||
@pytest.fixture | ||
def synthetic_example(): | ||
atac = sc.AnnData(np.array([ | ||
[0, 0, 0], | ||
[1, 0, 1], | ||
[10, 0, 0], | ||
[100, 0, 1], | ||
[1000, 0, 0] | ||
])) | ||
atac.obs_names = ["A", "B", "C", "D", "E"] | ||
atac.var_names = ["x", "y", "z"] | ||
|
||
return md.MuData({"atac": atac}) | ||
|
||
@pytest.fixture | ||
def example_mudata(tmp_path, synthetic_example): | ||
mdata_path = tmp_path / "example.h5mu" | ||
synthetic_example.write(mdata_path) | ||
|
||
return mdata_path | ||
|
||
@pytest.fixture | ||
def example_mudata_with_layer(tmp_path, synthetic_example): | ||
synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod["atac"].X.copy() | ||
synthetic_example.mod["atac"].X = np.random.normal(size=synthetic_example.mod["atac"].X.shape) | ||
mdata_path = tmp_path / "example.h5mu" | ||
synthetic_example.write(mdata_path) | ||
|
||
return mdata_path | ||
|
||
@pytest.fixture | ||
def neurips_mudata(tmp_path): | ||
"""From the `NeurIPS Multimodal Single-Cell Integration Challenge | ||
<https://www.kaggle.com/competitions/open-problems-multimodal/data>` | ||
Link is taken from the Moscot repository: | ||
https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67 | ||
""" | ||
adata = sc.read("../data/neurips_data.h5ad", backup_url="https://figshare.com/ndownloader/files/37993503") | ||
|
||
mdata = md.MuData({"atac": adata}) | ||
mdata_path = tmp_path / "neurips.h5mu" | ||
mdata.write(mdata_path) | ||
|
||
return mdata_path | ||
|
||
@pytest.fixture | ||
def tiny_atac_mudata(tmp_path): | ||
resources_dir = Path(meta["resources_dir"]) | ||
mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5") | ||
mu.atac.tl.locate_fragments(mdata, fragments=str(resources_dir / "counts" / "fragments.tsv.gz")) | ||
assert "files" in mdata.mod["atac"].uns.keys() | ||
assert "fragments" in mdata.mod["atac"].uns["files"].keys() | ||
|
||
# Read features annotation and save it to uns | ||
peak_annotation = pd.read_csv(resources_dir / "counts" / "peak_annotation.tsv", sep="\t") | ||
peak_annotation.columns = ["Chromosome", "Start", "End", "gene", "distance", "peak_type"] | ||
peak_annotation["gene"] = peak_annotation["gene"].astype(str) # Fixes saving error | ||
mdata.mod["atac"].uns["peak_annotation"] = peak_annotation | ||
|
||
mdata_path = tmp_path / "tiny_atac.h5mu" | ||
mdata.write(mdata_path) | ||
|
||
return mdata_path | ||
|
||
@pytest.mark.parametrize("mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"]) | ||
def test_qc_columns_in_tables(run_component, request, mudata, tmp_path): | ||
input_path = request.getfixturevalue(mudata) | ||
output_path = tmp_path / "foo.h5mu" | ||
|
||
args = [ | ||
"--input", str(input_path), | ||
"--output", str(output_path), | ||
"--modality", "atac", | ||
"--n_fragments_for_nucleosome_signal", "100" | ||
] | ||
|
||
run_component(args) | ||
assert output_path.is_file() | ||
data_with_qc = md.read(output_path) | ||
|
||
for qc_metric in ("n_features_per_cell", "total_fragment_counts", "log_total_fragment_counts"): | ||
assert qc_metric in data_with_qc.mod["atac"].obs | ||
for qc_metric in ("n_cells_by_counts", "mean_counts", "pct_dropout_by_counts", "total_counts"): | ||
assert qc_metric in data_with_qc.mod["atac"].var | ||
|
||
# Check that ATAC-specific metrics are calculated if fragments information is present (for tiny ATAC data) | ||
if "files" in data_with_qc.mod["atac"].uns and "fragments" in data_with_qc.mod["atac"].uns["files"]: | ||
assert "nucleosome_signal" in data_with_qc.mod["atac"].obs | ||
|
||
if "peak_annotation" in data_with_qc.mod["atac"].uns.keys(): | ||
assert "tss_score" in data_with_qc.mod["atac"].obs | ||
|
||
|
||
@pytest.mark.parametrize("mudata", ["example_mudata", "example_mudata_with_layer"]) | ||
def test_calculations_correctness(request, run_component, mudata, tmp_path): | ||
input_path = request.getfixturevalue(mudata) | ||
output_path = tmp_path / "foo.h5mu" | ||
|
||
args = [ | ||
"--input", str(input_path), | ||
"--output", str(output_path), | ||
"--modality", "atac", | ||
"--n_fragments_for_nucleosome_signal", "100" | ||
] | ||
|
||
if mudata == "example_mudata_with_layer": | ||
args.extend(["--layer", "atac_counts"]) | ||
|
||
run_component(args) | ||
assert output_path.is_file() | ||
data_with_qc = md.read(output_path) | ||
|
||
assert np.allclose(data_with_qc.mod["atac"].obs["n_features_per_cell"], [0, 2, 1, 2, 1]) | ||
assert np.allclose(data_with_qc.mod["atac"].obs["total_fragment_counts"], [0, 2, 10, 101, 1000]) | ||
assert np.allclose(data_with_qc.mod["atac"].obs["log_total_fragment_counts"], [-np.inf, np.log10(2), np.log10(10), np.log10(101), np.log10(1000)]) | ||
|
||
assert np.allclose(data_with_qc.mod["atac"].var["n_cells_by_counts"], [4, 0, 2]) | ||
assert np.allclose(data_with_qc.mod["atac"].var["mean_counts"], [222.2, 0, 0.4]) | ||
assert np.allclose(data_with_qc.mod["atac"].var["pct_dropout_by_counts"], [20, 100, 60]) | ||
assert np.allclose(data_with_qc.mod["atac"].var["total_counts"], [1111, 0, 2]) | ||
|
||
|
||
if __name__ == "__main__": | ||
sys.exit(pytest.main([__file__])) |