Skip to content

Commit 8669cb1

Browse files
ATAC QC component (#868)
1 parent d2ef117 commit 8669cb1

File tree

4 files changed

+360
-0
lines changed

4 files changed

+360
-0
lines changed

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
## NEW FUNCTIONALITY
1010

11+
* `qc/calculate_atac_qc_metrics`: new component for calculating ATAC QC metrics (PR #868).
12+
1113
* `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832).
1214

1315
* `workflows/annotation/scgpt_integration_knn` workflow: Cell-type annotation based on scGPT integration with KNN label transfer (PR #875).
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
name: calculate_atac_qc_metrics
2+
namespace: "qc"
3+
description: |
4+
Add basic ATAC quality control metrics to an .h5mu file.
5+
6+
The metrics are comparable to what scanpy.pp.calculate_qc_metrics outputs,
7+
although they have slightly different names:
8+
9+
Obs metrics (name in this component -> name in scanpy):
10+
- n_features_per_cell -> n_genes_by_counts
11+
- total_fragment_counts -> total_counts
12+
13+
authors:
14+
- __merge__: /src/authors/vladimir_shitov.yaml
15+
roles: [ author ]
16+
argument_groups:
17+
- name: Inputs
18+
arguments:
19+
- name: "--input"
20+
type: file
21+
description: Input h5mu file
22+
direction: input
23+
required: true
24+
example: input.h5mu
25+
- name: "--fragments_path"
26+
type: file
27+
description: |
28+
Path to the fragments file. If not provided and not present in the input h5mu file,
29+
the nucleosome signal and TSS enrichment score will not be calculated.
30+
direction: input
31+
required: false
32+
example: fragments.tsv.gz
33+
- name: "--modality"
34+
type: string
35+
default: "atac"
36+
required: false
37+
- name: "--layer"
38+
description: |
39+
Layer at `.layers` to use for the calculation. If not provided, `.X` is used.
40+
type: string
41+
example: "raw_counts"
42+
required: false
43+
- name: "--n_fragments_for_nucleosome_signal"
44+
type: integer
45+
description: |
46+
Number of fragments to use per cell for nucleosome signal calculation.
47+
The calculation takes very long; for a test run, a lower value (e.g. 10e3) is recommended.
48+
See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#nucleosome-signal
49+
for more information
50+
default: 10e4
51+
required: false
52+
- name: "--nuc_signal_threshold"
53+
type: double
54+
description: |
55+
Threshold for nucleosome signal. Cells with nucleosome signal above this threshold
56+
will be marked as low quality ("NS_FAIL"), otherwise they will be marked "NS_PASS".
57+
default: 2
58+
required: false
59+
- name: "--n_tss"
60+
type: integer
61+
description: |
62+
Number of the transcription start sites to calculate TSS enrichment score.
63+
See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#tss-enrichment
64+
for more information
65+
default: 3000
66+
required: false
67+
- name: "--tss_threshold"
68+
type: double
69+
description: |
70+
Threshold for TSS enrichment score. Cells with TSS enrichment score below this threshold
71+
will be marked as low quality ("TSS_FAIL") otherwise they will be marked as "TSS_PASS".
72+
default: 1.5
73+
required: false
74+
- name: Outputs
75+
arguments:
76+
- name: "--output"
77+
type: file
78+
description: Output h5mu file.
79+
direction: output
80+
example: output.h5mu
81+
- name: "--output_compression"
82+
type: string
83+
description: The compression format to be used on the output h5mu object.
84+
choices: ["gzip", "lzf"]
85+
required: false
86+
example: "gzip"
87+
resources:
88+
- type: python_script
89+
path: script.py
90+
- path: /src/utils/setup_logger.py
91+
test_resources:
92+
- type: python_script
93+
path: test.py
94+
- path: /resources_test/cellranger_atac_tiny_bcl/counts/
95+
engines:
96+
- type: docker
97+
image: python:3.11-slim
98+
setup:
99+
- type: apt
100+
packages:
101+
- procps
102+
- pkg-config # Otherwise h5py installation fails, which is required for scanpy
103+
- libhdf5-dev
104+
- gcc
105+
- type: python
  # A YAML mapping may contain each key only once, so both shared requirement
  # files are listed in a single `__merge__` entry.
  __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
  packages:
    - muon~=0.1.5
    - pysam~=0.22.0
111+
test_setup:
112+
- type: python
113+
__merge__: [ /src/base/requirements/viashpy.yaml, .]
114+
runners:
115+
- type: executable
116+
- type: nextflow
117+
directives:
118+
label: [singlecpu, midmem]
+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import sys
2+
3+
import scanpy as sc
4+
import muon as mu
5+
from muon import atac as ac # the module containing function for scATAC data processing
6+
import numpy as np
7+
8+
## VIASH START
9+
# Placeholder arguments used when the script is run directly during development.
par = {
    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
    "fragments_path": None,
    "output": "foo.h5mu",
    "modality": "atac",
    "layer": None,
    "n_fragments_for_nucleosome_signal": 10e4,
    "n_tss": 3000,
    "nuc_signal_threshold": 2,
    "tss_threshold": 1.5,
    # main() passes this value to mdata.write(compression=...), so the key
    # has to exist in the placeholder as well.
    "output_compression": None,
}
# The script reads meta["resources_dir"] immediately after this block.
# NOTE(review): the directory is assumed to be the one holding
# setup_logger.py -- confirm against the component config.
meta = {"resources_dir": "src/utils"}
20+
## VIASH END
21+
22+
sys.path.append(meta["resources_dir"])
23+
# START TEMPORARY WORKAROUND setup_logger
24+
# reason: resources aren't available when using Nextflow fusion
25+
# from setup_logger import setup_logger
26+
def setup_logger():
    """Attach a stdout handler to the root logger and return that logger.

    The root logger level is set to INFO and records are formatted as
    "<time> <level, padded to 8 chars> <message>".
    """
    import logging
    import sys

    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    )

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(stream_handler)
    return root_logger
38+
# END TEMPORARY WORKAROUND setup_logger
39+
logger = setup_logger()
40+
41+
def main():
    """Compute ATAC QC metrics for the selected modality and write the result.

    Steps:
      1. Read the input MuData and select the modality named by
         ``par["modality"]``.
      2. Run ``sc.pp.calculate_qc_metrics`` and rename its obs columns to
         ``n_features_per_cell`` / ``total_fragment_counts``; add
         ``log_total_fragment_counts``.
      3. If a fragments file is registered in ``.uns["files"]["fragments"]``
         (optionally located from ``par["fragments_path"]``), compute the
         nucleosome signal and flag cells as ``NS_FAIL`` / ``NS_PASS``.
      4. If ``.uns["peak_annotation"]`` is present, compute the TSS enrichment
         score and flag cells as ``TSS_FAIL`` / ``TSS_PASS``.
      5. Write the MuData to ``par["output"]``.
    """
    logger.info("Reading input data")
    mdata = mu.read(par["input"])

    atac = mdata.mod[par["modality"]]

    logger.info("Checking if QC columns are already calculated")
    for col in ("n_features_per_cell", "total_fragment_counts"):
        if col in atac.obs:
            logger.warning(f"{col} is already in atac.obs, dropping")
            atac.obs.drop(col, axis=1, inplace=True)

    logger.info("Calculating QC metrics")
    sc.pp.calculate_qc_metrics(
        atac, percent_top=None, log1p=False, inplace=True, layer=par["layer"]
    )

    logger.debug("Putting QC columns to ATAC adata")
    atac.obs.rename(
        columns={
            "n_genes_by_counts": "n_features_per_cell",
            "total_counts": "total_fragment_counts",
        },
        inplace=True,
    )

    logger.debug("Adding log-transformed total fragment counts")
    # Cells with zero counts get -inf here (log10(0)).
    atac.obs["log_total_fragment_counts"] = np.log10(atac.obs["total_fragment_counts"])

    if par["fragments_path"] is not None:
        logger.info("Trying to locate fragments")
        ac.tl.locate_fragments(atac, fragments=par["fragments_path"])
    else:
        logger.info("Skipping fragment location: `fragments_path` is not set")

    # Calculate the nucleosome signal across cells
    if "files" in atac.uns and "fragments" in atac.uns["files"]:
        logger.info("Trying to calculate nucleosome signal")
        ac.tl.nucleosome_signal(
            atac, n=par["n_fragments_for_nucleosome_signal"] * atac.n_obs
        )
        atac.obs["nuc_signal_filter"] = [
            "NS_FAIL" if ns > par["nuc_signal_threshold"] else "NS_PASS"
            for ns in atac.obs["nucleosome_signal"]
        ]
    else:
        logger.info("Skipping nucleosome signal calculation: fragments information is not found")

    # If interval information is available, calculate TSS enrichment.
    # The selected modality is used throughout (not a hard-coded "atac"), so
    # the component also works when `--modality` has a different name.
    if "peak_annotation" in atac.uns:
        logger.info("Trying to calculate TSS enrichment")
        # The call stores `tss_score` in atac.obs; the returned pileup object
        # is not needed here.
        ac.tl.tss_enrichment(
            atac,
            features=atac.uns["peak_annotation"],
            n_tss=par["n_tss"],
            random_state=666,
        )

        # Store the flag on the modality itself so it ends up in the output
        # file (writing it to the returned pileup object would discard it).
        atac.obs["tss_filter"] = [
            "TSS_FAIL" if score < par["tss_threshold"] else "TSS_PASS"
            for score in atac.obs["tss_score"]
        ]
    else:
        logger.info("Skipping TSS enrichment calculation: genes intervals are not found")

    logger.info("Writing output")
    # .get() keeps the script usable when the optional key is absent.
    mdata.write(par["output"], compression=par.get("output_compression"))
93+
94+
if __name__ == "__main__":
95+
main()
+145
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import sys
2+
from pathlib import Path
3+
import pytest
4+
5+
import mudata as md
6+
import numpy as np
7+
import scanpy as sc
8+
import muon as mu
9+
import pandas as pd
10+
11+
## VIASH START
12+
# Placeholder metadata used when the test file is run directly.
meta = {
    'executable': './target/docker/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics',
    # Parent of the `counts` directory: the tiny_atac_mudata fixture resolves
    # its inputs as resources_dir / "counts" / <file>, so the placeholder must
    # not already end in `counts/`.
    'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/",
    'config': './src/qc/calculate_atac_qc_metrics/config.vsh.yaml',
    'cpus': 2
}
18+
## VIASH END
19+
20+
@pytest.fixture
def synthetic_example():
    """Return a MuData with one "atac" modality of 5 cells (A-E) x 3 features (x-z)."""
    counts = np.array(
        [
            [0, 0, 0],
            [1, 0, 1],
            [10, 0, 0],
            [100, 0, 1],
            [1000, 0, 0],
        ]
    )
    adata = sc.AnnData(counts)
    adata.obs_names = list("ABCDE")
    adata.var_names = list("xyz")

    return md.MuData({"atac": adata})
33+
34+
@pytest.fixture
def example_mudata(tmp_path, synthetic_example):
    """Persist the synthetic MuData under tmp_path and return the file path."""
    h5mu_file = tmp_path / "example.h5mu"
    synthetic_example.write(h5mu_file)
    return h5mu_file
40+
41+
@pytest.fixture
def example_mudata_with_layer(tmp_path, synthetic_example):
    """Write a variant of the synthetic MuData whose counts live in a layer.

    The original counts are copied to the "atac_counts" layer and `.X` is
    overwritten with random normal values of the same shape.
    """
    atac = synthetic_example.mod["atac"]
    atac.layers["atac_counts"] = atac.X.copy()
    atac.X = np.random.normal(size=atac.X.shape)

    h5mu_file = tmp_path / "example.h5mu"
    synthetic_example.write(h5mu_file)
    return h5mu_file
49+
50+
@pytest.fixture
def neurips_mudata(tmp_path):
    """Wrap the NeurIPS multimodal challenge AnnData as an "atac" MuData file.

    Data source: `NeurIPS Multimodal Single-Cell Integration Challenge
    <https://www.kaggle.com/competitions/open-problems-multimodal/data>`

    The download link is taken from the Moscot repository:
    https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67
    """
    h5mu_file = tmp_path / "neurips.h5mu"

    neurips_adata = sc.read(
        "../data/neurips_data.h5ad",
        backup_url="https://figshare.com/ndownloader/files/37993503",
    )
    md.MuData({"atac": neurips_adata}).write(h5mu_file)

    return h5mu_file
66+
67+
@pytest.fixture
def tiny_atac_mudata(tmp_path):
    """Build a MuData from the tiny Cell Ranger ATAC counts and return its path.

    The fragments file is registered on the "atac" modality and the peak
    annotation table is stored in its `.uns["peak_annotation"]`.
    """
    counts_dir = Path(meta["resources_dir"]) / "counts"

    mdata = mu.read_10x_h5(counts_dir / "filtered_peak_bc_matrix.h5")
    mu.atac.tl.locate_fragments(mdata, fragments=str(counts_dir / "fragments.tsv.gz"))

    atac_uns = mdata.mod["atac"].uns
    assert "files" in atac_uns.keys()
    assert "fragments" in atac_uns["files"].keys()

    # Read features annotation and save it to uns
    annotation = pd.read_csv(counts_dir / "peak_annotation.tsv", sep="\t")
    annotation.columns = ["Chromosome", "Start", "End", "gene", "distance", "peak_type"]
    # Casting to str fixes a saving error
    annotation["gene"] = annotation["gene"].astype(str)
    mdata.mod["atac"].uns["peak_annotation"] = annotation

    h5mu_file = tmp_path / "tiny_atac.h5mu"
    mdata.write(h5mu_file)
    return h5mu_file
85+
86+
@pytest.mark.parametrize("mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"])
def test_qc_columns_in_tables(run_component, request, mudata, tmp_path):
    """Check that the component adds the expected QC columns to obs and var."""
    source = request.getfixturevalue(mudata)
    destination = tmp_path / "foo.h5mu"

    run_component(
        [
            "--input", str(source),
            "--output", str(destination),
            "--modality", "atac",
            "--n_fragments_for_nucleosome_signal", "100",
        ]
    )
    assert destination.is_file()
    data_with_qc = md.read(destination)
    atac = data_with_qc.mod["atac"]

    expected_obs = ("n_features_per_cell", "total_fragment_counts", "log_total_fragment_counts")
    expected_var = ("n_cells_by_counts", "mean_counts", "pct_dropout_by_counts", "total_counts")
    for qc_metric in expected_obs:
        assert qc_metric in atac.obs
    for qc_metric in expected_var:
        assert qc_metric in atac.var

    # Check that ATAC-specific metrics are calculated if fragments information is present (for tiny ATAC data)
    if "files" in atac.uns:
        if "fragments" in atac.uns["files"]:
            assert "nucleosome_signal" in atac.obs

    if "peak_annotation" in atac.uns.keys():
        assert "tss_score" in atac.obs
113+
114+
115+
@pytest.mark.parametrize("mudata", ["example_mudata", "example_mudata_with_layer"])
def test_calculations_correctness(request, run_component, mudata, tmp_path):
    """Compare the computed QC values for the synthetic data with known results."""
    source = request.getfixturevalue(mudata)
    destination = tmp_path / "foo.h5mu"

    cli_args = [
        "--input", str(source),
        "--output", str(destination),
        "--modality", "atac",
        "--n_fragments_for_nucleosome_signal", "100",
    ]
    # The layer variant keeps the real counts in a layer instead of .X
    if mudata == "example_mudata_with_layer":
        cli_args += ["--layer", "atac_counts"]

    run_component(cli_args)
    assert destination.is_file()
    data_with_qc = md.read(destination)
    obs = data_with_qc.mod["atac"].obs
    var = data_with_qc.mod["atac"].var

    assert np.allclose(obs["n_features_per_cell"], [0, 2, 1, 2, 1])
    assert np.allclose(obs["total_fragment_counts"], [0, 2, 10, 101, 1000])
    assert np.allclose(
        obs["log_total_fragment_counts"],
        [-np.inf, np.log10(2), np.log10(10), np.log10(101), np.log10(1000)],
    )

    assert np.allclose(var["n_cells_by_counts"], [4, 0, 2])
    assert np.allclose(var["mean_counts"], [222.2, 0, 0.4])
    assert np.allclose(var["pct_dropout_by_counts"], [20, 100, 60])
    assert np.allclose(var["total_counts"], [1111, 0, 2])
142+
143+
144+
if __name__ == "__main__":
145+
sys.exit(pytest.main([__file__]))

0 commit comments

Comments
 (0)