Skip to content

Commit

Permalink
ATAC QC component (#868)
Browse files Browse the repository at this point in the history
  • Loading branch information
VladimirShitov authored Dec 13, 2024
1 parent d2ef117 commit 8669cb1
Show file tree
Hide file tree
Showing 4 changed files with 360 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

## NEW FUNCTIONALITY

* `qc/calculate_atac_qc_metrics`: new component for calculating ATAC QC metrics (PR #868).

* `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832).

* `workflows/annotation/scgpt_integration_knn` workflow: Cell-type annotation based on scGPT integration with KNN label transfer (PR #875).
Expand Down
118 changes: 118 additions & 0 deletions src/qc/calculate_atac_qc_metrics/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
name: calculate_atac_qc_metrics
namespace: "qc"
description: |
Add basic ATAC quality control metrics to an .h5mu file.
The metrics are comparable to those output by scanpy.pp.calculate_qc_metrics,
although they have slightly different names:
Obs metrics (name in this component -> name in scanpy):
- n_features_per_cell -> n_genes_by_counts
- total_fragment_counts -> total_counts
authors:
- __merge__: /src/authors/vladimir_shitov.yaml
roles: [ author ]
argument_groups:
- name: Inputs
arguments:
- name: "--input"
type: file
description: Input h5mu file
direction: input
required: true
example: input.h5mu
- name: "--fragments_path"
type: file
description: |
Path to the fragments file. If not provided and not present in the input h5mu file,
the nucleosome signal and TSS enrichment score will not be calculated.
direction: input
required: false
example: fragments.tsv.gz
- name: "--modality"
type: string
default: "atac"
required: false
- name: "--layer"
description: |
Layer at `.layers` to use for the calculation. If not provided, `.X` is used.
type: string
example: "raw_counts"
required: false
- name: "--n_fragments_for_nucleosome_signal"
type: integer
description: |
Number of fragments to use per cell for nucleosome signal calculation.
The calculation is very slow; for a test run, a lower value (e.g. 10e3) is recommended.
See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#nucleosome-signal
for more information
default: 10e4
required: false
- name: "--nuc_signal_threshold"
type: double
description: |
Threshold for nucleosome signal. Cells with nucleosome signal above this threshold
will be marked as low quality ("NS_FAIL"), otherwise they will be marked "NS_PASS".
default: 2
required: false
- name: "--n_tss"
type: integer
description: |
Number of transcription start sites used to calculate the TSS enrichment score.
See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#tss-enrichment
for more information
default: 3000
required: false
- name: "--tss_threshold"
type: double
description: |
Threshold for TSS enrichment score. Cells with TSS enrichment score below this threshold
will be marked as low quality ("TSS_FAIL") otherwise they will be marked as "TSS_PASS".
default: 1.5
required: false
- name: Outputs
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_compression"
type: string
description: The compression format to be used on the output h5mu object.
choices: ["gzip", "lzf"]
required: false
example: "gzip"
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/cellranger_atac_tiny_bcl/counts/
engines:
- type: docker
image: python:3.11-slim
setup:
- type: apt
packages:
- procps
- pkg-config # Otherwise h5py installation fails, which is required for scanpy
- libhdf5-dev
- gcc
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, .]
__merge__: [ /src/base/requirements/scanpy.yaml, .]
packages:
- muon~=0.1.5
- pysam~=0.22.0
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [singlecpu, midmem]
95 changes: 95 additions & 0 deletions src/qc/calculate_atac_qc_metrics/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import sys

import scanpy as sc
import muon as mu
from muon import atac as ac # the module containing function for scATAC data processing
import numpy as np

## VIASH START
# Development-time placeholders: everything between the VIASH START/END
# markers is replaced by Viash when the component is built, so these values
# only matter when the script is executed directly.
par = {
    # NOTE(review): the file name suggests a protein dataset -- confirm it
    # actually contains the modality named below before relying on it.
    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
    "fragments_path": None,
    "output": "foo.h5mu",
    "output_compression": None,
    "modality": "atac",
    "layer": None,
    # Declared as an integer argument in the component config, so keep it an int.
    "n_fragments_for_nucleosome_signal": 100_000,
    "n_tss": 3000,
    "nuc_signal_threshold": 2,
    "tss_threshold": 1.5,
}
# `meta` is normally injected by Viash; define it here so that the
# `sys.path.append` below does not fail with a NameError in a direct run.
meta = {"resources_dir": "src/utils"}
## VIASH END

sys.path.append(meta["resources_dir"])
# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
def setup_logger():
    """Configure the root logger to print INFO-level records to stdout.

    A new stream handler is attached on every call.

    Returns:
        The root logger.
    """
    import logging
    import sys as _sys

    formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    stdout_handler = logging.StreamHandler(_sys.stdout)
    stdout_handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(stdout_handler)
    return root_logger
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()

def main():
    """Calculate ATAC QC metrics for one modality of an h5mu file and save it.

    Reads ``par["input"]``, selects the modality named by ``par["modality"]``
    and adds to it:

    * ``n_features_per_cell``, ``total_fragment_counts`` and
      ``log_total_fragment_counts`` in ``.obs`` (renamed scanpy QC columns),
      next to the per-feature columns scanpy writes to ``.var``;
    * ``nucleosome_signal`` and ``nuc_signal_filter`` in ``.obs`` when a
      fragments file is registered in ``.uns["files"]["fragments"]``;
    * ``tss_score`` and ``tss_filter`` in ``.obs`` when, in addition,
      ``.uns["peak_annotation"]`` is present.

    The updated MuData object is written to ``par["output"]``.
    """
    logger.info("Reading input data")
    mdata = mu.read(par["input"])

    atac = mdata.mod[par["modality"]]

    logger.info("Checking if QC columns are already calculated")
    for col in ("n_features_per_cell", "total_fragment_counts"):
        if col in atac.obs:
            logger.warning(f"{col} is already in atac.obs, dropping")
            atac.obs.drop(col, axis=1, inplace=True)

    logger.info("Calculating QC metrics")
    sc.pp.calculate_qc_metrics(
        atac, percent_top=None, log1p=False, inplace=True, layer=par["layer"]
    )

    logger.debug("Putting QC columns to ATAC adata")
    atac.obs.rename(
        columns={
            "n_genes_by_counts": "n_features_per_cell",
            "total_counts": "total_fragment_counts",
        },
        inplace=True,
    )

    logger.debug("Adding log-transformed total fragment counts")
    # Cells without any fragments end up with -inf in this column.
    atac.obs["log_total_fragment_counts"] = np.log10(atac.obs["total_fragment_counts"])

    if par["fragments_path"] is not None:
        logger.info("Trying to locate fragments")
        ac.tl.locate_fragments(atac, fragments=par["fragments_path"])
    else:
        logger.info("Skipping fragment location: `fragments_path` is not set")

    # Both fragment-based metrics below need a located fragments file; it may
    # come from `fragments_path` or already be registered in the input file.
    has_fragments = "files" in atac.uns and "fragments" in atac.uns["files"]

    # Calculate the nucleosome signal across cells
    if has_fragments:
        logger.info("Trying to calculate nucleosome signal")
        ac.tl.nucleosome_signal(
            atac, n=par["n_fragments_for_nucleosome_signal"] * atac.n_obs
        )
        atac.obs["nuc_signal_filter"] = [
            "NS_FAIL" if ns > par["nuc_signal_threshold"] else "NS_PASS"
            for ns in atac.obs["nucleosome_signal"]
        ]
    else:
        logger.info("Skipping nucleosome signal calculation: fragments information is not found")

    # If interval information is available, calculate TSS enrichment
    if not has_fragments:
        # The component contract is to skip (not fail) when fragments are absent.
        logger.info("Skipping TSS enrichment calculation: fragments information is not found")
    elif "peak_annotation" in atac.uns:
        logger.info("Trying to calculate TSS enrichment")
        # Work on the selected modality directly so that `--modality` is
        # honoured. muon stores `tss_score` in `atac.obs`; the pile-up object
        # it returns is not needed here.
        ac.tl.tss_enrichment(
            atac,
            features=atac.uns["peak_annotation"],
            n_tss=par["n_tss"],
            random_state=666,
        )

        # Store the flag on the modality itself so that it is written to the
        # output file (the returned pile-up object is discarded).
        atac.obs["tss_filter"] = [
            "TSS_FAIL" if score < par["tss_threshold"] else "TSS_PASS"
            for score in atac.obs["tss_score"]
        ]
    else:
        logger.info("Skipping TSS enrichment calculation: genes intervals are not found")

    logger.info("Writing output")
    # `.get` keeps the script runnable when the optional argument is absent.
    mdata.write(par["output"], compression=par.get("output_compression"))


if __name__ == "__main__":
    main()
145 changes: 145 additions & 0 deletions src/qc/calculate_atac_qc_metrics/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import sys
from pathlib import Path
import pytest

import mudata as md
import numpy as np
import scanpy as sc
import muon as mu
import pandas as pd

## VIASH START
# Development-time placeholder; Viash replaces this section when the test is
# run through `viash test`.
meta = {
    'executable': './target/docker/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics',
    # The fixtures look for files under `<resources_dir>/counts/`, so this
    # must point at the directory that contains `counts`, not at `counts`.
    'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/",
    'config': './src/qc/calculate_atac_qc_metrics/config.vsh.yaml',
    'cpus': 2
}
## VIASH END

@pytest.fixture
def synthetic_example():
    """Return a MuData object with a single 5 x 3 ``atac`` modality.

    Observations are named A-E and variables x, y, z; the count values are
    small enough for the expected QC metrics to be checked by hand.
    """
    counts = np.array(
        [
            [0, 0, 0],
            [1, 0, 1],
            [10, 0, 0],
            [100, 0, 1],
            [1000, 0, 0],
        ]
    )

    adata = sc.AnnData(counts)
    adata.obs_names = list("ABCDE")
    adata.var_names = list("xyz")

    return md.MuData({"atac": adata})

@pytest.fixture
def example_mudata(tmp_path, synthetic_example):
    """Write the synthetic MuData to a temporary h5mu file and return its path."""
    h5mu_file = tmp_path / "example.h5mu"
    synthetic_example.write(h5mu_file)
    return h5mu_file

@pytest.fixture
def example_mudata_with_layer(tmp_path, synthetic_example):
    """Write a variant of the synthetic MuData whose counts live in a layer.

    The original counts are copied to the ``atac_counts`` layer and ``.X`` is
    overwritten with random values, so correct results can only be obtained
    by reading the layer. Returns the path of the written h5mu file.
    """
    adata = synthetic_example.mod["atac"]
    adata.layers["atac_counts"] = adata.X.copy()
    adata.X = np.random.normal(size=adata.X.shape)

    h5mu_file = tmp_path / "example.h5mu"
    synthetic_example.write(h5mu_file)
    return h5mu_file

@pytest.fixture
def neurips_mudata(tmp_path):
    """Wrap the NeurIPS example dataset in a MuData file and return its path.

    The data come from the `NeurIPS Multimodal Single-Cell Integration Challenge
    <https://www.kaggle.com/competitions/open-problems-multimodal/data>`.
    The download link is taken from the Moscot repository:
    https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67
    """
    download_url = "https://figshare.com/ndownloader/files/37993503"
    adata = sc.read("../data/neurips_data.h5ad", backup_url=download_url)

    h5mu_file = tmp_path / "neurips.h5mu"
    md.MuData({"atac": adata}).write(h5mu_file)
    return h5mu_file

@pytest.fixture
def tiny_atac_mudata(tmp_path):
    """Build an h5mu file from the tiny Cell Ranger ATAC test resources.

    The peak matrix is loaded, the fragments file is registered on the
    ``atac`` modality and the peak annotation table is stored in
    ``.uns["peak_annotation"]``. Returns the path of the written file.
    """
    counts_dir = Path(meta["resources_dir"]) / "counts"

    mdata = mu.read_10x_h5(counts_dir / "filtered_peak_bc_matrix.h5")
    mu.atac.tl.locate_fragments(mdata, fragments=str(counts_dir / "fragments.tsv.gz"))

    atac = mdata.mod["atac"]
    assert "files" in atac.uns.keys()
    assert "fragments" in atac.uns["files"].keys()

    # Read features annotation and save it to uns
    annotation = pd.read_csv(counts_dir / "peak_annotation.tsv", sep="\t")
    annotation.columns = ["Chromosome", "Start", "End", "gene", "distance", "peak_type"]
    annotation["gene"] = annotation["gene"].astype(str)  # Fixes saving error
    atac.uns["peak_annotation"] = annotation

    h5mu_file = tmp_path / "tiny_atac.h5mu"
    mdata.write(h5mu_file)
    return h5mu_file

@pytest.mark.parametrize("mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"])
def test_qc_columns_in_tables(run_component, request, mudata, tmp_path):
    """Run the component on each dataset and check the expected QC columns exist."""
    input_path = request.getfixturevalue(mudata)
    output_path = tmp_path / "foo.h5mu"

    run_component(
        [
            "--input", str(input_path),
            "--output", str(output_path),
            "--modality", "atac",
            "--n_fragments_for_nucleosome_signal", "100",
        ]
    )
    assert output_path.is_file()
    atac = md.read(output_path).mod["atac"]

    expected_obs_columns = (
        "n_features_per_cell",
        "total_fragment_counts",
        "log_total_fragment_counts",
    )
    expected_var_columns = (
        "n_cells_by_counts",
        "mean_counts",
        "pct_dropout_by_counts",
        "total_counts",
    )
    for column in expected_obs_columns:
        assert column in atac.obs
    for column in expected_var_columns:
        assert column in atac.var

    # Check that ATAC-specific metrics are calculated if fragments information is present (for tiny ATAC data)
    fragments_located = "files" in atac.uns and "fragments" in atac.uns["files"]
    if fragments_located:
        assert "nucleosome_signal" in atac.obs

    if "peak_annotation" in atac.uns.keys():
        assert "tss_score" in atac.obs


@pytest.mark.parametrize("mudata", ["example_mudata", "example_mudata_with_layer"])
def test_calculations_correctness(request, run_component, mudata, tmp_path):
    """Check the computed QC values against hand-calculated expectations.

    The layer variant passes ``--layer atac_counts`` and must give the same
    numbers as the variant that reads the counts from ``.X``.
    """
    input_path = request.getfixturevalue(mudata)
    output_path = tmp_path / "foo.h5mu"

    layer_args = ["--layer", "atac_counts"] if mudata == "example_mudata_with_layer" else []
    run_component(
        [
            "--input", str(input_path),
            "--output", str(output_path),
            "--modality", "atac",
            "--n_fragments_for_nucleosome_signal", "100",
        ]
        + layer_args
    )
    assert output_path.is_file()
    atac = md.read(output_path).mod["atac"]

    # Per-cell metrics; the empty cell "A" has log10(0) == -inf.
    expected_obs = {
        "n_features_per_cell": [0, 2, 1, 2, 1],
        "total_fragment_counts": [0, 2, 10, 101, 1000],
        "log_total_fragment_counts": [
            -np.inf,
            np.log10(2),
            np.log10(10),
            np.log10(101),
            np.log10(1000),
        ],
    }
    for column, expected in expected_obs.items():
        assert np.allclose(atac.obs[column], expected)

    # Per-feature metrics.
    expected_var = {
        "n_cells_by_counts": [4, 0, 2],
        "mean_counts": [222.2, 0, 0.4],
        "pct_dropout_by_counts": [20, 100, 60],
        "total_counts": [1111, 0, 2],
    }
    for column, expected in expected_var.items():
        assert np.allclose(atac.var[column], expected)


if __name__ == "__main__":
    sys.exit(pytest.main([__file__]))

0 comments on commit 8669cb1

Please sign in to comment.