diff --git a/CHANGELOG.md b/CHANGELOG.md
index 551a1571197..b360f7540db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@
 
 ## NEW FUNCTIONALITY
 
+* `qc/calculate_atac_qc_metrics`: new component for calculating ATAC QC metrics (PR #868).
+
 * `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832).
 
 * `workflows/annotation/scgpt_integration_knn` workflow: Cell-type annotation based on scGPT integration with KNN label transfer (PR #875).
diff --git a/src/qc/calculate_atac_qc_metrics/config.vsh.yaml b/src/qc/calculate_atac_qc_metrics/config.vsh.yaml
new file mode 100644
index 00000000000..9fde2e2e925
--- /dev/null
+++ b/src/qc/calculate_atac_qc_metrics/config.vsh.yaml
@@ -0,0 +1,118 @@
+name: calculate_atac_qc_metrics
+namespace: "qc"
+description: |
+  Add basic ATAC quality control metrics to an .h5mu file.
+
+  The metrics are comparable to what scanpy.pp.calculate_qc_metrics output,
+  although they have slightly different names:
+
+  Obs metrics (name in this component -> name in scanpy):
+    - n_features_per_cell -> n_genes_by_counts
+    - total_fragment_counts -> total_counts
+
+authors:
+  - __merge__: /src/authors/vladimir_shitov.yaml
+    roles: [ author ]
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input"
+        type: file
+        description: Input h5mu file
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--fragments_path"
+        type: file
+        description: |
+          Path to the fragments file. If not provided and not present in the input h5mu file,
+          the nucleosome signal and TSS enrichment score will not be calculated.
+        direction: input
+        required: false
+        example: fragments.tsv.gz
+      - name: "--modality"
+        type: string
+        description: Name of the modality in the input h5mu file to calculate the QC metrics for.
+        default: "atac"
+        required: false
+      - name: "--layer"
+        description: |
+          Layer at `.layers` to use for the calculation. If not provided, `.X` is used.
+        type: string
+        example: "raw_counts"
+        required: false
+      - name: "--n_fragments_for_nucleosome_signal"
+        type: integer
+        description: |
+          Number of fragments to use per cell for nucleosome signal calculation.
+          Takes very long to calculate, for a test run lower value (e.g. 10000) is recommended.
+          See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#nucleosome-signal
+          for more information
+        default: 100000
+        required: false
+      - name: "--nuc_signal_threshold"
+        type: double
+        description: |
+          Threshold for nucleosome signal. Cells with nucleosome signal above this threshold
+          will be marked as low quality ("NS_FAIL"), otherwise they will be marked "NS_PASS".
+        default: 2
+        required: false
+      - name: "--n_tss"
+        type: integer
+        description: |
+          Number of the transcription start sites to calculate TSS enrichment score.
+          See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#tss-enrichment
+          for more information
+        default: 3000
+        required: false
+      - name: "--tss_threshold"
+        type: double
+        description: |
+          Threshold for TSS enrichment score. Cells with TSS enrichment score below this threshold
+          will be marked as low quality ("TSS_FAIL") otherwise they will be marked as "TSS_PASS".
+        default: 1.5
+        required: false
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_compression"
+        type: string
+        description: The compression format to be used on the output h5mu object.
+        choices: ["gzip", "lzf"]
+        required: false
+        example: "gzip"
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/cellranger_atac_tiny_bcl/counts/
+engines:
+  - type: docker
+    image: python:3.11-slim
+    setup:
+      - type: apt
+        packages:
+          - procps
+          - pkg-config  # Otherwise h5py installation fails, which is required for scanpy
+          - libhdf5-dev
+          - gcc
+      - type: python
+        __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
+        packages:
+          - muon~=0.1.5
+          - pysam~=0.22.0
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/viashpy.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [singlecpu, midmem]
diff --git a/src/qc/calculate_atac_qc_metrics/script.py b/src/qc/calculate_atac_qc_metrics/script.py
new file mode 100644
index 00000000000..822970d9c71
--- /dev/null
+++ b/src/qc/calculate_atac_qc_metrics/script.py
@@ -0,0 +1,111 @@
+import sys
+
+import scanpy as sc
+import muon as mu
+from muon import atac as ac  # the module containing functions for scATAC data processing
+import numpy as np
+
+## VIASH START
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+    "fragments_path": None,
+    "output": "foo.h5mu",
+    "output_compression": None,
+    "modality": "atac",
+    "layer": None,
+    "n_fragments_for_nucleosome_signal": 100000,
+    "n_tss": 3000,
+    "nuc_signal_threshold": 2,
+    "tss_threshold": 1.5,
+}
+meta = {"resources_dir": "src/utils/"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+# START TEMPORARY WORKAROUND setup_logger
+# reason: resources aren't available when using Nextflow fusion
+# from setup_logger import setup_logger
+def setup_logger():
+    """Attach a stdout handler to the root logger, set it to INFO and return it."""
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    logger.addHandler(console_handler)
+
+    return logger
+# END TEMPORARY WORKAROUND setup_logger
+logger = setup_logger()
+
+def main():
+    """Add ATAC QC metrics to the selected modality and write the result.
+
+    Count-based metrics are always added to `.obs` and `.var`. The nucleosome
+    signal and the TSS enrichment score (with their pass/fail flags) are only
+    added when fragments information is available; TSS enrichment additionally
+    needs `.uns["peak_annotation"]`.
+    """
+    logger.info("Reading input data")
+    mdata = mu.read(par["input"])
+
+    atac = mdata.mod[par["modality"]]
+
+    logger.info("Checking if QC columns are already calculated")
+    for col in ("n_features_per_cell", "total_fragment_counts"):
+        if col in atac.obs:
+            logger.warning(f"{col} is already in atac.obs, dropping")
+            atac.obs.drop(col, axis=1, inplace=True)
+
+    logger.info("Calculating QC metrics")
+    sc.pp.calculate_qc_metrics(atac, percent_top=None, log1p=False, inplace=True, layer=par["layer"])
+
+    logger.debug("Putting QC columns to ATAC adata")
+    atac.obs.rename(columns={"n_genes_by_counts": "n_features_per_cell", "total_counts": "total_fragment_counts"}, inplace=True)
+
+    logger.debug("Adding log-transformed total fragment counts")
+    # log-transform total counts and add as column
+    atac.obs["log_total_fragment_counts"] = np.log10(atac.obs["total_fragment_counts"])
+
+    if par["fragments_path"] is not None:
+        logger.info("Trying to locate fragments")
+        ac.tl.locate_fragments(atac, fragments=par["fragments_path"])
+    else:
+        logger.info("Skipping fragment location: `fragments_path` is not set")
+
+    # The component description states that both metrics below are skipped without fragments
+    has_fragments = "files" in atac.uns and "fragments" in atac.uns["files"]
+
+    # Calculate the nucleosome signal across cells
+    if has_fragments:
+        logger.info("Trying to calculate nucleosome signal")
+        # The parameter is a per-cell number, so scale it by the number of cells
+        ac.tl.nucleosome_signal(atac, n=int(par["n_fragments_for_nucleosome_signal"]) * atac.n_obs)
+        atac.obs["nuc_signal_filter"] = [
+            "NS_FAIL" if ns > par["nuc_signal_threshold"] else "NS_PASS"
+            for ns in atac.obs["nucleosome_signal"]
+        ]
+    else:
+        logger.info("Skipping nucleosome signal calculation: fragments information is not found")
+
+    # If fragments and interval information are available, calculate TSS enrichment
+    if has_fragments and "peak_annotation" in atac.uns:
+        # Pass the AnnData itself so that the modality chosen with `--modality` is used
+        ac.tl.tss_enrichment(atac, features=atac.uns["peak_annotation"], n_tss=par["n_tss"], random_state=666)
+
+        # Store the flag in the modality's `.obs` so that it is written to the output
+        atac.obs["tss_filter"] = [
+            "TSS_FAIL" if score < par["tss_threshold"] else "TSS_PASS"
+            for score in atac.obs["tss_score"]
+        ]
+    else:
+        logger.info("Skipping TSS enrichment calculation: fragments or genes intervals are not found")
+
+    logger.info("Writing output")
+    mdata.write(par["output"], compression=par["output_compression"])
+
+if __name__ == "__main__":
+    main()
diff --git a/src/qc/calculate_atac_qc_metrics/test.py b/src/qc/calculate_atac_qc_metrics/test.py
new file mode 100644
index 00000000000..9844100c611
--- /dev/null
+++ b/src/qc/calculate_atac_qc_metrics/test.py
@@ -0,0 +1,155 @@
+import sys
+from pathlib import Path
+import pytest
+
+import mudata as md
+import numpy as np
+import scanpy as sc
+import muon as mu
+import pandas as pd
+
+## VIASH START
+meta = {
+    'executable': './target/docker/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics',
+    # Parent of `counts/`: the fixtures below append "counts" themselves
+    'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/",
+    'config': './src/qc/calculate_atac_qc_metrics/config.vsh.yaml',
+    'cpus': 2
+}
+## VIASH END
+
+@pytest.fixture
+def synthetic_example():
+    """Return a MuData with one "atac" modality of 5 observations and 3 variables."""
+    atac = sc.AnnData(np.array([
+        [0, 0, 0],
+        [1, 0, 1],
+        [10, 0, 0],
+        [100, 0, 1],
+        [1000, 0, 0]
+    ]))
+    atac.obs_names = ["A", "B", "C", "D", "E"]
+    atac.var_names = ["x", "y", "z"]
+
+    return md.MuData({"atac": atac})
+
+@pytest.fixture
+def example_mudata(tmp_path, synthetic_example):
+    """Write the synthetic example to a temporary .h5mu file and return its path."""
+    mdata_path = tmp_path / "example.h5mu"
+    synthetic_example.write(mdata_path)
+
+    return mdata_path
+
+@pytest.fixture
+def example_mudata_with_layer(tmp_path, synthetic_example):
+    """Like `example_mudata`, but the counts live in a layer and `.X` holds random values."""
+    synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod["atac"].X.copy()
+    synthetic_example.mod["atac"].X = np.random.normal(size=synthetic_example.mod["atac"].X.shape)
+    mdata_path = tmp_path / "example.h5mu"
+    synthetic_example.write(mdata_path)
+
+    return mdata_path
+
+@pytest.fixture
+def neurips_mudata(tmp_path):
+    """From the NeurIPS Multimodal Single-Cell Integration Challenge.
+
+    Link is taken from the Moscot repository:
+    https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67
+
+    """
+    adata = sc.read("../data/neurips_data.h5ad", backup_url="https://figshare.com/ndownloader/files/37993503")
+
+    mdata = md.MuData({"atac": adata})
+    mdata_path = tmp_path / "neurips.h5mu"
+    mdata.write(mdata_path)
+
+    return mdata_path
+
+@pytest.fixture
+def tiny_atac_mudata(tmp_path):
+    """Write an .h5mu built from the 10x counts under `counts/`, with fragments and peak annotation attached."""
+    resources_dir = Path(meta["resources_dir"])
+    mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5")
+    mu.atac.tl.locate_fragments(mdata, fragments=str(resources_dir / "counts" / "fragments.tsv.gz"))
+    assert "files" in mdata.mod["atac"].uns.keys()
+    assert "fragments" in mdata.mod["atac"].uns["files"].keys()
+
+    # Read features annotation and save it to uns
+    peak_annotation = pd.read_csv(resources_dir / "counts" / "peak_annotation.tsv", sep="\t")
+    peak_annotation.columns = ["Chromosome", "Start", "End", "gene", "distance", "peak_type"]
+    peak_annotation["gene"] = peak_annotation["gene"].astype(str)  # Fixes saving error
+    mdata.mod["atac"].uns["peak_annotation"] = peak_annotation
+
+    mdata_path = tmp_path / "tiny_atac.h5mu"
+    mdata.write(mdata_path)
+
+    return mdata_path
+
+@pytest.mark.parametrize("mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"])
+def test_qc_columns_in_tables(run_component, request, mudata, tmp_path):
+    """The component adds the expected QC columns to `.obs` and `.var`."""
+    input_path = request.getfixturevalue(mudata)
+    output_path = tmp_path / "foo.h5mu"
+
+    args = [
+        "--input", str(input_path),
+        "--output", str(output_path),
+        "--modality", "atac",
+        "--n_fragments_for_nucleosome_signal", "100"
+    ]
+
+    run_component(args)
+    assert output_path.is_file()
+    data_with_qc = md.read(output_path)
+    atac = data_with_qc.mod["atac"]
+
+    for qc_metric in ("n_features_per_cell", "total_fragment_counts", "log_total_fragment_counts"):
+        assert qc_metric in atac.obs
+    for qc_metric in ("n_cells_by_counts", "mean_counts", "pct_dropout_by_counts", "total_counts"):
+        assert qc_metric in atac.var
+
+    # Check that ATAC-specific metrics are calculated if fragments information is present (for tiny ATAC data)
+    has_fragments = "files" in atac.uns and "fragments" in atac.uns["files"]
+    if has_fragments:
+        assert "nucleosome_signal" in atac.obs
+        assert set(atac.obs["nuc_signal_filter"]) <= {"NS_PASS", "NS_FAIL"}
+
+    if has_fragments and "peak_annotation" in atac.uns:
+        assert "tss_score" in atac.obs
+        assert set(atac.obs["tss_filter"]) <= {"TSS_PASS", "TSS_FAIL"}
+
+
+@pytest.mark.parametrize("mudata", ["example_mudata", "example_mudata_with_layer"])
+def test_calculations_correctness(request, run_component, mudata, tmp_path):
+    """QC values match hand-computed results, both from `.X` and from a layer."""
+    input_path = request.getfixturevalue(mudata)
+    output_path = tmp_path / "foo.h5mu"
+
+    args = [
+        "--input", str(input_path),
+        "--output", str(output_path),
+        "--modality", "atac",
+        "--n_fragments_for_nucleosome_signal", "100"
+    ]
+
+    if mudata == "example_mudata_with_layer":
+        args.extend(["--layer", "atac_counts"])
+
+    run_component(args)
+    assert output_path.is_file()
+    data_with_qc = md.read(output_path)
+
+    assert np.allclose(data_with_qc.mod["atac"].obs["n_features_per_cell"], [0, 2, 1, 2, 1])
+    assert np.allclose(data_with_qc.mod["atac"].obs["total_fragment_counts"], [0, 2, 10, 101, 1000])
+    assert np.allclose(data_with_qc.mod["atac"].obs["log_total_fragment_counts"], [-np.inf, np.log10(2), np.log10(10), np.log10(101), np.log10(1000)])
+
+    assert np.allclose(data_with_qc.mod["atac"].var["n_cells_by_counts"], [4, 0, 2])
+    assert np.allclose(data_with_qc.mod["atac"].var["mean_counts"], [222.2, 0, 0.4])
+    assert np.allclose(data_with_qc.mod["atac"].var["pct_dropout_by_counts"], [20, 100, 60])
+    assert np.allclose(data_with_qc.mod["atac"].var["total_counts"], [1111, 0, 2])
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))