
Commit

feat: add validation during pipeline run (#203)
* feat: added validation smk

* feat: added validation scripts

* feat: included validation rules

* feat: moved validations into main rules

Validations are now in the same files as the annotate and filter rules

* fix: creating sentinel

* fix: removed validation.smk

* fix: missing 'output'

* fix: corrected errors in validation scripts

* feat: added combined wc constraint

* feat: ignoring multiple viewpoint check for tiled
alsmith151 authored Sep 12, 2023
1 parent ae32a81 commit 2d6517e
Showing 6 changed files with 175 additions and 2 deletions.
6 changes: 4 additions & 2 deletions capcruncher/pipeline/workflow/Snakefile
@@ -55,7 +55,9 @@ BIN_SIZES = capcruncher.pipeline.utils.get_bin_sizes(config)
HIGH_NUMBER_OF_VIEWPOINTS = capcruncher.pipeline.utils.has_high_viewpoint_number(
VIEWPOINTS, config
)

IGNORE_MULTIPLE_FRAGMENTS_PER_VIEWPOINT = config["analysis"].get(
"ignore_multiple_fragments_per_viewpoint", False
)

# Details
SUMMARY_METHODS = [
@@ -80,7 +82,6 @@ PERFORM_BINNING = capcruncher.pipeline.utils.can_perform_binning(config)
ASSAY = config["analysis"]["method"]
SAMPLE_NAMES = FASTQ_SAMPLES.sample_names_all

# Optional

## Check if capcruncher_tools is installed
if importlib.util.find_spec("capcruncher_tools"):
@@ -109,6 +110,7 @@ wildcard_constraints:
sample="|".join(SAMPLE_NAMES),
part=r"\d+",
viewpoint="|".join(VIEWPOINT_NAMES),
combined="|".join(["flashed", "pe"]),


rule all:
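The new combined wildcard constraint is a plain regex alternation, so only the literal values flashed and pe can fill that slot in output paths. A minimal sketch of the matching behaviour (illustrative only, not pipeline code; Snakemake embeds the constraint in the path regex, so the wildcard must match it exactly):

import re

combined_constraint = "|".join(["flashed", "pe"])  # -> "flashed|pe"

assert re.fullmatch(combined_constraint, "flashed")
assert re.fullmatch(combined_constraint, "pe")
assert re.fullmatch(combined_constraint, "unflashed") is None  # rejected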
18 changes: 18 additions & 0 deletions capcruncher/pipeline/workflow/rules/annotate.smk
@@ -18,11 +18,29 @@ rule exclusions:
"""


rule check_n_bins_per_viewpoint:
input:
bins=rules.digest_genome.output.bed,
viewpoints=config["analysis"]["viewpoints"],
output:
sentinel="capcruncher_output/resources/validation/check_n_bins_per_viewpoint.sentinel",
n_bins_per_viewpoint="capcruncher_output/resources/validation/n_bins_per_viewpoint.tsv",
params:
ignore_multiple_bins_per_viewpoint=IGNORE_MULTIPLE_FRAGMENTS_PER_VIEWPOINT
if ASSAY in ["capture", "tri"]
else True,
log:
"capcruncher_output/logs/validation/check_n_bins_per_viewpoint.log",
script:
"../scripts/validation_check_n_bins_per_viewpoint.py"


rule annotate:
input:
bam=rules.align_bowtie2.output.bam,
exclusions="capcruncher_output/interim/annotate/exclude.bed",
viewpoints=config["analysis"]["viewpoints"],
single_bin_per_viewpoint=rules.check_n_bins_per_viewpoint.output.sentinel,
output:
annotated=temp(
"capcruncher_output/interim/annotate/{sample}/{sample}_part{part}_{combined}.parquet"
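The sentinel output added here acts as a pass/fail flag: the validation script only creates the file when the check succeeds (or when the failure is explicitly ignored), and because rule annotate now lists the sentinel as an input, annotation cannot start until the viewpoint check has passed. A minimal sketch of that pattern, with a hypothetical path and check (not pipeline code):

import pathlib

def validate_and_touch(check_passed: bool, sentinel: str) -> None:
    # Raising here leaves the sentinel absent, so any rule that requires it as
    # an input can never be scheduled; touching it unblocks downstream rules.
    if not check_passed:
        raise ValueError("validation failed")
    path = pathlib.Path(sentinel)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.touch()

validate_and_touch(check_passed=True, sentinel="validation/example_check.sentinel")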
23 changes: 23 additions & 0 deletions capcruncher/pipeline/workflow/rules/filter.smk
@@ -12,10 +12,33 @@ def get_filtered_slices(wildcards):
return slices


def get_annotated_slices(wildcards):
slices = dict()
for combined_type in ["flashed", "pe"]:
parts = get_rebalanced_parts(wildcards, combined=combined_type)
slices[combined_type] = [
f"capcruncher_output/interim/annotate/{wildcards.sample}/{wildcards.sample}_part{part}_{combined_type}.parquet"
for part in parts
]
return [*slices["flashed"], *slices["pe"]]


rule check_viewpoints_annotated:
input:
slices=get_annotated_slices,
viewpoints=config["analysis"]["viewpoints"],
output:
sentinel="capcruncher_output/resources/validation/{sample}.check_viewpoints.sentinel",
viewpoints_present="capcruncher_output/resources/validation/{sample}.annotated_viewpoints_present.tsv",
script:
"../scripts/validation_confirm_annotated_viewpoints_present.py"


rule filter_alignments:
input:
bam=rules.align_bowtie2.output.bam,
annotations=rules.annotate.output.annotated,
all_viewpoints_present=rules.check_viewpoints_annotated.output.sentinel,
output:
filtered_slices=temp(
"capcruncher_output/interim/filtering/initial/{sample}/{sample}_part{part}_{combined}.slices.parquet"
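For orientation, the get_annotated_slices helper above expands, per sample, into the annotated parquet path for every rebalanced part of both read types. A purely illustrative example of the returned list for a hypothetical sample "S1" with parts 0 and 1 (the real part numbers come from get_rebalanced_parts):

expected_inputs = [
    "capcruncher_output/interim/annotate/S1/S1_part0_flashed.parquet",
    "capcruncher_output/interim/annotate/S1/S1_part1_flashed.parquet",
    "capcruncher_output/interim/annotate/S1/S1_part0_pe.parquet",
    "capcruncher_output/interim/annotate/S1/S1_part1_pe.parquet",
]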
86 changes: 86 additions & 0 deletions capcruncher/pipeline/workflow/scripts/validation_check_n_bins_per_viewpoint.py
@@ -0,0 +1,86 @@
"""
Aim: Check that there is only one restriction fragment per viewpoint.
"""

import pandas as pd
import numpy as np
import pyranges as pr
import polars as pl
import tabulate
import pathlib
from loguru import logger


with logger.catch():
logger.info("Checking that there is only one restriction fragment per viewpoint")

bins = snakemake.input.bins
viewpoints = snakemake.input.viewpoints

df_bins = pl.read_csv(
bins,
separator="\t",
has_header=False,
new_columns=["Chromosome", "Start", "End", "Name"],
)
gr_bins = pr.PyRanges(df_bins.to_pandas())

df_viewpoints = pl.read_csv(
viewpoints,
separator="\t",
has_header=False,
new_columns=["Chromosome", "Start", "End", "Name"],
)
gr_viewpoints = pr.PyRanges(df_viewpoints.to_pandas())

# Generate a table with the number of restriction fragments overlapped by each viewpoint
gr_bin_vp_overlap = gr_bins.join(gr_viewpoints, suffix="_viewpoints")
bin_vp_counts = gr_bin_vp_overlap.df["Name_viewpoints"].value_counts()
has_multiple_bins = bin_vp_counts > 1

df_viewpoints = df_viewpoints.to_pandas()
df_viewpoints["n_restriction_fragments_overlapped"] = 1
df_viewpoints = df_viewpoints.set_index("Name")
df_viewpoints.loc[
has_multiple_bins, "n_restriction_fragments_overlapped"
] = bin_vp_counts[has_multiple_bins].values
    df_viewpoints = df_viewpoints.reset_index().rename(columns={"Name": "Viewpoint"})
df_viewpoints.to_csv(snakemake.output.n_bins_per_viewpoint, sep="\t", index=False)

    if (
        has_multiple_bins.any()
        and not snakemake.params.ignore_multiple_bins_per_viewpoint
    ):
        # Report only the viewpoints that overlap more than one restriction fragment
        df_multiple = df_viewpoints[
            df_viewpoints["n_restriction_fragments_overlapped"] > 1
        ]
        tbl = tabulate.tabulate(
            df_multiple, headers="keys", tablefmt="psql", showindex=False
        )

        raise ValueError(
            f"The following viewpoints overlap multiple restriction fragments:\n{tbl}\n"
        )

    else:
        pathlib.Path(snakemake.output.sentinel).touch()
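To make the fragment-counting step concrete, here is a small self-contained toy example of the same PyRanges join / value_counts pattern used above (made-up coordinates; a viewpoint spanning two restriction fragments gets a count of 2 and would trigger the error unless the check is ignored):

import pandas as pd
import pyranges as pr

# Two restriction fragments; viewpoint "vp2" straddles the boundary between them.
gr_bins = pr.PyRanges(pd.DataFrame({
    "Chromosome": ["chr1", "chr1"],
    "Start": [0, 1000],
    "End": [1000, 2000],
    "Name": ["frag1", "frag2"],
}))
gr_viewpoints = pr.PyRanges(pd.DataFrame({
    "Chromosome": ["chr1", "chr1"],
    "Start": [100, 900],
    "End": [200, 1100],
    "Name": ["vp1", "vp2"],
}))

overlap = gr_bins.join(gr_viewpoints, suffix="_viewpoints")
counts = overlap.df["Name_viewpoints"].value_counts()
print(counts)  # vp2 -> 2 fragments, vp1 -> 1 fragment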
38 changes: 38 additions & 0 deletions capcruncher/pipeline/workflow/scripts/validation_confirm_annotated_viewpoints_present.py
@@ -0,0 +1,38 @@
"""
Aim: Ensure that all viewpoints are found in the annotated slices.
"""

import pandas as pd
import numpy as np
import pyranges as pr
import polars as pl
import tabulate
import pathlib

slices = snakemake.input.slices
viewpoints = snakemake.input.viewpoints
gr_viewpoints = pr.read_bed(viewpoints)

with pl.StringCache():
vp_counts = []
for pq in slices:
df = pl.read_parquet(pq, columns=["capture"])
vp_counts.append(df["capture"].value_counts())

df_counts = (
pl.concat(vp_counts).groupby("capture").agg(pl.sum("counts")).to_pandas()
)


df_counts.rename(columns={"counts": "n_slices"}).to_csv(
snakemake.output.viewpoints_present, sep="\t", index=True
)


if not gr_viewpoints.df.Name.isin(df_counts.capture).all():
# check which viewpoints are missing
    missing = gr_viewpoints.df.Name[~gr_viewpoints.df.Name.isin(df_counts.capture)]
raise ValueError(f"Not all viewpoints are present in the annotation: {missing}")

else:
pathlib.Path(snakemake.output.sentinel).touch()
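The final presence check is a set comparison between the viewpoints declared in the BED file and those actually observed in the annotated slices; a tiny stand-alone illustration of the isin logic with made-up names:

import pandas as pd

declared = pd.Series(["vp1", "vp2", "vp3"], name="Name")   # from the viewpoints BED
observed = pd.Series(["vp1", "vp3"], name="capture")       # seen in the annotated slices

missing = declared[~declared.isin(observed)]
if not missing.empty:
    print(f"Not all viewpoints are present in the annotation: {list(missing)}")  # ['vp2']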
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -84,6 +84,12 @@ max-complexity = 10
"capcruncher/pipeline/rules/scripts/combine_filtering_stats.py" = ["F821"]
"capcruncher/pipeline/rules/scripts/combine_deduplication_stats.py" = ["F821"]
"capcruncher/pipeline/rules/scripts/combine_digestion_stats.py" = ["F821"]
"capcruncher/pipeline/workflow/scripts/validation_check_n_bins_per_viewpoint.py" = [
"F821",
]
"capcruncher/pipeline/workflow/scripts/validation_confirm_annotated_viewpoints_present.py" = [
"F821",
]

[tool.ruff.pydocstyle]
convention = "google"
