
Commit

feat: add validation during pipeline run (#203)
* feat: added validation smk

* feat: added validation scripts

* feat: included validation rules

* feat: moved validations into main rules

Validations are now in the same files as the annotate and filter rules

* fix: creating sentinel

* fix: removed validation.smk

* fix: missing 'output'

* fix: corrected errors in validation scripts

* feat: added combined wc constraint

* feat: ignoring multiple viewpoint check for tiled
alsmith151 authored Sep 12, 2023
1 parent ae32a81 commit 2d6517e
Showing 6 changed files with 175 additions and 2 deletions.
6 changes: 4 additions & 2 deletions capcruncher/pipeline/workflow/Snakefile
@@ -55,7 +55,9 @@ BIN_SIZES = capcruncher.pipeline.utils.get_bin_sizes(config)
HIGH_NUMBER_OF_VIEWPOINTS = capcruncher.pipeline.utils.has_high_viewpoint_number(
VIEWPOINTS, config
)

IGNORE_MULTIPLE_FRAGMENTS_PER_VIEWPOINT = config["analysis"].get(
"ignore_multiple_fragments_per_viewpoint", False
)

# Details
SUMMARY_METHODS = [
@@ -80,7 +82,6 @@ PERFORM_BINNING = capcruncher.pipeline.utils.can_perform_binning(config)
ASSAY = config["analysis"]["method"]
SAMPLE_NAMES = FASTQ_SAMPLES.sample_names_all

# Optional

## Check if capcruncher_tools is installed
if importlib.util.find_spec("capcruncher_tools"):
@@ -109,6 +110,7 @@ wildcard_constraints:
sample="|".join(SAMPLE_NAMES),
part=r"\d+",
viewpoint="|".join(VIEWPOINT_NAMES),
combined="|".join(["flashed", "pe"]),


rule all:
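The new combined wildcard constraint is a plain regex alternation, so only the literal values flashed and pe can fill that slot in output paths. A minimal sketch of the matching behaviour (illustrative only, not pipeline code; Snakemake embeds the constraint in the path regex, so the wildcard must match it exactly):

import re

combined_constraint = "|".join(["flashed", "pe"])  # -> "flashed|pe"

assert re.fullmatch(combined_constraint, "flashed")
assert re.fullmatch(combined_constraint, "pe")
assert re.fullmatch(combined_constraint, "unflashed") is None  # rejected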
18 changes: 18 additions & 0 deletions capcruncher/pipeline/workflow/rules/annotate.smk
@@ -18,11 +18,29 @@ rule exclusions:
"""


rule check_n_bins_per_viewpoint:
input:
bins=rules.digest_genome.output.bed,
viewpoints=config["analysis"]["viewpoints"],
output:
sentinel="capcruncher_output/resources/validation/check_n_bins_per_viewpoint.sentinel",
n_bins_per_viewpoint="capcruncher_output/resources/validation/n_bins_per_viewpoint.tsv",
params:
ignore_multiple_bins_per_viewpoint=IGNORE_MULTIPLE_FRAGMENTS_PER_VIEWPOINT
if ASSAY in ["capture", "tri"]
else True,
log:
"capcruncher_output/logs/validation/check_n_bins_per_viewpoint.log",
script:
"../scripts/validation_check_n_bins_per_viewpoint.py"


rule annotate:
input:
bam=rules.align_bowtie2.output.bam,
exclusions="capcruncher_output/interim/annotate/exclude.bed",
viewpoints=config["analysis"]["viewpoints"],
single_bin_per_viewpoint=rules.check_n_bins_per_viewpoint.output.sentinel,
output:
annotated=temp(
"capcruncher_output/interim/annotate/{sample}/{sample}_part{part}_{combined}.parquet"
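The sentinel output added here acts as a pass/fail flag: the validation script only creates the file when the check succeeds (or when the failure is explicitly ignored), and because rule annotate now lists the sentinel as an input, annotation cannot start until the viewpoint check has passed. A minimal sketch of that pattern, with a hypothetical path and check (not pipeline code):

import pathlib

def validate_and_touch(check_passed: bool, sentinel: str) -> None:
    # Raising here leaves the sentinel absent, so any rule that requires it as
    # an input can never be scheduled; touching it unblocks downstream rules.
    if not check_passed:
        raise ValueError("validation failed")
    path = pathlib.Path(sentinel)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.touch()

validate_and_touch(check_passed=True, sentinel="validation/example_check.sentinel")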
23 changes: 23 additions & 0 deletions capcruncher/pipeline/workflow/rules/filter.smk
@@ -12,10 +12,33 @@ def get_filtered_slices(wildcards):
return slices


def get_annotated_slices(wildcards):
slices = dict()
for combined_type in ["flashed", "pe"]:
parts = get_rebalanced_parts(wildcards, combined=combined_type)
slices[combined_type] = [
f"capcruncher_output/interim/annotate/{wildcards.sample}/{wildcards.sample}_part{part}_{combined_type}.parquet"
for part in parts
]
return [*slices["flashed"], *slices["pe"]]


rule check_viewpoints_annotated:
input:
slices=get_annotated_slices,
viewpoints=config["analysis"]["viewpoints"],
output:
sentinel="capcruncher_output/resources/validation/{sample}.check_viewpoints.sentinel",
viewpoints_present="capcruncher_output/resources/validation/{sample}.annotated_viewpoints_present.tsv",
script:
"../scripts/validation_confirm_annotated_viewpoints_present.py"


rule filter_alignments:
input:
bam=rules.align_bowtie2.output.bam,
annotations=rules.annotate.output.annotated,
all_viewpoints_present=rules.check_viewpoints_annotated.output.sentinel,
output:
filtered_slices=temp(
"capcruncher_output/interim/filtering/initial/{sample}/{sample}_part{part}_{combined}.slices.parquet"
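For orientation, the get_annotated_slices helper above expands, per sample, into the annotated parquet path for every rebalanced part of both read types. A purely illustrative example of the returned list for a hypothetical sample "S1" with parts 0 and 1 (the real part numbers come from get_rebalanced_parts):

expected_inputs = [
    "capcruncher_output/interim/annotate/S1/S1_part0_flashed.parquet",
    "capcruncher_output/interim/annotate/S1/S1_part1_flashed.parquet",
    "capcruncher_output/interim/annotate/S1/S1_part0_pe.parquet",
    "capcruncher_output/interim/annotate/S1/S1_part1_pe.parquet",
]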
86 changes: 86 additions & 0 deletions capcruncher/pipeline/workflow/scripts/validation_check_n_bins_per_viewpoint.py
@@ -0,0 +1,86 @@
"""
Aim: Check that there is only one restriction fragment per viewpoint.
"""

import pandas as pd
import numpy as np
import pyranges as pr
import polars as pl
import tabulate
import pathlib
from loguru import logger


with logger.catch():
logger.info("Checking that there is only one restriction fragment per viewpoint")

bins = snakemake.input.bins
viewpoints = snakemake.input.viewpoints

df_bins = pl.read_csv(
bins,
separator="\t",
has_header=False,
new_columns=["Chromosome", "Start", "End", "Name"],
)
gr_bins = pr.PyRanges(df_bins.to_pandas())

df_viewpoints = pl.read_csv(
viewpoints,
separator="\t",
has_header=False,
new_columns=["Chromosome", "Start", "End", "Name"],
)
gr_viewpoints = pr.PyRanges(df_viewpoints.to_pandas())

# Generate a table with the number of restriction fragments overlapped by each viewpoint
gr_bin_vp_overlap = gr_bins.join(gr_viewpoints, suffix="_viewpoints")
bin_vp_counts = gr_bin_vp_overlap.df["Name_viewpoints"].value_counts()
has_multiple_bins = bin_vp_counts > 1

df_viewpoints = df_viewpoints.to_pandas()
df_viewpoints["n_restriction_fragments_overlapped"] = 1
df_viewpoints = df_viewpoints.set_index("Name")
df_viewpoints.loc[
has_multiple_bins, "n_restriction_fragments_overlapped"
] = bin_vp_counts[has_multiple_bins].values
    df_viewpoints = df_viewpoints.reset_index().rename(columns={"Name": "Viewpoint"})
df_viewpoints.to_csv(snakemake.output.n_bins_per_viewpoint, sep="\t", index=False)

    if (
        has_multiple_bins.any()
        and not snakemake.params.ignore_multiple_bins_per_viewpoint
    ):
        # Report only the viewpoints that overlap more than one restriction fragment
        df_multiple = df_viewpoints[
            df_viewpoints["n_restriction_fragments_overlapped"] > 1
        ]
        tbl = tabulate.tabulate(
            df_multiple, headers="keys", tablefmt="psql", showindex=False
        )

        raise ValueError(
            f"The following viewpoints overlap multiple restriction fragments:\n{tbl}\n"
        )

    else:
        pathlib.Path(snakemake.output.sentinel).touch()
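To make the fragment-counting step concrete, here is a small self-contained toy example of the same PyRanges join / value_counts pattern used above (made-up coordinates; a viewpoint spanning two restriction fragments gets a count of 2 and would trigger the error unless the check is ignored):

import pandas as pd
import pyranges as pr

# Two restriction fragments; viewpoint "vp2" straddles the boundary between them.
gr_bins = pr.PyRanges(pd.DataFrame({
    "Chromosome": ["chr1", "chr1"],
    "Start": [0, 1000],
    "End": [1000, 2000],
    "Name": ["frag1", "frag2"],
}))
gr_viewpoints = pr.PyRanges(pd.DataFrame({
    "Chromosome": ["chr1", "chr1"],
    "Start": [100, 900],
    "End": [200, 1100],
    "Name": ["vp1", "vp2"],
}))

overlap = gr_bins.join(gr_viewpoints, suffix="_viewpoints")
counts = overlap.df["Name_viewpoints"].value_counts()
print(counts)  # vp2 -> 2 fragments, vp1 -> 1 fragment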
38 changes: 38 additions & 0 deletions capcruncher/pipeline/workflow/scripts/validation_confirm_annotated_viewpoints_present.py
@@ -0,0 +1,38 @@
"""
Aim: Ensure that all viewpoints are found in the annotated slices.
"""

import pandas as pd
import numpy as np
import pyranges as pr
import polars as pl
import tabulate
import pathlib

slices = snakemake.input.slices
viewpoints = snakemake.input.viewpoints
gr_viewpoints = pr.read_bed(viewpoints)

with pl.StringCache():
vp_counts = []
for pq in slices:
df = pl.read_parquet(pq, columns=["capture"])
vp_counts.append(df["capture"].value_counts())

df_counts = (
pl.concat(vp_counts).groupby("capture").agg(pl.sum("counts")).to_pandas()
)


df_counts.rename(columns={"counts": "n_slices"}).to_csv(
snakemake.output.viewpoints_present, sep="\t", index=True
)


if not gr_viewpoints.df.Name.isin(df_counts.capture).all():
# check which viewpoints are missing
    missing = gr_viewpoints.df.Name[~gr_viewpoints.df.Name.isin(df_counts.capture)]
raise ValueError(f"Not all viewpoints are present in the annotation: {missing}")

else:
pathlib.Path(snakemake.output.sentinel).touch()
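The final presence check is a set comparison between the viewpoints declared in the BED file and those actually observed in the annotated slices; a tiny stand-alone illustration of the isin logic with made-up names:

import pandas as pd

declared = pd.Series(["vp1", "vp2", "vp3"], name="Name")   # from the viewpoints BED
observed = pd.Series(["vp1", "vp3"], name="capture")       # seen in the annotated slices

missing = declared[~declared.isin(observed)]
if not missing.empty:
    print(f"Not all viewpoints are present in the annotation: {list(missing)}")  # ['vp2']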
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -84,6 +84,12 @@ max-complexity = 10
"capcruncher/pipeline/rules/scripts/combine_filtering_stats.py" = ["F821"]
"capcruncher/pipeline/rules/scripts/combine_deduplication_stats.py" = ["F821"]
"capcruncher/pipeline/rules/scripts/combine_digestion_stats.py" = ["F821"]
"capcruncher/pipeline/workflow/scripts/validation_check_n_bins_per_viewpoint.py" = [
"F821",
]
"capcruncher/pipeline/workflow/scripts/validation_confirm_annotated_viewpoints_present.py" = [
"F821",
]

[tool.ruff.pydocstyle]
convention = "google"
