-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add validation during pipeline run (#203)
* feat: added validation smk
* feat: added validation scripts
* feat: included validation rules
* feat: moved validations into main rules
  Validations now in same files as annotate and filter
* fix: creating sentinel
* fix: removed validation.smk
* fix: missing 'output'
* fix: corrected errors in validation scripts
* feat: added combined wc constraint
* feat: ignoring multiple viewpoint check for tiled
- Loading branch information
1 parent
ae32a81
commit 2d6517e
Showing
6 changed files
with
175 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
86 changes: 86 additions & 0 deletions
86
capcruncher/pipeline/workflow/scripts/validation_check_n_bins_per_viewpoint.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
""" | ||
Aim: Check that there is only one restriction fragment per viewpoint. | ||
""" | ||
|
||
import pandas as pd | ||
import numpy as np | ||
import pyranges as pr | ||
import polars as pl | ||
import tabulate | ||
import pathlib | ||
from loguru import logger | ||
|
||
|
||
# Validate that each viewpoint overlaps a single restriction fragment.
#
# Reads (via the Snakemake-injected ``snakemake`` object):
#   snakemake.input.bins        - headerless TSV: Chromosome, Start, End, Name
#   snakemake.input.viewpoints  - headerless TSV: Chromosome, Start, End, Name
# Writes:
#   snakemake.output.n_bins_per_viewpoint - TSV with the per-viewpoint count
#   snakemake.output.sentinel             - touched only when validation passes
# Raises:
#   ValueError if any viewpoint overlaps more than one restriction fragment and
#   snakemake.params.ignore_multiple_bins_per_viewpoint is falsy.
#
# logger.catch() logs any exception raised in the block; reraise=True lets it
# propagate afterwards so the script exits with a failure status instead of
# swallowing the validation error.
with logger.catch(reraise=True):
    logger.info("Checking that there is only one restriction fragment per viewpoint")

    bins = snakemake.input.bins
    viewpoints = snakemake.input.viewpoints
    bed_columns = ["Chromosome", "Start", "End", "Name"]
    count_column = "n_restriction_fragments_overlapped"

    df_bins = pl.read_csv(
        bins,
        separator="\t",
        has_header=False,
        new_columns=bed_columns,
    )
    gr_bins = pr.PyRanges(df_bins.to_pandas())

    df_viewpoints = pl.read_csv(
        viewpoints,
        separator="\t",
        has_header=False,
        new_columns=bed_columns,
    )
    gr_viewpoints = pr.PyRanges(df_viewpoints.to_pandas())

    # Count the restriction fragments overlapped by each viewpoint.
    gr_bin_vp_overlap = gr_bins.join(gr_viewpoints, suffix="_viewpoints")
    df_overlap = gr_bin_vp_overlap.df
    if "Name_viewpoints" in df_overlap.columns:
        bin_vp_counts = df_overlap["Name_viewpoints"].value_counts().to_dict()
    else:
        # An empty join yields a frame without columns: nothing overlaps.
        bin_vp_counts = {}

    # Look the counts up by viewpoint name so every count lands on its own
    # row regardless of row order; viewpoints absent from the overlap table
    # overlap no fragment and are reported as 0.
    df_counts = df_viewpoints.to_pandas()
    df_counts[count_column] = (
        df_counts["Name"].map(bin_vp_counts).fillna(0).astype(int)
    )
    # Keep "Name" as the leading column of the report.
    df_counts = df_counts[["Name", "Chromosome", "Start", "End", count_column]]
    df_counts.to_csv(snakemake.output.n_bins_per_viewpoint, sep="\t", index=False)

    df_multiple = df_counts[df_counts[count_column] > 1]

    if (
        not df_multiple.empty
        and not snakemake.params.ignore_multiple_bins_per_viewpoint
    ):
        tbl = tabulate.tabulate(
            df_multiple, headers="keys", tablefmt="psql", showindex=False
        )
        raise ValueError(
            f"The following viewpoints overlap multiple restriction fragments:\n{tbl}\n"
        )

    # Validation passed (or the check is explicitly ignored): mark completion.
    pathlib.Path(snakemake.output.sentinel).touch()
38 changes: 38 additions & 0 deletions
38
capcruncher/pipeline/workflow/scripts/validation_confirm_annotated_viewpoints_present.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
""" | ||
Aim: Ensure that all viewpoints are found in the annotated slices. | ||
""" | ||
|
||
import pandas as pd | ||
import numpy as np | ||
import pyranges as pr | ||
import polars as pl | ||
import tabulate | ||
import pathlib | ||
|
||
# Validate that every viewpoint appears in the annotated slices.
#
# Reads (via the Snakemake-injected ``snakemake`` object):
#   snakemake.input.slices      - iterable of parquet files with a "capture" column
#   snakemake.input.viewpoints  - BED file of viewpoints
# Writes:
#   snakemake.output.viewpoints_present - TSV of per-viewpoint counts
#   snakemake.output.sentinel           - touched only when validation passes
# Raises:
#   ValueError if any viewpoint name is absent from the "capture" values.
slices = snakemake.input.slices
viewpoints = snakemake.input.viewpoints
gr_viewpoints = pr.read_bed(viewpoints)

# NOTE(review): the shared string cache presumably exists because "capture" is
# stored as a categorical and the per-file results are concatenated - confirm.
with pl.StringCache():
    vp_counts = []
    for pq in slices:
        df = pl.read_parquet(pq, columns=["capture"])
        vp_counts.append(df["capture"].value_counts())

    # Sum the per-file counts into one total per viewpoint.
    df_counts = (
        pl.concat(vp_counts).groupby("capture").agg(pl.sum("counts")).to_pandas()
    )

df_counts.rename(columns={"counts": "n_slices"}).to_csv(
    snakemake.output.viewpoints_present, sep="\t", index=True
)

# Compute the presence mask once from the PyRanges object; `viewpoints` is
# only the input path and has no `.df` attribute.
viewpoint_names = gr_viewpoints.df.Name
is_present = viewpoint_names.isin(df_counts.capture)

if not is_present.all():
    # Report which viewpoints are missing.
    missing = viewpoint_names[~is_present]
    raise ValueError(f"Not all viewpoints are present in the annotation: {missing}")

pathlib.Path(snakemake.output.sentinel).touch()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters