Skip to content

Commit

Permalink
Updated workflow and testing.
Browse files Browse the repository at this point in the history
  • Loading branch information
Christopher Nobles committed Jun 19, 2019
1 parent 152cf74 commit ab0f85d
Show file tree
Hide file tree
Showing 15 changed files with 77 additions and 48 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ conda deactivate
* Revised a bit of the workflow to make reprocessing smoother
* Updated BLAT coupling script to be more memory efficient
* Fixed TravisCI testing!
* Changed stat workflow; restarting an analysis no longer initiates a total reprocessing.

**v0.9.8 (April 19th, 2019)**

Expand Down
5 changes: 4 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ if not os.path.isdir(ROOT_DIR):
# Check for sequence file paths
if not os.path.isdir(config["Seq_Path"]):
raise SystemExit("Path to sequencing files is not found (Seq_Path). Check your config file.")


# Confirm the run directory has been set up: a config.yml entry (the config
# symlink) must be present under RUN_DIR. The error message names
# 'iguide setup' as the step to run first.
if not os.path.isfile(RUN_DIR + "/config.yml"):
raise SystemExit("Path to symbolic config is not present. Check to make sure you've run 'iguide setup' first.")

# Default params if not included in config
if not "maxNcount" in config:
Expand Down
1 change: 1 addition & 0 deletions docs/pages/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ ChangeLog
* Revised a bit of the workflow to make reprocessing smoother
* Updated BLAT coupling script to be more memory efficient
* Fixed TravisCI testing!
* Changed stat workflow; restarting an analysis no longer initiates a total reprocessing.

**v0.9.8 (April 19th, 2019)**

Expand Down
11 changes: 8 additions & 3 deletions etc/tests/simulation.digests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ file1 :
md5 : "dbfc152e8e3ee129c11ab953ed8aa9de"

file2 :
name : "stats.simulation.csv"
path : "analysis/simulation/output/stats.simulation.csv"
md5 : "98a6ba9dbae43828f485e6a9070f6a5e"
name : "stats.core.simulation.csv"
path : "analysis/simulation/output/stats.core.simulation.csv"
md5 : "b593a0e58c97f48a5b367184bd440ad3"

file3 :
name : "stats.eval.simulation.csv"
path : "analysis/simulation/output/stats.eval.simulation.csv"
md5 : "57158a4826685a8024b3281284ac42b6"
2 changes: 1 addition & 1 deletion etc/tests/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ iguide run configs/simulation.config.yml -- -np
iguide run configs/simulation.config.yml -- --dag --nolock | dot -Tsvg > \
analysis/simulation/reports/simulation.dag.svg

iguide run configs/simulation.config.yml -- -p -w 30 --nolock --cores ${__CORES}
iguide run configs/simulation.config.yml -- -p -w 30 --notemp --nolock --cores ${__CORES}

# Evaluate and report out using a different metadata set
iguide eval configs/simulation.config.yml \
Expand Down
26 changes: 16 additions & 10 deletions rules/arch.rules
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Architecture Rules
# Related to setting up analysis directories and consolidating data

rule generate_stat_matrix:
rule core_stat_matrix:
input:
demulti=RUN_DIR + "/process_data/" + RUN + ".demulti.stat",
trimR1=expand(
Expand All @@ -20,22 +20,28 @@ rule generate_stat_matrix:
sample=SAMPLES, read=READS),
align=expand(
RUN_DIR + "/process_data/{sample}.align.stat", sample=SAMPLES),
assim=RUN_DIR + "/process_data/" + RUN + ".assim.stat",
eval=RUN_DIR + "/process_data/" + RUN + ".eval.stat"
assim=RUN_DIR + "/process_data/" + RUN + ".assim.stat"
output:
RUN_DIR + "/output/stats." + RUN + ".csv"
RUN_DIR + "/output/stats.core." + RUN + ".csv"
params:
dir=RUN_DIR + "/process_data",
tool=ROOT_DIR + "/tools/rscripts/collect_stats.R"
resources:
mem_mb = lambda wildcards, attempt: attempt * config["defaultMB"]
shell:
"""
Rscript {params.tool} {params.dir} -o {output}
"""
shell: "Rscript {params.tool} {input} -o {output}"

# Convert the run's evaluation stat file into its own csv
# (output/stats.eval.<RUN>.csv) by passing it to collect_stats.R.
rule eval_stat_matrix:
input: RUN_DIR + "/process_data/" + RUN + ".eval.stat"
output: RUN_DIR + "/output/stats.eval." + RUN + ".csv"
params:
tool=ROOT_DIR + "/tools/rscripts/collect_stats.R"
resources:
# Requested memory grows with each attempt: attempt * config["defaultMB"].
mem_mb = lambda wildcards, attempt: attempt * config["defaultMB"]
shell: "Rscript {params.tool} {input} -o {output}"

rule gen_stat_report:
input: RUN_DIR + "/output/stats." + RUN + ".csv"
input:
core = RUN_DIR + "/output/stats.core." + RUN + ".csv",
eval = RUN_DIR + "/output/stats.eval." + RUN + ".csv"
output: RUN_DIR + "/reports/runstats." + RUN + ".html"
params:
tool = ROOT_DIR + "/tools/rscripts/generate_stat_report.R",
Expand Down
4 changes: 2 additions & 2 deletions rules/consol.rules
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rule consolidate:
output:
consol=temp(RUN_DIR + "/process_data/{sample}.{read}.consol.fasta"),
key=temp(RUN_DIR + "/process_data/{sample}.{read}.key.csv"),
stat=RUN_DIR + "/process_data/{sample}.{read}.consol.stat"
stat=temp(RUN_DIR + "/process_data/{sample}.{read}.consol.stat")
params:
tool=ROOT_DIR + "/tools/rscripts/consol.R"
log:
Expand All @@ -17,6 +17,6 @@ rule consolidate:
shell:
"""
Rscript {params.tool} {input} -o {output.consol} -k {output.key} \
--stat {output.stat} > {log} 2>&1
--stat {output.stat} > {log} 2>&1
"""

2 changes: 1 addition & 1 deletion rules/demulti.rules
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ rule demultiplex:
RUN_DIR + "/process_data/degenerate.{type}.fastq.gz", type=TYPES)),
unas=temp(expand(
RUN_DIR + "/process_data/unassigned.{type}.fastq.gz", type=TYPES)),
stat=RUN_DIR + "/process_data/" + RUN + ".demulti.stat"
stat=temp(RUN_DIR + "/process_data/" + RUN + ".demulti.stat")
params:
tool=ROOT_DIR + "/tools/rscripts/demulti.R",
bc1Len=config["barcode1Length"],
Expand Down
2 changes: 1 addition & 1 deletion rules/filt.rules
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ rule seq_filter:
output:
R1=temp(RUN_DIR + "/process_data/{sample}.R1.filt.fastq.gz"),
R2=temp(RUN_DIR + "/process_data/{sample}.R2.filt.fastq.gz"),
stat=RUN_DIR + "/process_data/{sample}.filt.stat"
stat=temp(RUN_DIR + "/process_data/{sample}.filt.stat")
params:
tool=ROOT_DIR + "/tools/rscripts/filt.R"
log:
Expand Down
4 changes: 2 additions & 2 deletions rules/process.rules
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ rule assimilate_sites:
multi=all_multi_inputs
output:
incorp=RUN_DIR + "/output/incorp_sites." + RUN + ".rds",
stat=RUN_DIR + "/process_data/" + RUN + ".assim.stat"
stat=temp(RUN_DIR + "/process_data/" + RUN + ".assim.stat")
params:
config=RUN_DIR + "/" + "config.yml",
tool=ROOT_DIR + "/tools/rscripts/assimilate_incorp_data.R"
Expand All @@ -62,7 +62,7 @@ rule iguide_evaluation:
input: RUN_DIR + "/output/incorp_sites." + RUN + ".rds"
output:
eval=temp(RUN_DIR + "/output/iguide.eval." + RUN + ".rds"),
stat=RUN_DIR + "/process_data/" + RUN + ".eval.stat"
stat=temp(RUN_DIR + "/process_data/" + RUN + ".eval.stat")
params:
tool = ROOT_DIR + "/tools/rscripts/evaluate_incorp_data.R",
config = RUN_DIR + "/" + "config.yml"
Expand Down
8 changes: 4 additions & 4 deletions rules/quality.blat.rules
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ rule post_align:
keyR1=RUN_DIR + "/process_data/{sample}.R1.key.csv",
keyR2=RUN_DIR + "/process_data/{sample}.R2.key.csv"
output:
uniq=RUN_DIR + "/process_data/{sample}.uniq.csv",
chimera=RUN_DIR + "/process_data/{sample}.chimera.rds",
multihit=RUN_DIR + "/process_data/{sample}.multihits.rds",
stat=RUN_DIR + "/process_data/{sample}.align.stat"
uniq=temp(RUN_DIR + "/process_data/{sample}.uniq.csv"),
chimera=temp(RUN_DIR + "/process_data/{sample}.chimera.rds"),
multihit=temp(RUN_DIR + "/process_data/{sample}.multihits.rds"),
stat=temp(RUN_DIR + "/process_data/{sample}.align.stat")
params:
tool=ROOT_DIR + "/tools/rscripts/couple.R",
ref=config["Ref_Genome"],
Expand Down
6 changes: 3 additions & 3 deletions rules/trim.rules
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ rule seq_trim_R1:
RUN_DIR + "/process_data/{sample}.R1.fastq.gz"
output:
trim=temp(RUN_DIR + "/process_data/{sample}.R1.trim.fastq.gz"),
stat=RUN_DIR + "/process_data/{sample}.R1.trim.stat"
stat=temp(RUN_DIR + "/process_data/{sample}.R1.trim.stat")
params:
tool=ROOT_DIR + "/tools/rscripts/trim.R",
lead=lambda wildcards: R1_LEAD[wildcards.sample],
Expand All @@ -32,7 +32,7 @@ rule seq_trim_R2_primer:
RUN_DIR + "/process_data/{sample}.R2.fastq.gz"
output:
trim=temp(RUN_DIR + "/process_data/{sample}.R2.primer.trim.fastq.gz"),
stat=RUN_DIR + "/process_data/{sample}.R2.primer.trim.stat"
stat=temp(RUN_DIR + "/process_data/{sample}.R2.primer.trim.stat")
params:
tool=ROOT_DIR + "/tools/rscripts/trim.R",
lead=lambda wildcards: R2_LEAD[wildcards.sample],
Expand All @@ -58,7 +58,7 @@ rule seq_trim_R2_odn:
RUN_DIR + "/process_data/{sample}.R2.primer.trim.fastq.gz"
output:
trim=temp(RUN_DIR + "/process_data/{sample}.R2.trim.fastq.gz"),
stat=RUN_DIR + "/process_data/{sample}.R2.trim.stat"
stat=temp(RUN_DIR + "/process_data/{sample}.R2.trim.stat")
params:
tool=ROOT_DIR + "/tools/rscripts/trim.R",
lead=lambda wildcards: R2_LEAD_ODN[wildcards.sample],
Expand Down
4 changes: 2 additions & 2 deletions rules/umitag.rules
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ rule collect_umitags:
RUN_DIR + "/process_data/{sample}.I2.fastq.gz"
output:
seq=temp(RUN_DIR + "/process_data/{sample}.I2.trim.fastq.gz"),
umi=RUN_DIR + "/process_data/{sample}.umitags.fasta.gz",
stat=RUN_DIR + "/process_data/{sample}.umitags.stat"
umi=temp(RUN_DIR + "/process_data/{sample}.umitags.fasta.gz"),
stat=temp(RUN_DIR + "/process_data/{sample}.umitags.stat")
params:
tool=ROOT_DIR + "/tools/rscripts/trim.R",
seq=lambda wildcards: UMIseqs[wildcards.sample],
Expand Down
27 changes: 18 additions & 9 deletions tools/rscripts/collect_stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ parser <- argparse::ArgumentParser(
)

parser$add_argument(
"dir", nargs = 1, type = "character", help = "Directory with *.stat files."
"files", nargs = "+", type = "character",
help = "Paths to stat containing files (long, csv format). "
)

parser$add_argument(
Expand All @@ -20,19 +21,27 @@ parser$add_argument(
args <- parser$parse_args(commandArgs(trailingOnly = TRUE))

# Manipulate file paths to determine stat types
all_files <- list.files(path = args$dir, pattern = "*.stat")
file_names <- stringr::str_extract(all_files, "[\\w\\.\\-\\_]+$")
# Confirm every stat file given on the command line exists before reading it.
# file.exists() is already vectorised over its argument, so it is called
# directly instead of through sapply(), whose return type depends on its input.
files_present <- file.exists(args$files)

if( !all(files_present) ){
  # Report all missing paths at once rather than failing on the first one.
  stop(
    "\n Cannot find the following files: ",
    paste(args$files[!files_present], collapse = "\n ")
  )
}

# Derive the stat type from each path: keep the trailing file name, drop the
# first "<token>." run (word characters, hyphens, underscores followed by a
# dot), then drop the trailing ".stat" extension. The dots are escaped, and the
# extension is anchored to the end of the name, so they match a literal "."
# instead of any character (an unescaped ".stat" would also match e.g. "xstat").
file_names <- stringr::str_extract(args$files, "[\\w\\.\\-\\_]+$")
file_types <- sub("[\\w\\-\\_]+\\.", "", file_names, perl = TRUE)
file_types <- sub("\\.stat$", "", file_types)

# Read in data in a long format
long_data <- dplyr::bind_rows(
lapply(
structure(all_files, names = file_types),
function(file, dir){
structure(args$files, names = file_types),
function(file){

x <- try(
expr = read.csv(file = file.path(dir, file), header = FALSE),
expr = read.csv(file = file, header = FALSE),
silent = TRUE
)

Expand All @@ -55,10 +64,10 @@ long_data <- dplyr::bind_rows(

}

},
dir = args$dir
}
),
.id = "type")
.id = "type"
)

# Transform data into a wide format
wide_data <- dplyr::mutate(
Expand Down
22 changes: 13 additions & 9 deletions tools/rscripts/generate_stat_report.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# iguide installation directory, will look for sys argument 'IGUIDE_DIR'
#

options(stringsAsFactors = FALSE, scipen = 99, width = 180)
options(stringsAsFactors = FALSE, scipen = 99, width = 120)

args <- commandArgs(trailingOnly = TRUE)

Expand All @@ -22,10 +22,11 @@ code_dir <- dirname(sub(
))

# Check input file ----
input_file <- args[1]
core_file <- args[1]
eval_file <- args[2]

if( !file.exists(input_file) ){
stop("\n Cannot find input stat file. Check inputs.")
# Both the core and the eval stat tables are required. `||` is the scalar,
# short-circuiting operator meant for `if` conditions; `|` is elementwise.
if( !file.exists(core_file) || !file.exists(eval_file) ){
  stop("\n Cannot find input stat files. Check inputs.")
}

# Check output file ----
Expand Down Expand Up @@ -134,7 +135,10 @@ build_version <- list.files(file.path(iguide_dir, "etc")) %>%
signature <- config[["signature"]]

# Load input data ----
stat_df <- read.csv(input_file) %>%
# Read the core and evaluation stat tables with read.csv().
core_stat_df <- read.csv(core_file)
eval_stat_df <- read.csv(eval_file)

# Combine both tables on sampleName with a full join, so a sample present in
# only one of them is kept, then replace every NA in every column with 0.
stat_df <- dplyr::full_join(core_stat_df, eval_stat_df, by = "sampleName") %>%
dplyr::mutate_all(function(x) ifelse(is.na(x), rep(0, length(x)), x))

sampleName_levels <- unique(stat_df$sampleName)
Expand Down Expand Up @@ -165,9 +169,9 @@ names(read_tbl) <- stringr::str_replace(names(read_tbl), ".reads$", "")

# Alignment outcome table ----
algn_tbl <- dplyr::select(
stat_df, sampleName, align.unique.reads, align.unique.algns, align.unique.loci,
align.multihit.reads, align.multihit.lengths, align.multihit.clusters,
align.chimera.reads
stat_df, sampleName, align.unique.reads, align.unique.algns,
align.unique.loci, align.multihit.reads, align.multihit.lengths,
align.multihit.clusters, align.chimera.reads
) %>%
dplyr::filter(sampleName %in% sampleNames) %>%
dplyr::mutate(sampleName = factor(sampleName, levels = sampleNames)) %>%
Expand All @@ -186,7 +190,7 @@ incorp_tbl <- dplyr::select(
dplyr::mutate(sampleName = factor(sampleName, levels = sampleNames)) %>%
dplyr::arrange(sampleName)

names(incorp_tbl) <- stringr::str_replace(names(incorp_tbl), "assim.", "")
# Strip the first literal "eval." and a trailing ".algns" from the column
# names. The dots are escaped because an unescaped "." in a regular expression
# matches any character, which could remove more than the intended text.
names(incorp_tbl) <- stringr::str_replace(names(incorp_tbl), "eval\\.", "")
names(incorp_tbl) <- stringr::str_replace(names(incorp_tbl), "\\.algns$", "")


Expand Down

0 comments on commit ab0f85d

Please sign in to comment.