Updated workflow and testing.

cnobles · Jun 19, 2019 · ab0f85d · ab0f85d
1 parent 152cf74
commit ab0f85d
Show file tree

Hide file tree

Showing 15 changed files with 77 additions and 48 deletions.
diff --git a/README.md b/README.md
@@ -114,6 +114,7 @@ conda deactivate
 * Revised a bit of the workflow to make reprocessing smoother
 * Updated BLAT coupling script to be more memory efficient
 * Fixed TravisCI testing!
+* Changed stat workflow; restarting an analysis no longer initiates a total reprocessing.
 
 **v0.9.8 (April 19th, 2019)**
 

diff --git a/Snakefile b/Snakefile
@@ -55,7 +55,10 @@ if not os.path.isdir(ROOT_DIR):
 # Check for sequence file paths
 if not os.path.isdir(config["Seq_Path"]):
     raise SystemExit("Path to sequencing files is not found (Seq_Path). Check your config file.")
-
+
+# Check for the config symlink to verify proper run directory setup
+if not os.path.isfile(RUN_DIR + "/config.yml"):
+    raise SystemExit("Path to symbolic config is not present. Check to make sure you've run 'iguide setup' first.")
 
 # Default params if not included in config
 if not "maxNcount" in config:

diff --git a/docs/pages/changelog.rst b/docs/pages/changelog.rst
@@ -18,6 +18,7 @@ ChangeLog
 * Revised a bit of the workflow to make reprocessing smoother
 * Updated BLAT coupling script to be more memory efficient
 * Fixed TravisCI testing!
+* Changed stat workflow; restarting an analysis no longer initiates a total reprocessing.
 
 **v0.9.8 (April 19th, 2019)**
 

diff --git a/etc/tests/simulation.digests.yml b/etc/tests/simulation.digests.yml
@@ -6,6 +6,11 @@ file1 :
     md5  : "dbfc152e8e3ee129c11ab953ed8aa9de"
 
 file2 :
-    name : "stats.simulation.csv"
-    path : "analysis/simulation/output/stats.simulation.csv"
-    md5  : "98a6ba9dbae43828f485e6a9070f6a5e"
+    name : "stats.core.simulation.csv"
+    path : "analysis/simulation/output/stats.core.simulation.csv"
+    md5  : "b593a0e58c97f48a5b367184bd440ad3"
+
+file3 :
+    name : "stats.eval.simulation.csv"
+    path : "analysis/simulation/output/stats.eval.simulation.csv"
+    md5  : "57158a4826685a8024b3281284ac42b6"
diff --git a/etc/tests/test.sh b/etc/tests/test.sh
@@ -21,7 +21,7 @@ iguide run configs/simulation.config.yml -- -np
 iguide run configs/simulation.config.yml -- --dag --nolock | dot -Tsvg > \
     analysis/simulation/reports/simulation.dag.svg
 
-iguide run configs/simulation.config.yml -- -p -w 30 --nolock --cores ${__CORES}
+iguide run configs/simulation.config.yml -- -p -w 30 --notemp --nolock --cores ${__CORES}
 
 # Evaluate and report out using a different metadata set
 iguide eval configs/simulation.config.yml \

diff --git a/rules/arch.rules b/rules/arch.rules
@@ -2,7 +2,7 @@
 # Architecture Rules
 # Related to setting up analysis directories and consolidating data
 
-rule generate_stat_matrix:
+rule core_stat_matrix:
   input:
     demulti=RUN_DIR + "/process_data/" + RUN + ".demulti.stat",
     trimR1=expand(
@@ -20,22 +20,28 @@ rule generate_stat_matrix:
       sample=SAMPLES, read=READS),
     align=expand(
       RUN_DIR + "/process_data/{sample}.align.stat", sample=SAMPLES),
-    assim=RUN_DIR + "/process_data/" + RUN + ".assim.stat",
-    eval=RUN_DIR + "/process_data/" + RUN + ".eval.stat"
+    assim=RUN_DIR + "/process_data/" + RUN + ".assim.stat"
   output:
-    RUN_DIR + "/output/stats." + RUN + ".csv"
+    RUN_DIR + "/output/stats.core." + RUN + ".csv"
   params:
-    dir=RUN_DIR + "/process_data",
     tool=ROOT_DIR + "/tools/rscripts/collect_stats.R"
   resources:
     mem_mb = lambda wildcards, attempt: attempt * config["defaultMB"]
-  shell:
-    """
-    Rscript {params.tool} {params.dir} -o {output}
-    """
+  shell: "Rscript {params.tool} {input} -o {output}"
+
+rule eval_stat_matrix:
+  input: RUN_DIR + "/process_data/" + RUN + ".eval.stat"
+  output: RUN_DIR + "/output/stats.eval." + RUN + ".csv"
+  params:
+    tool=ROOT_DIR + "/tools/rscripts/collect_stats.R"
+  resources:
+    mem_mb = lambda wildcards, attempt: attempt * config["defaultMB"]
+  shell: "Rscript {params.tool} {input} -o {output}"
 
 rule gen_stat_report:
-  input: RUN_DIR + "/output/stats." + RUN + ".csv"
+  input: 
+    core = RUN_DIR + "/output/stats.core." + RUN + ".csv",
+    eval = RUN_DIR + "/output/stats.eval." + RUN + ".csv"
   output: RUN_DIR + "/reports/runstats." + RUN + ".html"
   params: 
     tool = ROOT_DIR + "/tools/rscripts/generate_stat_report.R",

diff --git a/rules/consol.rules b/rules/consol.rules
@@ -7,7 +7,7 @@ rule consolidate:
   output:
     consol=temp(RUN_DIR + "/process_data/{sample}.{read}.consol.fasta"),
     key=temp(RUN_DIR + "/process_data/{sample}.{read}.key.csv"),
-    stat=RUN_DIR + "/process_data/{sample}.{read}.consol.stat"
+    stat=temp(RUN_DIR + "/process_data/{sample}.{read}.consol.stat")
   params:
     tool=ROOT_DIR + "/tools/rscripts/consol.R"
   log:
@@ -17,6 +17,6 @@ rule consolidate:
   shell:
     """
     Rscript {params.tool} {input} -o {output.consol} -k {output.key} \
-    --stat {output.stat} > {log} 2>&1
+      --stat {output.stat} > {log} 2>&1
     """
 
diff --git a/rules/demulti.rules b/rules/demulti.rules
@@ -19,7 +19,7 @@ rule demultiplex:
       RUN_DIR + "/process_data/degenerate.{type}.fastq.gz", type=TYPES)),
     unas=temp(expand(
       RUN_DIR + "/process_data/unassigned.{type}.fastq.gz", type=TYPES)),
-    stat=RUN_DIR + "/process_data/" + RUN + ".demulti.stat"
+    stat=temp(RUN_DIR + "/process_data/" + RUN + ".demulti.stat")
   params:
     tool=ROOT_DIR + "/tools/rscripts/demulti.R",
     bc1Len=config["barcode1Length"],

diff --git a/rules/filt.rules b/rules/filt.rules
@@ -8,7 +8,7 @@ rule seq_filter:
   output:
     R1=temp(RUN_DIR + "/process_data/{sample}.R1.filt.fastq.gz"),
     R2=temp(RUN_DIR + "/process_data/{sample}.R2.filt.fastq.gz"),
-    stat=RUN_DIR + "/process_data/{sample}.filt.stat"
+    stat=temp(RUN_DIR + "/process_data/{sample}.filt.stat")
   params:
     tool=ROOT_DIR + "/tools/rscripts/filt.R"
   log:

diff --git a/rules/process.rules b/rules/process.rules
@@ -40,7 +40,7 @@ rule assimilate_sites:
     multi=all_multi_inputs
   output:
     incorp=RUN_DIR + "/output/incorp_sites." + RUN + ".rds",
-    stat=RUN_DIR + "/process_data/" + RUN + ".assim.stat"
+    stat=temp(RUN_DIR + "/process_data/" + RUN + ".assim.stat")
   params:
     config=RUN_DIR + "/" + "config.yml",
     tool=ROOT_DIR + "/tools/rscripts/assimilate_incorp_data.R"
@@ -62,7 +62,7 @@ rule iguide_evaluation:
   input: RUN_DIR + "/output/incorp_sites." + RUN + ".rds"
   output: 
     eval=temp(RUN_DIR + "/output/iguide.eval." + RUN + ".rds"),
-    stat=RUN_DIR + "/process_data/" + RUN + ".eval.stat"
+    stat=temp(RUN_DIR + "/process_data/" + RUN + ".eval.stat")
   params: 
     tool = ROOT_DIR + "/tools/rscripts/evaluate_incorp_data.R",
     config = RUN_DIR + "/" + "config.yml"

diff --git a/rules/quality.blat.rules b/rules/quality.blat.rules
@@ -8,10 +8,10 @@ rule post_align:
     keyR1=RUN_DIR + "/process_data/{sample}.R1.key.csv",
     keyR2=RUN_DIR + "/process_data/{sample}.R2.key.csv"
   output:
-    uniq=RUN_DIR + "/process_data/{sample}.uniq.csv",
-    chimera=RUN_DIR + "/process_data/{sample}.chimera.rds",
-    multihit=RUN_DIR + "/process_data/{sample}.multihits.rds",
-    stat=RUN_DIR + "/process_data/{sample}.align.stat"
+    uniq=temp(RUN_DIR + "/process_data/{sample}.uniq.csv"),
+    chimera=temp(RUN_DIR + "/process_data/{sample}.chimera.rds"),
+    multihit=temp(RUN_DIR + "/process_data/{sample}.multihits.rds"),
+    stat=temp(RUN_DIR + "/process_data/{sample}.align.stat")
   params:
     tool=ROOT_DIR + "/tools/rscripts/couple.R",
     ref=config["Ref_Genome"],

diff --git a/rules/trim.rules b/rules/trim.rules
@@ -6,7 +6,7 @@ rule seq_trim_R1:
     RUN_DIR + "/process_data/{sample}.R1.fastq.gz"
   output:
     trim=temp(RUN_DIR + "/process_data/{sample}.R1.trim.fastq.gz"),
-    stat=RUN_DIR + "/process_data/{sample}.R1.trim.stat"
+    stat=temp(RUN_DIR + "/process_data/{sample}.R1.trim.stat")
   params:
     tool=ROOT_DIR + "/tools/rscripts/trim.R",
     lead=lambda wildcards: R1_LEAD[wildcards.sample],
@@ -32,7 +32,7 @@ rule seq_trim_R2_primer:
     RUN_DIR + "/process_data/{sample}.R2.fastq.gz"
   output:
     trim=temp(RUN_DIR + "/process_data/{sample}.R2.primer.trim.fastq.gz"),
-    stat=RUN_DIR + "/process_data/{sample}.R2.primer.trim.stat"
+    stat=temp(RUN_DIR + "/process_data/{sample}.R2.primer.trim.stat")
   params:
     tool=ROOT_DIR + "/tools/rscripts/trim.R",
     lead=lambda wildcards: R2_LEAD[wildcards.sample],
@@ -58,7 +58,7 @@ rule seq_trim_R2_odn:
     RUN_DIR + "/process_data/{sample}.R2.primer.trim.fastq.gz"
   output:
     trim=temp(RUN_DIR + "/process_data/{sample}.R2.trim.fastq.gz"),
-    stat=RUN_DIR + "/process_data/{sample}.R2.trim.stat"
+    stat=temp(RUN_DIR + "/process_data/{sample}.R2.trim.stat")
   params:
     tool=ROOT_DIR + "/tools/rscripts/trim.R",
     lead=lambda wildcards: R2_LEAD_ODN[wildcards.sample],

diff --git a/rules/umitag.rules b/rules/umitag.rules
@@ -6,8 +6,8 @@ rule collect_umitags:
     RUN_DIR + "/process_data/{sample}.I2.fastq.gz"
   output:
     seq=temp(RUN_DIR + "/process_data/{sample}.I2.trim.fastq.gz"),
-    umi=RUN_DIR + "/process_data/{sample}.umitags.fasta.gz",
-    stat=RUN_DIR + "/process_data/{sample}.umitags.stat"
+    umi=temp(RUN_DIR + "/process_data/{sample}.umitags.fasta.gz"),
+    stat=temp(RUN_DIR + "/process_data/{sample}.umitags.stat")
   params:
     tool=ROOT_DIR + "/tools/rscripts/trim.R",
     seq=lambda wildcards: UMIseqs[wildcards.sample],

diff --git a/tools/rscripts/collect_stats.R b/tools/rscripts/collect_stats.R
@@ -9,7 +9,8 @@ parser <- argparse::ArgumentParser(
 )
 
 parser$add_argument(
-  "dir", nargs = 1, type = "character", help = "Directory with *.stat files."
+  "files", nargs = "+", type = "character", 
+  help = "Paths to stat containing files (long, csv format). "
 )
 
 parser$add_argument(
@@ -20,19 +21,27 @@ parser$add_argument(
 args <- parser$parse_args(commandArgs(trailingOnly = TRUE))
 
 # Manipulate file paths to determine stat types
-all_files <- list.files(path = args$dir, pattern = "*.stat")
-file_names <- stringr::str_extract(all_files, "[\\w\\.\\-\\_]+$")
+files_present <- sapply(args$files, file.exists)
+
+if( !all(files_present) ){
+  stop(
+    "\n  Cannot find the following files: ", 
+    paste(args$files[!files_present], collapse = "\n    ")
+  )
+}
+
+file_names <- stringr::str_extract(args$files, "[\\w\\.\\-\\_]+$")
 file_types <- sub("[\\w\\-\\_]+.", "", file_names, perl = TRUE)
 file_types <- sub(".stat", "", file_types)
 
 # Read in data in a long format
 long_data <- dplyr::bind_rows(
   lapply(
-    structure(all_files, names = file_types), 
-    function(file, dir){
+    structure(args$files, names = file_types), 
+    function(file){
 
       x <- try(
-        expr = read.csv(file = file.path(dir, file), header = FALSE), 
+        expr = read.csv(file = file, header = FALSE), 
         silent = TRUE
       )
 
@@ -55,10 +64,10 @@ long_data <- dplyr::bind_rows(
 
       }
 
-    },
-    dir = args$dir
+    }
   ),
-  .id = "type")
+  .id = "type"
+)
 
 # Transform data into a wide format
 wide_data <- dplyr::mutate(

diff --git a/tools/rscripts/generate_stat_report.R b/tools/rscripts/generate_stat_report.R
@@ -11,7 +11,7 @@
 #   iguide installation directory, will look for sys argument 'IGUIDE_DIR'
 # 
 
-options(stringsAsFactors = FALSE, scipen = 99, width = 180)
+options(stringsAsFactors = FALSE, scipen = 99, width = 120)
 
 args <- commandArgs(trailingOnly = TRUE)
 
@@ -22,10 +22,11 @@ code_dir <- dirname(sub(
 ))
 
 # Check input file ----
-input_file <- args[1]
+core_file <- args[1]
+eval_file <- args[2]
 
-if( !file.exists(input_file) ){
-  stop("\n  Cannot find input stat file. Check inputs.")
+if( !file.exists(core_file) | !file.exists(eval_file) ){
+  stop("\n  Cannot find input stat files. Check inputs.")
 }
 
 # Check output file ----
@@ -134,7 +135,10 @@ build_version <- list.files(file.path(iguide_dir, "etc")) %>%
 signature <- config[["signature"]]
 
 # Load input data ----
-stat_df <- read.csv(input_file) %>%
+core_stat_df <- read.csv(core_file)
+eval_stat_df <- read.csv(eval_file)
+
+stat_df <- dplyr::full_join(core_stat_df, eval_stat_df, by = "sampleName") %>%
   dplyr::mutate_all(function(x) ifelse(is.na(x), rep(0, length(x)), x))
 
 sampleName_levels <- unique(stat_df$sampleName)
@@ -165,9 +169,9 @@ names(read_tbl) <- stringr::str_replace(names(read_tbl), ".reads$", "")
 
 # Alignment outcome table ----
 algn_tbl <- dplyr::select(
-    stat_df, sampleName, align.unique.reads, align.unique.algns, align.unique.loci, 
-    align.multihit.reads, align.multihit.lengths, align.multihit.clusters, 
-    align.chimera.reads
+    stat_df, sampleName, align.unique.reads, align.unique.algns, 
+    align.unique.loci, align.multihit.reads, align.multihit.lengths, 
+    align.multihit.clusters, align.chimera.reads
   ) %>%
   dplyr::filter(sampleName %in% sampleNames) %>%
   dplyr::mutate(sampleName = factor(sampleName, levels = sampleNames)) %>%
@@ -186,7 +190,7 @@ incorp_tbl <- dplyr::select(
   dplyr::mutate(sampleName = factor(sampleName, levels = sampleNames)) %>%
   dplyr::arrange(sampleName)
 
-names(incorp_tbl) <- stringr::str_replace(names(incorp_tbl), "assim.", "")
+names(incorp_tbl) <- stringr::str_replace(names(incorp_tbl), "eval.", "")
 names(incorp_tbl) <- stringr::str_replace(names(incorp_tbl), ".algns$", "")