Merge pull request #39 from cnobles/update_v0.9.9

Update v0.9.9
cnobles · Jun 19, 2019 · 9ccd22d · 9ccd22d
2 parents 39ee0c9 + ab0f85d
commit 9ccd22d
Show file tree

Hide file tree

Showing 26 changed files with 1,428 additions and 1,261 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -3,6 +3,7 @@ before_install:
   - sudo apt-get -qq update
   - sudo apt-get install xdotool
   - sudo apt-get install texlive-latex-extra
+  - sudo apt-get install libgfortran3
 install:
   - bash install.sh
 script: bash etc/tests/test.sh
diff --git a/.version b/.version
@@ -1 +1 @@
-v0.9.8
+v0.9.9
diff --git a/README.md b/README.md
@@ -63,35 +63,33 @@ conda activate iguide
 iguide list_samples configs/simulation.config.yml
 
 # Create test analysis directory
-# (The simulation configuration file is used by default and does not need to be specified)
 
 iguide setup configs/simulation.config.yml
 
 # Process a simulation dataset
 
 iguide run configs/simulation.config.yml -- -np
 iguide run configs/simulation.config.yml -- --latency-wait 30
-cat analysis/simulation/output/unique_sites.simulation.csv
 
-# Processing will complete with a report, but if additional analyses are required,
-# you can reevaluate the 'incorp_sites' object. Multiple objects can be evaluated
-# together, just include the run files.
+# Processing will complete with several reports, but if additional analyses are required,
+# you can re-evaluate a run by its config file. Multiple runs can be evaluated together, 
+# just include multiple config files.
 
-iguide eval analysis/simulation/output/incorp_sites.simulation.rds \
+iguide eval configs/simulation.config.yml \
   -o analysis/simulation/output/iguide.eval.simulation.test.rds \
   -s sampleInfo/simulation.supp.csv
 
 # After evaluation, generate a report in a different format than standard.
 # Additionally the evaluation and report generation step can be combined using 
-# config file(s) as inputs for the 'report' subcommand.
+# config file(s) as inputs for the 'report' subcommand (using the -c flag instead of -e).
 
 iguide report -e analysis/simulation/output/iguide.eval.simulation.test.rds \
   -o analysis/simulation/reports/report.simulation.pdf \
   -s sampleInfo/simulation.supp.csv \
   -t pdf
 
 # When you are all finished and ready to archive / remove excess files, a minimal configuration
-# can be achived with the 'clean' subcommand.
+# can be achieved with the 'clean' subcommand.
 
 iguide clean configs/simulation.config.yml
 
@@ -106,6 +104,18 @@ conda deactivate
 
 ### Changelog:
 
+**v0.9.9 (June 10th, 2019)**
+
+* Modified the assimilate + evaluate workflow
+  + Assimilate now only includes reference genome data, meaning a cleaner intermediate file
+  + Evaluate will now handle ref. gene sets and further analysis
+  + This increases the modularity and consistency of the workflow
+* Revised the iGUIDE Report format to be more informational and clearer
+* Revised a bit of the workflow to make reprocessing smoother
+* Updated BLAT coupling script to be more memory efficient
+* Fixed TravisCI testing!
+* Changed stat workflow, now restarting analysis won't initiate a total reprocessing.
+
 **v0.9.8 (April 19th, 2019)**
 
 * iGUIDE can now support non-Cas9 nucleases as well!

diff --git a/Snakefile b/Snakefile
@@ -55,7 +55,10 @@ if not os.path.isdir(ROOT_DIR):
 # Check for sequence file paths
 if not os.path.isdir(config["Seq_Path"]):
     raise SystemExit("Path to sequencing files is not found (Seq_Path). Check your config file.")
-
+
+# Check for config symlink to check proper run directory setup
+if not os.path.isfile(RUN_DIR + "/config.yml"):
+    raise SystemExit("Path to symbolic config is not present. Check to make sure you've run 'iguide setup' first.")
 
 # Default params if not included in config
 if not "maxNcount" in config:
@@ -100,7 +103,6 @@ if not "reportMB" in config:
 # Target Rules
 rule all:
     input: 
-      uniq_sites=RUN_DIR + "/output/unique_sites." + RUN + ".csv.gz",
       incorp_sites=RUN_DIR + "/output/incorp_sites." + RUN + ".rds",
       report=RUN_DIR + "/reports/report." + RUN + ".html",
       summary=RUN_DIR + "/reports/summary." + RUN + ".txt",

diff --git a/docs/pages/changelog.rst b/docs/pages/changelog.rst
@@ -6,6 +6,20 @@
 ChangeLog 
 =========
 
+**v0.9.9 (June 10th, 2019)**
+
+* Modified the assimilate + evaluate workflow
+
+  - Assimilate now only includes reference genome data, meaning a cleaner intermediate file
+  - Evaluate will now handle ref. gene sets and further analysis
+  - This increases the modularity and consistency of the workflow
+
+* Revised the iGUIDE Report format to be more informational and clearer
+* Revised a bit of the workflow to make reprocessing smoother
+* Updated BLAT coupling script to be more memory efficient
+* Fixed TravisCI testing!
+* Changed stat workflow, now restarting analysis won't initiate a total reprocessing.
+
 **v0.9.8 (April 19th, 2019)**
 
 * iGUIDE can now support non-Cas9 nucleases as well!

diff --git a/docs/pages/quickstart.rst b/docs/pages/quickstart.rst
@@ -123,57 +123,57 @@ terminal, so you can see what snakemake is about to perform. Next, the test data
 is moved to the input directory underneath the new test run directory. Then the 
 entirety of processing can start.::
 
-  # After constructing the config file and having reference files 
-  # (i.e. sampleinfo). You can check the samples associated with the run.
-  
+  # If conda is not in your path ...
+
+  source ${HOME}/miniconda3/etc/profile.d/conda.sh
+
+  # Activate iguide environment
+
+  conda activate iguide
+
+  # After constructing the config file and having reference files (i.e. sampleinfo),
+  # you can check the samples associated with the run.
+
   iguide list_samples configs/simulation.config.yml
 
   # Create test analysis directory
-  # (The simulation configuration file is used by default and does not need to 
-  # be specified)
-  
+
   iguide setup configs/simulation.config.yml
 
   # Process a simulation dataset
 
   iguide run configs/simulation.config.yml -- -np
   iguide run configs/simulation.config.yml -- --latency-wait 30
-  zcat analysis/simulation/output/unique_sites.simulation.csv.gz
 
-  # Processing will complete with a report, but if additional analyses are 
-  # required, you can reevaluate the 'incorp_sites' object. Multiple objects 
-  # can be evaluated together, just include the run files.
+  # Processing will complete with several reports, but if additional analyses are required,
+  # you can re-evaluate a run by its config file. Multiple runs can be evaluated together,
+  # just include multiple config files.
 
-  iguide eval analysis/simulation/output/incorp_sites.simulation.rds \
+  iguide eval configs/simulation.config.yml \
     -o analysis/simulation/output/iguide.eval.simulation.test.rds \
     -s sampleInfo/simulation.supp.csv
 
   # After evaluation, generate a report in a different format than standard.
-  # Additionally the evaluation and report generation step can be combined using 
-  # config file(s) as inputs for the 'report' subcommand. For PDF output, you'll
-  # need to verify that your system has the correct latex-based software
-  # support, such as 'texlive'.
+  # Additionally the evaluation and report generation step can be combined using
+  # config file(s) as inputs for the 'report' subcommand (using the -c flag instead of -e).
 
   iguide report -e analysis/simulation/output/iguide.eval.simulation.test.rds \
     -o analysis/simulation/reports/report.simulation.pdf \
     -s sampleInfo/simulation.supp.csv \
     -t pdf
 
-  # If you are looking for a quick and consise report of the output, use the 
-  # 'summary' subcommand with input of either a config file(s) or a single 
-  # evaluation file, generated by the 'eval' subcommand. 
-  
-  iguide summary -e analysis/simulation/output/iguide.eval.simulation.test.rds
-
-  # When you are all finished and ready to archive / remove excess files, a 
-  # minimal configuration can be achived with the 'clean' subcommand.
+  # When you are all finished and ready to archive / remove excess files, a minimal configuration
+  # can be achieved with the 'clean' subcommand.
 
   iguide clean configs/simulation.config.yml
 
   # Or you realized you messed up all the input and need to restart
 
   iguide clean configs/simulation.config.yml --remove_proj
 
+  # Deactivate the environment
+
+  conda deactivate
 
 Uninstall
 ---------

diff --git a/etc/tests/simulation.digests.yml b/etc/tests/simulation.digests.yml
@@ -1,16 +1,16 @@
 # Simulation output check sums (md5)
 
 file1 :
-    name : "unique_sites.simulation.csv"
-    path : "analysis/simulation/output/unique_sites.simulation.csv.gz"
-    md5  : "c4cd8f98201fe8dd613f4eab9287cc78"
-
-file2 :
     name : "incorp_sites.simulation.rds"
     path : "analysis/simulation/output/incorp_sites.simulation.rds"
-    md5  : "1c92defd2037daf5ad540157ff523663"
+    md5  : "dbfc152e8e3ee129c11ab953ed8aa9de"
+
+file2 :
+    name : "stats.core.simulation.csv"
+    path : "analysis/simulation/output/stats.core.simulation.csv"
+    md5  : "b593a0e58c97f48a5b367184bd440ad3"
 
 file3 :
-    name : "stats.simulation.csv"
-    path : "analysis/simulation/output/stats.simulation.csv"
-    md5  : "c3f087be1a092d1d7dfce053936fb238"
+    name : "stats.eval.simulation.csv"
+    path : "analysis/simulation/output/stats.eval.simulation.csv"
+    md5  : "57158a4826685a8024b3281284ac42b6"
diff --git a/etc/tests/test.sh b/etc/tests/test.sh
@@ -15,14 +15,15 @@ conda activate ${__IGUIDE_ENV}
 # Create test analysis directory
 iguide setup configs/simulation.config.yml
 
-# Generate test DAG graph
+# Generate test DAG graph and run
 iguide run configs/simulation.config.yml -- -np
 
 iguide run configs/simulation.config.yml -- --dag --nolock | dot -Tsvg > \
     analysis/simulation/reports/simulation.dag.svg
 
-iguide run configs/simulation.config.yml -- -p -w 30 --nolock --cores ${__CORES}
+iguide run configs/simulation.config.yml -- -p -w 30 --notemp --nolock --cores ${__CORES}
 
+# Evaluate and report out using a different metadata set
 iguide eval configs/simulation.config.yml \
     -o analysis/simulation/reports/iguide.eval.simulation.test.rds \
     -s sampleInfo/simulation.supp.csv 

diff --git a/rules/arch.rules b/rules/arch.rules
@@ -2,7 +2,7 @@
 # Architecture Rules
 # Related to setting up analysis directories and consolidating data
 
-rule generate_stat_matrix:
+rule core_stat_matrix:
   input:
     demulti=RUN_DIR + "/process_data/" + RUN + ".demulti.stat",
     trimR1=expand(
@@ -22,22 +22,29 @@ rule generate_stat_matrix:
       RUN_DIR + "/process_data/{sample}.align.stat", sample=SAMPLES),
     assim=RUN_DIR + "/process_data/" + RUN + ".assim.stat"
   output:
-    RUN_DIR + "/output/stats." + RUN + ".csv"
+    RUN_DIR + "/output/stats.core." + RUN + ".csv"
   params:
-    dir=RUN_DIR + "/process_data",
     tool=ROOT_DIR + "/tools/rscripts/collect_stats.R"
   resources:
     mem_mb = lambda wildcards, attempt: attempt * config["defaultMB"]
-  shell:
-    """
-    Rscript {params.tool} {params.dir} -o {output}
-    """
+  shell: "Rscript {params.tool} {input} -o {output}"
+
+rule eval_stat_matrix:
+  input: RUN_DIR + "/process_data/" + RUN + ".eval.stat"
+  output: RUN_DIR + "/output/stats.eval." + RUN + ".csv"
+  params:
+    tool=ROOT_DIR + "/tools/rscripts/collect_stats.R"
+  resources:
+    mem_mb = lambda wildcards, attempt: attempt * config["defaultMB"]
+  shell: "Rscript {params.tool} {input} -o {output}"
 
 rule gen_stat_report:
-  input: RUN_DIR + "/output/stats." + RUN + ".csv"
+  input: 
+    core = RUN_DIR + "/output/stats.core." + RUN + ".csv",
+    eval = RUN_DIR + "/output/stats.eval." + RUN + ".csv"
   output: RUN_DIR + "/reports/runstats." + RUN + ".html"
   params: 
-    tool = ROOT_DIR + "/tools/rscripts/write_stat_report.R",
+    tool = ROOT_DIR + "/tools/rscripts/generate_stat_report.R",
     config = RUN_DIR + "/" + "config.yml"
   log: RUN_DIR + "/logs/" + RUN + ".runstats.log"
   resources:

diff --git a/rules/consol.rules b/rules/consol.rules
@@ -17,6 +17,6 @@ rule consolidate:
   shell:
     """
     Rscript {params.tool} {input} -o {output.consol} -k {output.key} \
-    --stat {output.stat} > {log} 2>&1
+      --stat {output.stat} > {log} 2>&1
     """
 
diff --git a/rules/demulti.rules b/rules/demulti.rules
@@ -4,7 +4,7 @@
 rule demultiplex:
   input:
     configFile=ancient("configs/" + RUN + ".config.yml"),
-    sampleInfo=config["Sample_Info"],
+    sampleInfo=ancient(config["Sample_Info"]),
     R1=str(Path(config["Seq_Path"]) / config["R1"]),
     R2=str(Path(config["Seq_Path"]) / config["R2"]),
     I1=str(Path(config["Seq_Path"]) / config["I1"]),
@@ -19,8 +19,7 @@ rule demultiplex:
       RUN_DIR + "/process_data/degenerate.{type}.fastq.gz", type=TYPES)),
     unas=temp(expand(
       RUN_DIR + "/process_data/unassigned.{type}.fastq.gz", type=TYPES)),
-    stat=temp(
-      RUN_DIR + "/process_data/" + RUN + ".demulti.stat")
+    stat=temp(RUN_DIR + "/process_data/" + RUN + ".demulti.stat")
   params:
     tool=ROOT_DIR + "/tools/rscripts/demulti.R",
     bc1Len=config["barcode1Length"],

diff --git a/rules/process.rules b/rules/process.rules
@@ -5,7 +5,7 @@ rule all_uniq_sites:
   input:
     expand(RUN_DIR + "/process_data/{sample}.uniq.csv", sample=SAMPLES)
   output:
-    RUN_DIR + "/output/unique_sites." + RUN + ".csv"
+    temp(RUN_DIR + "/output/unique_sites." + RUN + ".csv")
   params:
     RUN_DIR + "/process_data"
   resources:
@@ -18,15 +18,6 @@ rule all_uniq_sites:
     done
     """
 
-rule compress_uniq_sites:
-  input: 
-    sites=RUN_DIR + "/output/unique_sites." + RUN + ".csv",
-    edits=RUN_DIR + "/output/incorp_sites." + RUN + ".rds"
-  output: RUN_DIR + "/output/unique_sites." + RUN + ".csv.gz"
-  resources:
-    mem_mb = lambda wildcards, attempt: attempt * config["defaultMB"]
-  shell: "gzip {input.sites}"
-
 
 def all_umitag_inputs(wildcards):
   if (config["UMItags"]):
@@ -51,8 +42,8 @@ rule assimilate_sites:
     incorp=RUN_DIR + "/output/incorp_sites." + RUN + ".rds",
     stat=temp(RUN_DIR + "/process_data/" + RUN + ".assim.stat")
   params:
-    config = RUN_DIR + "/" + "config.yml",
-    tool = ROOT_DIR + "/tools/rscripts/assimilate_incorp_data.R"
+    config=RUN_DIR + "/" + "config.yml",
+    tool=ROOT_DIR + "/tools/rscripts/assimilate_incorp_data.R"
   log:
     RUN_DIR + "/logs/" + RUN + ".assim.log"
   resources:
@@ -69,18 +60,20 @@ rule assimilate_sites:
 
 rule iguide_evaluation:
   input: RUN_DIR + "/output/incorp_sites." + RUN + ".rds"
-  output: temp(RUN_DIR + "/output/iguide.eval." + RUN + ".rds")
+  output: 
+    eval=temp(RUN_DIR + "/output/iguide.eval." + RUN + ".rds"),
+    stat=temp(RUN_DIR + "/process_data/" + RUN + ".eval.stat")
   params: 
     tool = ROOT_DIR + "/tools/rscripts/evaluate_incorp_data.R",
     config = RUN_DIR + "/" + "config.yml"
   log: RUN_DIR + "/logs/" + RUN + ".eval.log"
   resources:
     mem_mb=lambda wildcards, attempt: attempt * config["evaluateMB"]
   run:
-    call_str="Rscript {params.tool} {params.config} -o {output}"
+    call_str="Rscript {params.tool} {params.config} -o {output.eval}"
     if (config["suppFile"]):
       call_str=call_str + " -s " + ROOT_DIR + "/" + config["Supplemental_Info"]
-    call_str=call_str + " > {log} 2>&1"
+    call_str=call_str + " --stat {output.stat} > {log} 2>&1"
     shell(call_str)
 
 
@@ -111,6 +104,7 @@ rule run_report:
     Rscript {params.tool} {input} -o {output} {params.supp} > {log} 2>&1
     """
 
+
 rule run_summary:
   input: RUN_DIR + "/output/iguide.eval." + RUN + ".rds"
   output: RUN_DIR + "/reports/summary." + RUN + ".txt"

diff --git a/rules/quality.blat.rules b/rules/quality.blat.rules
@@ -9,8 +9,8 @@ rule post_align:
     keyR2=RUN_DIR + "/process_data/{sample}.R2.key.csv"
   output:
     uniq=temp(RUN_DIR + "/process_data/{sample}.uniq.csv"),
-    chimera=RUN_DIR + "/process_data/{sample}.chimera.rds",
-    multihit=RUN_DIR + "/process_data/{sample}.multihits.rds",
+    chimera=temp(RUN_DIR + "/process_data/{sample}.chimera.rds"),
+    multihit=temp(RUN_DIR + "/process_data/{sample}.multihits.rds"),
     stat=temp(RUN_DIR + "/process_data/{sample}.align.stat")
   params:
     tool=ROOT_DIR + "/tools/rscripts/couple.R",

diff --git a/rules/umitag.rules b/rules/umitag.rules
@@ -6,7 +6,7 @@ rule collect_umitags:
     RUN_DIR + "/process_data/{sample}.I2.fastq.gz"
   output:
     seq=temp(RUN_DIR + "/process_data/{sample}.I2.trim.fastq.gz"),
-    umi=RUN_DIR + "/process_data/{sample}.umitags.fasta.gz",
+    umi=temp(RUN_DIR + "/process_data/{sample}.umitags.fasta.gz"),
     stat=temp(RUN_DIR + "/process_data/{sample}.umitags.stat")
   params:
     tool=ROOT_DIR + "/tools/rscripts/trim.R",