Merge pull request #31 from cnobles/update_v0.9.5

Update v0.9.5
cnobles · Feb 19, 2019 · 09b7f40 · 09b7f40
2 parents 33b558e + 959eea5
commit 09b7f40
Show file tree

Hide file tree

Showing 44 changed files with 1,626 additions and 373 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -13,9 +13,9 @@ jobs:
       - checkout
 
       # Download and cache dependencies
-      - restore_cache:
-          keys:
-          - v1-dependencies-{{ checksum "etc/build.v0.9.2.txt" }}
+      #- restore_cache:
+      #    keys:
+      #    - v1-dependencies-{{ checksum "etc/build.v0.9.3.txt" }}
 
       - run:
           name: install
@@ -27,7 +27,7 @@ jobs:
           command: |
             bash tests/test.sh
 
-      - save_cache:
-          paths:
-            - ~/miniconda3
-          key: v1-dependencies-{{ checksum "etc/build.v0.9.2.txt" }}
+      #- save_cache:
+      #    paths:
+      #      - ~/miniconda3
+      #    key: v1-dependencies-{{ checksum "etc/build.v0.9.3.txt" }}
diff --git a/.version b/.version
@@ -1 +1 @@
-v0.9.4
+v0.9.5
diff --git a/README.md b/README.md
@@ -70,8 +70,7 @@ iguide run configs/simulation.config.yml -- --latency-wait 30
 cat analysis/simulation/output/unique_sites.simulation.csv
 
 # After run completion, generate a report in a different format than standard
-iguide report analysis/simulation/output/edited_sites.simulation.rds \
-  -c configs/simulation.config.yml \
+iguide report configs/simulation.config.yml \
   -o analysis/simulation/reports/report.simulation.pdf \
   -s sampleInfo/simulation.supp.csv \
   -t pdf
@@ -89,6 +88,17 @@ conda deactivate
 
 ### Changelog:
 
+**v0.9.5 (February 19th, 2019)**
+
+* Updated demultiplexing to be more efficient and better HPC compatible.
+* Added RefSeq Extended* reference gene sets
+  + 'ext' includes curated, predicted, and other RefSeq sets
+  + 'ext.nomodel' includes only curated and other RefSeq sets
+* Incorporated resource allocation for job dependent memory consumption
+  + Works great with HPC to specify memory requirements
+* Streamlined input for report generation by only requiring config(s)
+
+
 **v0.9.4 (January 30th, 2019)**
 
 * Updated 'report' utility and formating

diff --git a/Snakefile b/Snakefile
@@ -8,6 +8,7 @@ import sys
 import re
 import yaml
 import configparser
+from pathlib import Path
 from iguidelib import import_sample_info, choose_sequence_data
 
 if not config:
@@ -51,6 +52,48 @@ RUN_DIR = ROOT_DIR + "/analysis/" + RUN
 if not os.path.isdir(ROOT_DIR):
     raise SystemExit("Path to iGUIDE is not found. Check environmental variables.")
 
+# Check for sequence file paths
+if not os.path.isdir(config["Seq_Path"]):
+    raise SystemExit("Path to sequencing files is not found (Seq_Path). Check your config file.")
+
+
+# Default params if not included in config
+if not "maxNcount" in config:
+    config["maxNcount"] = 1
+
+if not "demultiCores" in config: 
+    demulti_cores = snakemake.utils.available_cpu_count()
+else:
+    demulti_cores = min(
+        config["demultiCores"], snakemake.utils.available_cpu_count()
+    )
+
+## Memory params
+if not "demultiMB" in config:
+    config["demultiMB"] = 16000
+
+if not "trimMB" in config:
+    config["trimMB"] = 4000
+
+if not "filtMB" in config:
+    config["filtMB"] = 4000
+
+if not "consolMB" in config:
+    config["consolMB"] = 4000
+
+if not "alignMB" in config:
+    config["alignMB"] = 4000
+
+if not "coupleMB" in config:
+    config["coupleMB"] = 4000
+
+if not "processMB" in config:
+    config["processMB"] = 4000
+
+if not "reportMB" in config:
+    config["reportMB"] = 4000
+
+
 # Target Rules
 rule all:
     input: 

diff --git a/analysis/README.md b/analysis/README.md
@@ -1,8 +1,8 @@
 # Analysis directory
-This directory will contain analysis data. A run directory can be initiated by using the following command, 
+This directory will contain analysis data. A run directory can be initiated by using the following command after activating the iguide environment. 
 
 ```
-
+iguide setup <configs/config.yml>
 ```
 
 Run directory names are given in the specific configuration files.
diff --git a/configs/cluster.config.yml b/configs/cluster.config.yml
@@ -0,0 +1,7 @@
+__default__ :
+    nCPUs     : 1
+    memory    : 24000
+    resources : "\"span[hosts=1]\""
+    name      : "JOBNAME.{rule}.{wildcards}"
+    output    : "{RUN_DIR}/logs/{rule}.{wildcards}.out"
+    error     : "{RUN_DIR}/logs/{rule}.{wildcards}.err"
diff --git a/configs/simulation.config.yml b/configs/simulation.config.yml
@@ -8,10 +8,11 @@ Aligner : "blat"
 UMItags : TRUE
 
 # Sequence files
-R1: "tests/Data/Undetermined_S0_L001_R1_001.fastq.gz"
-R2: "tests/Data/Undetermined_S0_L001_R2_001.fastq.gz"
-I1: "tests/Data/Undetermined_S0_L001_I1_001.fastq.gz"
-I2: "tests/Data/Undetermined_S0_L001_I2_001.fastq.gz"
+Seq_Path : "tests/Data/"
+R1: "Undetermined_S0_L001_R1_001.fastq.gz"
+R2: "Undetermined_S0_L001_R2_001.fastq.gz"
+I1: "Undetermined_S0_L001_I1_001.fastq.gz"
+I2: "Undetermined_S0_L001_I2_001.fastq.gz"
 
 # Sequence file directory (if already demultiplexed)
 Demulti_Dir: "/tests/Data/Demult/"
@@ -53,14 +54,25 @@ Treatment :
 Read_Types : ["R1", "R2", "I1", "I2"]
 Genomic_Reads : ["R1", "R2"]
 
+# Memory Management (in MB units)
+demultiMB : 16000
+trimMB : 4000
+filtMB : 4000
+consolMB : 4000
+alignMB : 4000
+coupleMB : 4000
+processMB : 4000
+reportMB : 4000
+
 # Demultiplexing parameters
 barcode1Length : 8
 barcode2Length : 8
 barcode1 : "I1"
 barcode2 : "I2"
 bc1Mismatch : 0
 bc2Mismatch : 0
-demultiCores : 4
+maxNcount : 1
+#demultiCores : 4
 
 # Sequence trimming
 ## R1 sequence
@@ -84,7 +96,7 @@ maxTempLength : 2500
 
 # Post-processing
 refGenes :
-    file : "genomes/hg38.refSeq.rds"
+    file : "genomes/hg38.refSeq.ext.nomodel.rds"
     symbolCol : "name2"
 oncoGeneList : 
     file : "http://bushmanlab.org/assets/doc/allOnco_Feb2017.tsv"

diff --git a/docs/index.rst b/docs/index.rst
@@ -31,4 +31,5 @@ broken into a few parts:
    pages/install.rst
    pages/quickstart.rst
    pages/configinfo.rst
-   pages/sampleinfo.rst   
+   pages/sampleinfo.rst
+   pages/changelog.rst   
diff --git a/docs/pages/changelog.rst b/docs/pages/changelog.rst
@@ -0,0 +1,47 @@
+.. _changelog:
+
+.. contents::
+   :depth: 2
+
+ChangeLog 
+========================
+
+**v0.9.5 (February 19th, 2019)**
+
+* Updated demultiplexing to be more efficient and better HPC compatible.
+* Added RefSeq Extended* reference gene sets
+  + 'ext' includes curated, predicted, and other RefSeq sets
+  + 'ext.nomodel' includes only curated and other RefSeq sets
+* Incorporated resource allocation for job dependent memory consumption
+  + Works great with HPC to specify memory requirements
+* Streamlined input for report generation by only requiring config(s)
+
+**v0.9.4 (January 30th, 2019)**
+
+* Updated 'report' utility and formatting
+  + custom templates now accepted
+  + included as subcommand, check with 'iguide report -h'
+  + pdf and html options report 'nicely' even when printed from either
+* Updated build to v0.9.2 to support new formatting in report
+* Builds are constructed from spec files rather than yaml requirements
+* Included the 'clean' subcommand to reduce size of processed projects
+  + after cleaning a project, only terminal data files will remain
+
+**v0.9.3 (January 11th, 2019)**
+
+* Added 'list_samples' subcommand to list samples within a project.
+* Caught a few bugs and worked them out for smoother processing and reports.
+
+**v0.9.2 (January 7th, 2019)**
+
+* Modified test dataset to run tests quicker and implemented CircleCI checking.
+
+**v0.9.1 (January 6th, 2019)**
+
+* Fixed problematic install for first time conda installers.
+
+**v0.9.0 (January 4th, 2019)**
+
+* Initial release.
+* Supports setup and analysis of GUIDE-seq and iGUIDE experiments.
+* Documentation on `ReadTheDocs.io <https://iguide.readthedocs.io/en/latest/index.html>`_.
diff --git a/docs/pages/configinfo.rst b/docs/pages/configinfo.rst
@@ -88,19 +88,27 @@ Run configuration
 Sequence files
 """"""""""""""
 
+``Seq_Path``
+  This is the file path to the sequence files. Rather than repeating the path
+  for each below, just include the path to the directory containing the files.
+
 ``R1 / R2 / I1 / I2``
   These parameters should be the file names of the sequence files to be 
   analyzed by the iGUIDE software. It is recommened to pass complete sequencing
   files to iGUIDE rather than demultiplexing prior to analysis.
 
+``Demulti_Dir``
+  Path to the directory containing demultiplexed sequence data. This is still
+  under development and may present with bugs.
+
 SampleInfo formating
 """"""""""""""""""""
 
 ``Sample_Name_Column``
   This is the name of the column in the sample information file which contains 
-  information about samples. An appropriate format for the sample names is 
-  "{specimen}-{rep}" where 'specimen' is an alpha-numeric designator for the 
-  specimen and 'rep' is a numeric identifier for technical or biological 
+  identifiable information about samples. An appropriate format for the sample 
+  names is "{specimen}-{rep}" where 'specimen' is an alpha-numeric designator for 
+  the specimen and 'rep' is a numeric identifier for technical or biological 
   replicates, separated by a dash (``-``).
 
 Sequence information
@@ -194,6 +202,23 @@ iGUIDE configuration
   This parameter is similar to the ``Read_Types`` but only indicates which reads
   contain genomic information rather than indexing.
 
+Memory Management
+"""""""""""""""""
+
+``demultiMB / trimMB / filtMB / consolMB / alignMB / coupleMB / processMB / reportMB``
+  Controls the amount of memory allocated to each of these processes during 
+  snakemake processing. While working on a server or multicored machine, these
+  parameters will work internally to help schedule jobs. Each value will act as
+  an upper limit for the amount of MB of RAM to expect the process to take, and 
+  schedule jobs appropriately using the ``--resources mem_mb={limitMB}`` flag with
+  snakemake. During HPC use, these parameters can be combined with the cluster config
+  to schedule specific memory requirements for jobs. Additionally, if the 
+  ``--restart-times {x}`` flag is used, where "x" is the number of times to restart a job
+  if it fails, then the amount of memory for the job will increase by a unit of the 
+  parameter. For example, if a trimming job fails because it runs out of memory, then
+  restarting the job will try to allocate 2 times the memory for the second attempt.
+  All parameters should be in megabytes (MB).
+
 Demultiplexing parameters
 """""""""""""""""""""""""
 
@@ -209,11 +234,6 @@ Demultiplexing parameters
   An integer value indicating the number of tolarated mismatches in the barcode
   sequences for either barcode 1 or 2.
 
-``demultiCores``
-  The number of core to be requested during demultiplexing. This can be a 
-  memory intensive process and therefore can be limited here by using a smaller
-  value than given the the ``iguide run`` command.
-
 Sequence trimming
 """""""""""""""""
 
@@ -338,4 +358,4 @@ Report
 ``signature``
   Character string included at the beginning of reports to denote the author,
   analyst, laboratory, etc. Make sure you change if you don't want Chris 
-  getting credit for all the work.
+  getting credit for your work.
diff --git a/docs/pages/quickstart.rst b/docs/pages/quickstart.rst
@@ -87,14 +87,18 @@ processed using the following command::
 Snakemake offers a great number of resources for managing the processing through 
 the pipeline. I recommend familiarizing yourself with the utility 
 (https://snakemake.readthedocs.io/en/stable/). Here are some helpful snakemake
-options that can be passed to iGUIDE by appending to the iguide command after '--':
-
-* [\-\-configfile X] associate a specific configuration for processing, essential for processing
-* [\-\-cores X] multicored processing, specified cores to use by X
-* [\-\-nolock] process multiple runs a the same time, from different sessions
-* [\-\-notemp] keep all temporary files, otherwise removed
-* [\-\-keep-going] will keep processing if one or more job error out
-* [-w X, \-\-latency-wait X] wait X seconds for the output files to appear before erroring out
+options that can be passed to iGUIDE by appending to the iguide command after ``--``:
+
+* ``[--configfile X]`` associate a specific configuration for processing, essential for processing but already passed in by ``iguide``.
+* ``[--cores X]`` multicored processing, specified cores to use by X.
+* ``[--nolock]`` process multiple runs at the same time, from different sessions.
+* ``[--notemp]`` keep all temporary files, otherwise removed.
+* ``[--keep-going]`` will keep processing if one or more jobs error out.
+* ``[-w X, --latency-wait X]`` wait X seconds for the output files to appear before erroring out.
+* ``[--restart-times X]`` X is the number of times to restart a job if it fails. Defaults to 0, but is used in ``iguide`` to increase memory allocation.
+* ``[--resources mem_mb=X]`` Defined resources, for ``iguide`` the mem_mb is the MB units to allow for memory allocation to the whole run. For HPC, this can be coupled with ``--cluster-config`` to request specific resources for each job.
+* ``[--rerun-incomplete, --ri]`` Re-run all jobs whose output is recognized as incomplete, useful if your run gets terminated before finishing.
+* ``[--cluster-config FILE]`` A JSON or YAML file that defines wildcards used for HPC.
 
 
 An Example Run