Merge pull request #69 from cnobles/update_1.1.0

Update 1.1.0
cnobles · Mar 10, 2020 · 1bd8d4f · 1bd8d4f
2 parents 20920fa + fd49b79
commit 1bd8d4f
Show file tree

Hide file tree

Showing 35 changed files with 1,650 additions and 616 deletions.
diff --git a/.version b/.version
@@ -1 +1 @@
-v1.0.2
+v1.1.0
diff --git a/README.md b/README.md
@@ -14,7 +14,17 @@ To get started, checkout the iGUIDE documentation at [iGUIDE.ReadTheDocs.io](htt
 
 ### Changelog:
 
-**v1.0.0 (August 15th,2019)**
+**v1.1.0 (March 8th, 2020)**
+
+* Modified how samples designated as Mock are treated during the analysis
+* Mock samples can now be indicated by "None" or "Control" as well 
+  (case-insensitive)
+* Abundance can now be selected as [Read], [UMI], or [Fragment]{default} within 
+  config parameters and this selection will identify the abundance method used
+  for analysis
+* Added support for alternative UMI method (dx.doi.org/10.17504/protocols.io.wikfccw)
+
+**v1.0.0 (August 15th, 2019)**
 
 * Release of version 1.0.0!!!
 * iGUIDE is a computational pipeline that supports the detection of DSBs induced

diff --git a/Snakefile b/Snakefile
@@ -66,6 +66,24 @@ elif ".tsv" in config["Sample_Info"]:
 else:
     raise SystemExit("\n  Sample Info file needs to contain extention '.csv' or '.tsv'.\n")
 
+# Default params if not included in config
+if not "maxNcount" in config:
+    config["maxNcount"] = 1
+
+if not "demultiCores" in config: 
+    demulti_cores = snakemake.utils.available_cpu_count()
+else:
+    demulti_cores = min(
+        config["demultiCores"], snakemake.utils.available_cpu_count()
+    )
+
+if not "skipDemultiplexing" in config:
+    config["skipDemultiplexing"] = False
+
+if not "Alternate_UMI_Method" in config:
+    config["Alternate_UMI_Method"] = False
+
+
 # Sample information
 sampleInfo = import_sample_info(
     config["Sample_Info"], config["Sample_Name_Column"], delim)
@@ -75,28 +93,19 @@ READ_TYPES=config["Read_Types"]
 READS=config["Genomic_Reads"]
 REQ_TYPES=READS[:]
 
-if config["UMItags"]: 
+if config["UMItags"] and not config["Alternate_UMI_Method"]: 
     REQ_TYPES.append("I2")
 
 R1_LEAD=choose_sequence_data(config["R1_Leading_Trim"], sampleInfo)
 R1_OVER=choose_sequence_data(config["R1_Overreading_Trim"], sampleInfo)
 R2_LEAD=choose_sequence_data(config["R2_Leading_Trim"], sampleInfo)
-R2_LEAD_ODN=choose_sequence_data(config["R2_Leading_Trim_ODN"], sampleInfo)
 R2_OVER=choose_sequence_data(config["R2_Overreading_Trim"], sampleInfo)
 
-# Default params if not included in config
-if not "maxNcount" in config:
-    config["maxNcount"] = 1
-
-if not "demultiCores" in config: 
-    demulti_cores = snakemake.utils.available_cpu_count()
+if config["Alternate_UMI_Method"]:
+    R1_LEAD_ODN=choose_sequence_data(config["R1_Leading_Trim_ODN"], sampleInfo)
 else:
-    demulti_cores = min(
-        config["demultiCores"], snakemake.utils.available_cpu_count()
-    )
+    R2_LEAD_ODN=choose_sequence_data(config["R2_Leading_Trim_ODN"], sampleInfo)
 
-if not "skipDemultiplexing" in config:
-    config["skipDemultiplexing"] = False
 
 ## Memory and default params
 if not "demultiMB" in config:
@@ -160,7 +169,10 @@ rule all:
       stats=RUN_DIR + "/reports/runstats." + RUN + ".html"
 
 # Architecture Rules
-include: "rules/arch.rules"
+if (config["Alternate_UMI_Method"]):
+    include: "rules/arch.umi_alt_method.rules"
+else:
+    include: "rules/arch.rules"
 
 # Processing Rules
 if (config["skipDemultiplexing"]):
@@ -169,11 +181,18 @@ else:
     include: "rules/demulti.rules"
 
 include: "rules/binning.rules"
-include: "rules/trim.rules"
+
+if (config["Alternate_UMI_Method"]):
+    include: "rules/trim.umi_alt_method.rules"
+else:
+    include: "rules/trim.rules"
 
 if (config["UMItags"]):
-    include: "rules/umitag.rules"
-    UMIseqs = sampleInfo["barcode2"]
+    if (config["Alternate_UMI_Method"]):
+        include: "rules/umitag.umi_alt_method.rules"
+    else:
+        include: "rules/umitag.rules"
+        UMIseqs = sampleInfo["barcode2"]
 else:
     include: "rules/umitag_stub.rules"
 
@@ -182,10 +201,16 @@ include: "rules/filt.rules"
 if (config["Aligner"] == "BLAT" or config["Aligner"] == "blat"):
     include: "rules/consol.rules"
     include: "rules/align.blat.rules"
-    include: "rules/quality.blat.rules"
+    if (config["Alternate_UMI_Method"]):
+        include: "rules/quality.blat.umi_alt_method.rules"
+    else:
+        include: "rules/quality.blat.rules"
 elif (config["Aligner"] == "BWA" or config["Aligner"] == "bwa"):
     include: "rules/consol_stub.rules"
-    include: "rules/align.bwa.rules"
+    if (config["Alternate_UMI_Method"]):
+        include: "rules/align.bwa.umi_alt_method.rules"
+    else:
+        include: "rules/align.bwa.rules"
     include: "rules/quality.sam.rules"
 else:
     raise SystemExit( 

diff --git a/configs/cluster.config.yml b/configs/cluster.config.yml
diff --git a/configs/simulation.config.yml b/configs/simulation.config.yml
@@ -5,6 +5,7 @@ Supplemental_Info : "sampleInfo/simulation.supp.csv"
 Ref_Genome : "hg38"
 Aligner : "blat"
 UMItags : TRUE
+Abundance_Method : "Fragment"
 
 # Sequence files
 Seq_Path : "etc/tests/Data"

diff --git a/configs/umi_alt_example.config.yml b/configs/umi_alt_example.config.yml
@@ -0,0 +1,154 @@
+# Run configuration
+Run_Name : "umi_alt_example"
+Sample_Info : "sampleInfo/umi_alt_example.sampleInfo.csv"
+Supplemental_Info : "sampleInfo/umi_alt_example.supp.csv"
+Ref_Genome : "hg38"
+Aligner : "bwa"
+UMItags : TRUE
+Abundance_Method : "UMI"
+Alternate_UMI_Method : TRUE
+
+# Sequence files
+Seq_Path : "analysis/umi_alt_example/input_data"
+R1: "Undetermined_S0_L001_R1_001.fastq.gz"
+R2: "Undetermined_S0_L001_R2_001.fastq.gz"
+I1: "Undetermined_S0_L001_I1_001.fastq.gz"
+I2: "Undetermined_S0_L001_I2_001.fastq.gz"
+
+# SampleInfo formatting
+Sample_Name_Column : "sampleName"
+
+# Sequence information
+R2_Leading_Trim : "NNNNNNNNSCTACAAGAGCGGTGAGT"
+R2_Overreading_Trim : "GTTTAATTGAGTTGTCATATGTTAATAACGG" # Minus option, use "CCGTTATTAACATATGACAACTCAATTAAAC" for Plus.
+R1_Leading_Trim : "NNNCCGTTATTAACATATGACAACTCAATTAAAC"  # Minus option, use "NNNGTTTAATTGAGTTGTCATATGTTAATAACGG" for Plus.
+R1_Leading_Trim_ODN : "."                               # Minus option, use "TAT" for Plus.
+R1_Overreading_Trim : "ACTCACCGCTCTTGTAG"
+
+# Target sequence information, do not include PAM sequence with gRNAs
+Target_Sequences:
+    EMXs1 : "GAGTCCGAGCAGAAGAAGAA"
+On_Target_Sites :
+    EMXs1 : "chr2:+:72933869"
+
+# Specimen target treatment
+# Data can be input here or referenced in sampleInfo by just putting 
+# 'sampleInfo:{column}'. For simplicity, if all specimens were treated with the 
+# same target sequence(s), the sample can be named 'all' followed by the treated
+# target sequence name(s). If placing data in sampleInfo, delimit multiple 
+# target sequence names with a semicolon ';'.
+Treatment :
+    all : "EMXs1"
+
+# Specimen nuclease treatment
+# Similar to target treatment above, this parameter dictates which nuclease(s)
+# were used on the specimens. This refers to the class of nuclease, such as
+# Cas9 or Cpf1, which behave differently when they edit DNA. Notation can follow
+# the same as above, if all specimens were treated with the same class of 
+# nuclease, then just specify 'all : "{nuclease_profile}"', or list out by 
+# specimen. Additionally you can specify the column in sampleInfo in the same 
+# format as above. Currently, iGUIDE does not support processing for specimens 
+# with multiple classes of nuclease profiles. Only one profile can be specified
+# per specimen.
+Nuclease :
+    all : "Cas9"
+
+# Nuclease profile
+# Whatever name is given to the nuclease, specify parameters below that dictate
+# how the nuclease cuts DNA. 'PAM' - protospacer adjacent motif - should be 
+# specified here and can contain ambiguous nucleotides. 'PAM_Loc' indicates the 
+# location of the PAM with respect to the pattern, either '5p', '3p' or FALSE.
+# 'PAM_Tol' indicates the tolerance for mismatches in the PAM sequence (ignored
+# if PAM is FALSE). 'Cut_Offset' indicates the offset from the 5' nucleotide of 
+# the PAM sequence where the nuclease creates a double strand break, unless PAM 
+# is FALSE, then the 5' position of the target sequence (also accepts 
+# "mid_insert" to specify middle of region between paired alignments). 
+# 'Insert_size' is used if target sequences are expected to flank each other for
+# editing, such as with TALENs, and indicates the expected size of the insert. 
+# To input a range, delimit the min and max by a colon, ie. 15:21. All names of 
+# nucleases used to treat specimens need to have a profile. Additional profiles 
+# should be added under the 'Nuclease_Profiles' parameter.
+Nuclease_Profiles :
+    Cas9 :
+        PAM : "NGG"
+        PAM_Loc : "3p"
+        PAM_Tol : 1
+        Cut_Offset : -4
+        Insert_size : FALSE
+
+# ------------------------------------------------------------------------------
+# iGUIDE configuration
+Read_Types : ["R1", "R2"]
+Genomic_Reads : ["R1", "R2"]
+readNamePattern : "'[\\w\\:\\-\\+]+'"
+
+# Memory Management (in MB units)
+defaultMB : 2000
+demultiMB : 40000
+trimMB : 8000
+filtMB : 4000
+consolMB : 4000
+alignMB : 20000
+qualCtrlMB : 16000
+assimilateMB : 16000
+evaluateMB : 8000
+reportMB : 4000
+
+# Demultiplexing parameters
+skipDemultiplexing : TRUE
+barcode1Length : 8
+barcode2Length : 8
+barcode1 : "I1"
+barcode2 : "I2"
+bc1Mismatch : 0
+bc2Mismatch : 0
+maxNcount : 1
+
+# Sequence trimming
+## R1 sequence
+R1leadMismatch : 3
+R1odnMismatch : 0
+R1overMismatch : 4
+R1overMaxLength : 20
+## R2 sequence
+R2leadMismatch : 2
+R2overMismatch : 4
+R2overMaxLength : 20
+
+# Binning
+bins : 10
+level : 25000
+
+# Reference Alignment
+BLATparams : "-tileSize=11 -stepSize=9 -minIdentity=85 -maxIntron=5 -minScore=27 -dots=1000 -out=psl -noHead"
+BWAparams : "-k 30 -w 2500 -P -L 25 -a"
+
+# Post-alignment filtering
+maxAlignStart : 5
+minPercentIdentity : 95
+minTempLength : 30
+maxTempLength : 2500
+
+# Post-processing
+refGenes :
+    file : "genomes/hg38.refSeq.ext.nomodel.rds"
+    symbolCol : "name2"
+oncoGeneList : 
+    file : "http://bushmanlab.org/assets/doc/allOnco_Feb2017.tsv"
+    symbolCol : "symbol"
+specialGeneList : 
+    file : "http://bushmanlab.org/assets/doc/humanLymph.tsv"
+    symbolCol : "symbol"
+maxTargetMismatch : 6
+upstreamDist : 100
+downstreamDist : 30
+pileUpMin : 3
+recoverMultihits : FALSE
+
+# Report
+suppFile : TRUE
+tables : FALSE
+figures : TRUE
+reportData : FALSE
+infoGraphic : TRUE
+signature : "Christopher L. Nobles, Ph.D. [Bushman Lab]"
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -3,6 +3,25 @@
 ChangeLog
 =========
 
+**v1.1.0 (March 8th, 2020)**
+
+- Modified how samples designated as Mock are treated during the analysis
+- Mock samples can now be indicated by "None" or "Control" as well 
+  (case-insensitive)
+- Abundance can now be selected as [Read], [UMI], or [Fragment]{default} within 
+  config parameters and this selection will identify the abundance method used
+  for analysis
+- Added support for alternative UMI method (dx.doi.org/10.17504/protocols.io.wikfccw)
+
+**v1.0.2 (February 15th, 2020)**
+
+- Bugfix: UMItags set to FALSE will now process through to completion
+- Rebuild: Updated to build version 1.0.1
+
+**v1.0.1 (December 3rd, 2019)**
+
+- Bugfix: Updated Gene set enrichment test within report
+
 **v1.0.0 (August 15th, 2019)**
 
 - Complete support for BLAT and BWA aligners

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -518,10 +518,20 @@ Run configuration
   listed here.
 
 ``UMItags``
-  This is a logical parameter indicating whether to use unique molecular indices
-  (UMI) sequence tags ('TRUE') or to only use unique fragments lengths (see
-  `SonicAbundance <https://doi.org/10.1093/bioinformatics/bts004>`) to quantify
-  abundances of unique observations.
+  This is a logical parameter indicating whether to capture unique molecular 
+  indices (UMI) sequence tags ('TRUE') during processing. **Note:** Ambiguous 
+  nucleotides will need to be identified in the barcode fields of the sampleInfo
+  file. Please see supplied simulated sampleInfo file for example.
+
+``Abundance_Method``
+  Options include 'Read', 'UMI', or 'Fragment' (default) for selecting the 
+  abundance method. 'Fragment' refers to the use of unique fragment lengths 
+  (see `SonicAbundance <https://doi.org/10.1093/bioinformatics/bts004>`_) to 
+  quantify abundances of unique observations. 'UMI' will change the abundance
+  method to use the unique molecular indices (**Note** that the ``UMItags`` 
+  option will need to be set to TRUE for this feature to work). 'Read' will 
+  change the abundance counts to read counts, yet this method may be unreliable
+  due to PCR jackpotting or bias.
 
 
 Sequence files

diff --git a/configs/cluster.highmem.config → ...additional_configs/cluster.highmem.config b/configs/cluster.highmem.config → ...additional_configs/cluster.highmem.config
diff --git a/etc/tests/simulation.digests.yml b/etc/tests/simulation.digests.yml
@@ -13,4 +13,4 @@ file2 :
 file3 :
     name : "stats.eval.simulation.csv"
     path : "analysis/simulation/output/stats.eval.simulation.csv"
-    md5  : "26cd4b00fa40212cd01027d1c11cd76f"
+    md5  : "3c4eff47a2f419a9b47c45259ac70778"