Merge branch 'dev' of https://github.com/NYU-Molecular-Pathology/NGS580-nf into dev
NYU-Molecular-Pathology · May 25, 2019 · 7d97183 · 7d97183
2 parents 2259f96 + a44ee58
commit 7d97183
Show file tree

Hide file tree

Showing 18 changed files with 12,064 additions and 630 deletions.
diff --git a/.config.json b/.config.json
@@ -3,12 +3,16 @@
     "runID": null,
     "fastqDirs": [],
     "demux_samplesheet": "demux-samplesheet.csv",
-    "targetsBed": "targets.bed",
-    "targetsrefFlatBed":"targets.refFlat.580.bed",
-    "numTargetSplitLines": 100,
+    "targetsBed": "targets/targets.580.bed",
+    "targetsAnnotatedBed": "targets/targets.annotated.580.bed",
+    "numTargetSplitLines": 50,
     "HapMapBam": "/gpfs/scratch/kellys04/molecpathlab/ref/HapMap-Pool/HapMap-pool.bam",
     "HapMapBai": "/gpfs/scratch/kellys04/molecpathlab/ref/HapMap-Pool/HapMap-pool.bam.bai",
-    "HapMapBamMd5": "8ec0a749f46fb7259524e4aa8d50f44f",
     "SeraCareSelectedTsv": "data/SeraCare-selected-variants.tsv",
-    "SeraCareErrorRate": 0.02
+    "SeraCareErrorRate": 0.02,
+    "CNVPool": "/gpfs/data/molecpathlab/ref/CNV-Pool/CNV-Pool.580.cnn",
+    "workflowLabel" : "NGS580",
+    "ANNOVAR_BUILD_VERSION": "hg19",
+    "ANNOVAR_PROTOCOL": "refGene,clinvar_20170905,cosmic70,1000g2015aug_all,avsnp150,exac03,snp138",
+    "ANNOVAR_OPERATION": "g,f,f,f,f,f,f"
 }
diff --git a/.gitignore b/.gitignore
@@ -72,3 +72,4 @@ nextflow.html.*
 *.html.*
 trace.txt.*
 data/*
+samples.cnv.tsv
diff --git a/Makefile b/Makefile
@@ -374,6 +374,9 @@ HAPMAP_POOL_SHEET:=samples.hapmap.tsv
 hapmap-pool: $(HAPMAP_POOL_SHEET)
 	./nextflow run hapmap-pool.nf -profile hapmap_pool $(RESUME)
 
+# run the cnv-pool.nf Nextflow workflow with the cnv_pool profile;
+# the samplesheet named by CNV_POOL_SHEET is a prerequisite of the target
+CNV_POOL_SHEET:=samples.cnv.tsv
+cnv-pool: $(CNV_POOL_SHEET)
+	./nextflow run cnv-pool.nf -profile cnv_pool $(RESUME)
 
 
 # save a record of the most recent Nextflow run completion

diff --git a/bin/igv-variant-filter.py b/bin/igv-variant-filter.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Filters the ANNOVAR annotation .tsv table for usage with IGV Snapshots
+
+INPUT: ANNOVAR annotations merged with original .vcf .tsv table
+OUTPUT: Filtered annotations .tsv table
+USAGE: igv-variant-filter.py -c LoFreqSomatic -i "annotations.tsv" -o "sampleID.filtered.tsv"
+
+
+Criteria:
+
+For both matched and unmatched we will apply the following criteria:
+1- VAF >5% tumor
+2- VAF <2% normal
+"""
+import csv
+import sys
+import argparse
+
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE,SIG_DFL)
+"""
+https://stackoverflow.com/questions/14207708/ioerror-errno-32-broken-pipe-python
+"""
+
+frequency_min_tumor = 0.05 # 5%
+frequency_min_normal = 0.02 # 2%
+
+
+def unpaired_filter(row):
+    """
+    Decide whether a variant row is kept by the unpaired filter
+
+    The single criterion is that the row's 'AF' value, converted to float,
+    is strictly greater than `frequency_min_tumor`
+    """
+    variant_af = float(row['AF'])
+    return(variant_af > frequency_min_tumor)
+
+def LoFreqSomatic(fin, fout):
+    """
+    Copy the header of the tab-delimited table `fin` to `fout`, followed by
+    every row for which `unpaired_filter` returns True
+    """
+    table = csv.DictReader(fin, delimiter = '\t')
+    out_table = csv.DictWriter(fout, delimiter = '\t', fieldnames = table.fieldnames)
+    out_table.writeheader()
+    for record in table:
+        if not unpaired_filter(record):
+            continue
+        out_table.writerow(record)
+
+def MuTect2(fin, fout):
+    """
+    Copy the header of the tab-delimited table `fin` to `fout`, followed by
+    every row for which `unpaired_filter` returns True
+    """
+    table = csv.DictReader(fin, delimiter = '\t')
+    out_table = csv.DictWriter(fout, delimiter = '\t', fieldnames = table.fieldnames)
+    out_table.writeheader()
+    for record in table:
+        if not unpaired_filter(record):
+            continue
+        out_table.writerow(record)
+
+
+
+def main(**kwargs):
+    """
+    Main control function for the script
+
+    Keyword arguments:
+    input_file -- path of the table to read; stdin is used when missing or empty
+    output_file -- path of the table to write; stdout is used when missing or empty
+    caller -- name of the variant caller, "LoFreqSomatic" or "MuTect2" (required)
+
+    Exits with status 1 when the caller is not recognized
+    """
+    input_file = kwargs.pop('input_file', None)
+    output_file = kwargs.pop('output_file', None)
+    caller = kwargs.pop('caller')
+
+    # check the caller before touching any files so that an unsupported caller
+    # does not create or truncate the output file; the message goes to stderr
+    # because stdout may be carrying the filtered table
+    if caller not in ("LoFreqSomatic", "MuTect2"):
+        sys.stderr.write("ERROR: caller not recognized: {0}\n".format(caller))
+        sys.exit(1)
+
+    if input_file:
+        fin = open(input_file)
+    else:
+        fin = sys.stdin
+
+    if output_file:
+        fout = open(output_file, "w")
+    else:
+        fout = sys.stdout
+
+    # close both handles even when the filter raises part way through
+    try:
+        if caller == "LoFreqSomatic":
+            LoFreqSomatic(fin, fout)
+        else:
+            MuTect2(fin, fout) # TODO: create this function & filter methods for paired calling
+    finally:
+        fout.close()
+        fin.close()
+
+def parse():
+    """
+    Read the command line options and hand them to main() as keyword arguments
+    """
+    cli = argparse.ArgumentParser(description='Filters the ANNOVAR annotation .tsv table for usage with IGV snapshots')
+    cli.add_argument("-i", dest = 'input_file', default = None, help="Input file")
+    cli.add_argument("-o", dest = 'output_file', default = None, help="Output file")
+    cli.add_argument("-c", "--caller", dest = 'caller', required=True, help="Variant caller used")
+    options = cli.parse_args()
+
+    main(**vars(options))
+
+if __name__ == '__main__':
+    parse()
diff --git a/bin/make-igv-batchscript.py b/bin/make-igv-batchscript.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Script to create an IGV batchscript
+https://software.broadinstitute.org/software/igv/PortCommands
+http://software.broadinstitute.org/software/igv/automation
+https://software.broadinstitute.org/software/igv/batch
+
+example IGV batch script:
+
+new
+snapshotDirectory IGV_Snapshots
+load test_alignments.bam
+genome hg19
+maxPanelHeight 500
+goto chr1:713167-714758
+snapshot chr1_713167_714758_h500.png
+goto chr1:713500-714900
+snapshot chr1_713500_714900_h500.png
+exit
+
+Usage:
+./make-igv-batchscript.py foo.bam bar.bam
+
+"""
+import os
+import argparse
+
+def append_string(string, output_file):
+    """
+    Add `string` followed by a newline to the end of `output_file`;
+    the file is opened in append mode on every call
+    """
+    handle = open(output_file, "a")
+    try:
+        handle.write(string + '\n')
+    finally:
+        handle.close()
+
+def make_regions(regions_file):
+    """
+    Parse the .bed format regions file to generate the IGV location and output filenames
+
+    Each line is split on whitespace. Lines with three or more fields use the
+    first three as chrom, start, stop; lines with exactly two fields use the
+    second field as both start and stop. Lines with fewer than two fields
+    (e.g. blank lines) are skipped.
+
+    Returns a list of dicts with the keys 'chrom', 'start', 'stop', 'loc' and 'filename'
+    """
+    regions = []
+    with open(regions_file) as f:
+        for line in f:
+            fields = line.split()
+            if len(fields) >= 3:
+                chrom, start, stop = fields[0:3]
+            elif len(fields) == 2:
+                chrom, start = fields
+                stop = start
+            else:
+                # nothing usable on this line; without this guard the values
+                # of the previous line would be emitted a second time, or the
+                # first line would fail because chrom/start/stop are not set
+                continue
+            # make IGV format location
+            loc = '{0}:{1}-{2}'.format(chrom, start, stop)
+            filename = '{0}_{1}_{2}.png'.format(chrom, start, stop)
+            region = {'chrom': chrom, 'start': start, 'stop': stop, 'loc': loc, 'filename': filename}
+            regions.append(region)
+    return(regions)
+
+def main(**kwargs):
+    """
+    Build the IGV batchscript by appending one command per line to the
+    batchscript file
+
+    Keyword arguments:
+    input_files -- files to load in IGV (required)
+    regions_file -- regions file passed to make_regions (default "regions.bed")
+    snapshotDirectory -- value for the snapshotDirectory command (default "IGV_snapshots")
+    batchscript_file -- file the commands are appended to (default "IGV_snapshots.bat")
+    image_height -- value for maxPanelHeight, converted to int (default 500)
+    genome -- value for the genome command (default "hg19")
+    """
+    tracks = kwargs.pop('input_files')
+    bed_path = kwargs.pop('regions_file', "regions.bed")
+    snapshot_dir = kwargs.pop('snapshotDirectory', "IGV_snapshots")
+    script_path = kwargs.pop('batchscript_file', "IGV_snapshots.bat")
+    panel_height = int(kwargs.pop('image_height', 500))
+    genome_name = kwargs.pop('genome', "hg19")
+
+    snapshot_regions = make_regions(bed_path)
+
+    def emit(command):
+        # every command is written to the batchscript as soon as it is built
+        append_string(command, script_path)
+
+    emit("new")
+    emit("snapshotDirectory " + snapshot_dir)
+    emit("genome " + genome_name)
+    for track in tracks:
+        emit("load " + track)
+    emit("maxPanelHeight " + str(panel_height))
+    for entry in snapshot_regions:
+        emit("goto " + entry['loc'])
+        emit("snapshot " + entry['filename'])
+    emit("exit")
+
+def parse():
+    """
+    Parses script args and passes them to main() as keyword arguments
+    """
+    parser = argparse.ArgumentParser(description='IGV batchscript creator')
+    parser.add_argument("input_files",
+        nargs='+',
+        help="pathes to the files to create snapshots from e.g. .bam, .bigwig, etc.")
+    parser.add_argument("-r", "--regions",
+        default = "regions.bed",
+        dest = 'regions_file',
+        metavar = 'regions_file',
+        help="Path to .bed formatted regions file")
+    parser.add_argument("-b",
+        default = "IGV_snapshots.bat",
+        dest = 'batchscript_file',
+        metavar = 'batchscript_file',
+        help="Name of the IGV batchscript file to create")
+    # the default is a directory name, matching the fallback used in main();
+    # it must not be the batchscript file name used as the default for -b
+    parser.add_argument("-d",
+        default = "IGV_snapshots",
+        dest = 'snapshotDirectory',
+        metavar = 'snapshotDirectory',
+        help="Name of the IGV snapshot directory to save images to")
+    parser.add_argument("--height",
+        default = 500,
+        dest = 'image_height',
+        metavar = 'image_height',
+        help="Height in pixels of the images to create")
+    parser.add_argument("--genome",
+        default = "hg19",
+        dest = 'genome',
+        metavar = 'genome',
+        help="Name of genome to use in IGV")
+
+    args = parser.parse_args()
+    main(**vars(args))
+
+if __name__ == '__main__':
+    parse()
diff --git a/bin/snp-overlap.R b/bin/snp-overlap.R
@@ -96,18 +96,25 @@ new_names <- sapply(combs, function(x){
     }
 })
 colnames(mat) <- new_names
-
+save.image("loaded.Rdata")
 write.table(x = mat, file = output_matrix, quote = F,sep = "\t", row.names = F)
 
 # convert the matrix to a long format dataframe
 # NOTE: this can get slow for large numbers of combinations...
 overlap_df <- data.frame()
 for( i in seq(length(colnames(mat))) ){
     # make dataframe
-    df <- as.data.frame(elements[[i]])
-
+    # # check that there are elements to make df from...
+    if (length(elements[[i]]) < 1){
+        # create dataframe with no rows
+        df <- setNames(data.frame(matrix(ncol = 2, nrow = 0)), 
+                       c("VariantID", colnames(mat)[i]))
+    } else {
+        df <- as.data.frame(elements[[i]])
         # make combination label column
-    df[["comb"]] <- colnames(mat)[i]
+        df[["comb"]] <- colnames(mat)[i]
+        names(df)[1] <- "VariantID"
+    }
 
     # append to full dataframe
     if(nrow(overlap_df) < 1){
@@ -117,8 +124,6 @@ for( i in seq(length(colnames(mat))) ){
     }
 }
 
-# rename the first column holding the Variant IDs
-names(overlap_df)[1] <- "VariantID"
 
 # add dummy variable for aggregating
 overlap_df[['n']] <- 1

diff --git a/bin/variant-tsv2bed.py b/bin/variant-tsv2bed.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Convert the .vcf TSV file to a bed file
+"""
+import csv
+import sys
+import argparse
+
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE,SIG_DFL)
+"""
+https://stackoverflow.com/questions/14207708/ioerror-errno-32-broken-pipe-python
+"""
+
+def main(**kwargs):
+    """
+    Read a tab-delimited variant table and write one chrom, start, end line per row
+
+    Keyword arguments:
+    input_file -- table with CHROM, POS, REF and ALT columns; stdin when missing or empty
+    output_file -- destination of the tab-delimited output; stdout when missing or empty
+
+    start is POS and end is POS plus the length of ALT
+    """
+    source_path = kwargs.pop('input_file', None)
+    dest_path = kwargs.pop('output_file', None)
+
+    # pick the file handles, falling back to the standard streams
+    fin = open(source_path) if source_path else sys.stdin
+    fout = open(dest_path, "w") if dest_path else sys.stdout
+
+    # convert each record
+    records = csv.DictReader(fin, delimiter = '\t')
+    bed_writer = csv.writer(fout, delimiter = '\t')
+    for record in records:
+        chrom = record['CHROM']
+        start = int(record['POS'])
+        # REF is not part of the output; the lookup is kept so that a table
+        # without a REF column still raises
+        _unused_ref = record['REF']
+        end = start + len(record['ALT'])
+        bed_writer.writerow([chrom, start, end])
+
+    fout.close()
+    fin.close()
+
+def parse():
+    """
+    Parses script args and passes them to main() as keyword arguments
+    """
+    # the description states what this script does: it converts the variant
+    # table to a bed file, it does not filter annotations
+    parser = argparse.ArgumentParser(description='Convert the .vcf TSV file to a bed file')
+    parser.add_argument("-i", default = None, dest = 'input_file', help="Input file")
+    parser.add_argument("-o", default = None, dest = 'output_file', help="Output file")
+    args = parser.parse_args()
+
+    main(**vars(args))
+
+if __name__ == '__main__':
+    parse()