Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…80-nf into dev
  • Loading branch information
stevekm committed May 25, 2019
2 parents 2259f96 + a44ee58 commit 7d97183
Show file tree
Hide file tree
Showing 18 changed files with 12,064 additions and 630 deletions.
14 changes: 9 additions & 5 deletions .config.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@
"runID": null,
"fastqDirs": [],
"demux_samplesheet": "demux-samplesheet.csv",
"targetsBed": "targets.bed",
"targetsrefFlatBed":"targets.refFlat.580.bed",
"numTargetSplitLines": 100,
"targetsBed": "targets/targets.580.bed",
"targetsAnnotatedBed": "targets/targets.annotated.580.bed",
"numTargetSplitLines": 50,
"HapMapBam": "/gpfs/scratch/kellys04/molecpathlab/ref/HapMap-Pool/HapMap-pool.bam",
"HapMapBai": "/gpfs/scratch/kellys04/molecpathlab/ref/HapMap-Pool/HapMap-pool.bam.bai",
"HapMapBamMd5": "8ec0a749f46fb7259524e4aa8d50f44f",
"SeraCareSelectedTsv": "data/SeraCare-selected-variants.tsv",
"SeraCareErrorRate": 0.02
"SeraCareErrorRate": 0.02,
"CNVPool": "/gpfs/data/molecpathlab/ref/CNV-Pool/CNV-Pool.580.cnn",
"workflowLabel" : "NGS580",
"ANNOVAR_BUILD_VERSION": "hg19",
"ANNOVAR_PROTOCOL": "refGene,clinvar_20170905,cosmic70,1000g2015aug_all,avsnp150,exac03,snp138",
"ANNOVAR_OPERATION": "g,f,f,f,f,f,f"
}
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,4 @@ nextflow.html.*
*.html.*
trace.txt.*
data/*
samples.cnv.tsv
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,9 @@ HAPMAP_POOL_SHEET:=samples.hapmap.tsv
hapmap-pool: $(HAPMAP_POOL_SHEET)
./nextflow run hapmap-pool.nf -profile hapmap_pool $(RESUME)

# run the cnv-pool.nf workflow with the cnv_pool profile; the samplesheet $(CNV_POOL_SHEET) is a prerequisite
CNV_POOL_SHEET:=samples.cnv.tsv
cnv-pool: $(CNV_POOL_SHEET)
./nextflow run cnv-pool.nf -profile cnv_pool $(RESUME)


# save a record of the most recent Nextflow run completion
Expand Down
104 changes: 104 additions & 0 deletions bin/igv-variant-filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Filters the ANNOVAR annotation .tsv table for usage with IGV Snapshots
INPUT: ANNOVAR annotations merged with original .vcf .tsv table
OUTPUT: Filtered annotations .tsv table
USAGE: igv-variant-filter.py -c LoFreqSomatic -i "annotations.tsv" -o "sampleID.tmb.filtered.tsv"
Criteria:
For both matched and unmatched we will apply the following criteria:
1- VAF >5% tumor
2- VAF <2% normal
"""
import csv
import sys
import argparse

from signal import signal, SIGPIPE, SIG_DFL
# reset SIGPIPE to the default handler; per the link below this avoids
# "IOError: [Errno 32] Broken pipe" when the reader of our output closes early
signal(SIGPIPE,SIG_DFL)
"""
https://stackoverflow.com/questions/14207708/ioerror-errno-32-broken-pipe-python
"""

# variant allele frequency thresholds (fractions, not percentages)
frequency_min_tumor = 0.05 # 5%
frequency_min_normal = 0.02 # 2%; not referenced by any filter in this script yet


def unpaired_filter(row):
    """
    Decide whether a single annotation row should be kept

    The row's 'AF' value is converted to a float and must be strictly greater
    than the module-level `frequency_min_tumor` threshold

    Returns True if every criterion passes, otherwise False
    """
    allele_frequency = float(row['AF'])

    # one entry per criterion; all of them must hold
    criteria = [
        allele_frequency > frequency_min_tumor,
    ]
    return all(criteria)

def LoFreqSomatic(fin, fout):
    """
    Copy the tab-separated table from `fin` to `fout`, writing the same header
    and only the rows accepted by `unpaired_filter`
    """
    table = csv.DictReader(fin, delimiter = '\t')
    out = csv.DictWriter(fout, delimiter = '\t', fieldnames = table.fieldnames)
    out.writeheader()
    for record in table:
        if not unpaired_filter(record):
            continue
        out.writerow(record)

def MuTect2(fin, fout):
    """
    Copy the tab-separated table from `fin` to `fout`, writing the same header
    and only the rows accepted by `unpaired_filter`

    NOTE: this applies the unpaired filter only; no tumor/normal paired
    criteria are evaluated here
    """
    source = csv.DictReader(fin, delimiter = '\t')
    columns = source.fieldnames
    sink = csv.DictWriter(fout, delimiter = '\t', fieldnames = columns)
    sink.writeheader()
    # lazy generator so rows are written as they are read
    passing = (entry for entry in source if unpaired_filter(entry))
    for entry in passing:
        sink.writerow(entry)



def main(**kwargs):
    """
    Main control function for the script

    Keyword arguments:
    input_file -- path of the annotations .tsv table to read; stdin is used when missing or empty
    output_file -- path to write the filtered table to; stdout is used when missing or empty
    caller -- name of the variant caller (required); "LoFreqSomatic" or "MuTect2"

    Exits with status 1 if the caller is not recognized; raises KeyError if
    'caller' is not supplied
    """
    input_file = kwargs.pop('input_file', None)
    output_file = kwargs.pop('output_file', None)
    caller = kwargs.pop('caller')

    # validate the caller before touching any files, so that a bad value does
    # not create/truncate the output file or leave handles open
    if caller not in ("LoFreqSomatic", "MuTect2"):
        # report on stderr; stdout may be carrying the filtered table
        sys.stderr.write("ERROR: caller not recognized: {0}\n".format(caller))
        sys.exit(1)

    fin = open(input_file) if input_file else sys.stdin
    try:
        fout = open(output_file, "w") if output_file else sys.stdout
        try:
            if caller == "LoFreqSomatic":
                LoFreqSomatic(fin, fout)
            else:
                # TODO: create filter methods for paired calling
                MuTect2(fin, fout)
        finally:
            # close (and flush) the output even if filtering raised
            fout.close()
    finally:
        fin.close()

def parse():
    """
    Build the command line interface, read the arguments and hand them to main()
    as keyword arguments (input_file, output_file, caller)
    """
    cli = argparse.ArgumentParser(
        description='Filters the ANNOVAR annotation .tsv table for usage with IGV snapshots')
    cli.add_argument("-i", dest = 'input_file', default = None, help="Input file")
    cli.add_argument("-o", dest = 'output_file', default = None, help="Output file")
    cli.add_argument("-c", "--caller", dest = 'caller', required=True, help="Variant caller used")

    options = vars(cli.parse_args())
    main(**options)

# run the command line interface only when executed as a script
if __name__ == '__main__':
parse()
117 changes: 117 additions & 0 deletions bin/make-igv-batchscript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Script to create an IGV batchscript
https://software.broadinstitute.org/software/igv/PortCommands
http://software.broadinstitute.org/software/igv/automation
https://software.broadinstitute.org/software/igv/batch
example IGV batch script:
new
snapshotDirectory IGV_Snapshots
load test_alignments.bam
genome hg19
maxPanelHeight 500
goto chr1:713167-714758
snapshot chr1_713167_714758_h500.png
goto chr1:713500-714900
snapshot chr1_713500_714900_h500.png
exit
Usage:
./make-igv-batchscript.py foo.bam bar.bam
"""
import os
import argparse

def append_string(string, output_file):
    """
    Add `string` followed by a newline to the end of `output_file`

    The file is opened in append mode, so existing content is kept
    """
    with open(output_file, "a") as handle:
        handle.writelines([string + '\n'])

def make_regions(regions_file):
    """
    Parse the .bed format regions file to generate the IGV location and output filenames

    Lines with three or more whitespace-separated fields give chrom, start, stop;
    lines with exactly two fields give chrom and a single position that is used
    as both start and stop. Blank lines, lines with fewer than two fields,
    '#' comments and 'track' / 'browser' header lines are skipped.

    Returns a list of dicts with the keys 'chrom', 'start', 'stop',
    'loc' (e.g. 'chr1:100-200') and 'filename' (e.g. 'chr1_100_200.png');
    coordinates are kept as the strings read from the file
    """
    regions = []
    with open(regions_file) as f:
        for line in f:
            fields = line.split()
            # without this guard chrom/start/stop would be undefined on the
            # first line or stale from the previous line, duplicating a region
            if len(fields) < 2:
                continue
            # .bed header/comment lines do not describe a region
            if fields[0].startswith('#') or fields[0] in ('track', 'browser'):
                continue
            if len(fields) >= 3:
                chrom, start, stop = fields[0:3]
            else:
                chrom, start = fields
                stop = start
            # make IGV format location
            loc = '{0}:{1}-{2}'.format(chrom, start, stop)
            filename = '{0}_{1}_{2}.png'.format(chrom, start, stop)
            regions.append({
                'chrom': chrom,
                'start': start,
                'stop': stop,
                'loc': loc,
                'filename': filename,
            })
    return regions

def main(**kwargs):
    """
    Main control function for the script

    Reads the regions file and appends the IGV batch commands, one per line,
    to the batchscript file: new, snapshotDirectory, genome, one load per
    input file, maxPanelHeight, a goto + snapshot pair per region, then exit

    Keyword arguments (default in parentheses):
    input_files -- paths of the files to load (required)
    regions_file -- .bed regions file ("regions.bed")
    snapshotDirectory -- directory IGV saves images to ("IGV_snapshots")
    batchscript_file -- batchscript to append to ("IGV_snapshots.bat")
    image_height -- panel height, converted with int() (500)
    genome -- genome name ("hg19")
    """
    input_files = kwargs.pop('input_files')
    regions_file = kwargs.pop('regions_file', "regions.bed")
    snapshotDirectory = kwargs.pop('snapshotDirectory', "IGV_snapshots")
    batchscript_file = kwargs.pop('batchscript_file', "IGV_snapshots.bat")
    image_height = int(kwargs.pop('image_height', 500))
    genome = kwargs.pop('genome', "hg19")

    regions = make_regions(regions_file)

    def commands():
        # yielded lazily so each command is written before the next one is built
        yield "new"
        yield "snapshotDirectory " + snapshotDirectory
        yield "genome " + genome
        for path in input_files:
            yield "load " + path
        yield "maxPanelHeight " + str(image_height)
        for region in regions:
            yield "goto " + region['loc']
            yield "snapshot " + region['filename']
        yield "exit"

    for command in commands():
        append_string(command, batchscript_file)

def parse():
    """
    Parses script args and passes them to main() as keyword arguments
    (input_files, regions_file, batchscript_file, snapshotDirectory,
    image_height, genome)
    """
    parser = argparse.ArgumentParser(description='IGV batchscript creator')
    parser.add_argument("input_files",
        nargs='+',
        help="pathes to the files to create snapshots from e.g. .bam, .bigwig, etc.")
    parser.add_argument("-r", "--regions",
        default = "regions.bed",
        dest = 'regions_file',
        metavar = 'regions_file',
        help="Path to .bed formatted regions file")
    parser.add_argument("-b",
        default = "IGV_snapshots.bat",
        dest = 'batchscript_file',
        metavar = 'batchscript_file',
        help="Name of the IGV batchscript file to create")
    # the directory default must not reuse the batchscript file name;
    # "IGV_snapshots" matches the fallback used inside main()
    parser.add_argument("-d",
        default = "IGV_snapshots",
        dest = 'snapshotDirectory',
        metavar = 'snapshotDirectory',
        help="Name of the IGV snapshot directory to save images to")
    # type=int makes argparse reject a non-numeric height with a usage error
    parser.add_argument("--height",
        default = 500,
        type = int,
        dest = 'image_height',
        metavar = 'image_height',
        help="Height in pixels of the images to create")
    parser.add_argument("--genome",
        default = "hg19",
        dest = 'genome',
        metavar = 'genome',
        help="Name of genome to use in IGV")

    args = parser.parse_args()
    main(**vars(args))

# run the command line interface only when executed as a script
if __name__ == '__main__':
parse()
17 changes: 11 additions & 6 deletions bin/snp-overlap.R
Original file line number Diff line number Diff line change
Expand Up @@ -96,18 +96,25 @@ new_names <- sapply(combs, function(x){
}
})
colnames(mat) <- new_names

save.image("loaded.Rdata")
write.table(x = mat, file = output_matrix, quote = F,sep = "\t", row.names = F)

# convert the matrix to a long format dataframe
# NOTE: this can get slow for large numbers of combinations...
overlap_df <- data.frame()
for( i in seq(length(colnames(mat))) ){
# make dataframe
df <- as.data.frame(elements[[i]])

# # check that there are elements to make df from...
if (length(elements[[i]]) < 1){
# create dataframe with no rows
df <- setNames(data.frame(matrix(ncol = 2, nrow = 0)),
c("VariantID", colnames(mat)[i]))
} else {
df <- as.data.frame(elements[[i]])
# make combination label column
df[["comb"]] <- colnames(mat)[i]
df[["comb"]] <- colnames(mat)[i]
names(df)[1] <- "VariantID"
}

# append to full dataframe
if(nrow(overlap_df) < 1){
Expand All @@ -117,8 +124,6 @@ for( i in seq(length(colnames(mat))) ){
}
}

# rename the first column holding the Variant IDs
names(overlap_df)[1] <- "VariantID"

# add dummy variable for aggregating
overlap_df[['n']] <- 1
Expand Down
66 changes: 66 additions & 0 deletions bin/variant-tsv2bed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Convert the .vcf TSV file to a bed file
"""
import csv
import sys
import argparse

from signal import signal, SIGPIPE, SIG_DFL
# reset SIGPIPE to the default handler; per the link below this avoids
# "IOError: [Errno 32] Broken pipe" when the reader of our output closes early
signal(SIGPIPE,SIG_DFL)
"""
https://stackoverflow.com/questions/14207708/ioerror-errno-32-broken-pipe-python
"""

def main(**kwargs):
    """
    Main control function for the script

    Reads a tab-separated table whose header contains at least the columns
    CHROM, POS and ALT, and writes one headerless tab-separated line per row:
    chrom, start (= POS), end (= POS + length of ALT)

    Keyword arguments:
    input_file -- path of the input table; stdin is used when missing or empty
    output_file -- path of the output file; stdout is used when missing or empty

    Raises KeyError if a required column is missing and ValueError if POS is
    not an integer; both handles are closed in either case
    """
    input_file = kwargs.pop('input_file', None)
    output_file = kwargs.pop('output_file', None)

    # load input/output file handles
    fin = open(input_file) if input_file else sys.stdin
    try:
        fout = open(output_file, "w") if output_file else sys.stdout
        try:
            # start processing input
            reader = csv.DictReader(fin, delimiter = '\t')
            writer = csv.writer(fout, delimiter = '\t')
            for row in reader:
                chrom = row['CHROM']
                start = int(row['POS'])
                # the region spans the length of the ALT allele from POS
                end = start + len(row['ALT'])
                writer.writerow([chrom, start, end])
        finally:
            # close (and flush) the output even if a row failed to parse
            fout.close()
    finally:
        fin.close()

def parse():
    """
    Parses script args and passes them to main() as keyword arguments
    (input_file, output_file)
    """
    # describe this script; the previous text described the annotation filter
    parser = argparse.ArgumentParser(description='Converts the .vcf TSV table to a .bed file')
    parser.add_argument("-i", default = None, dest = 'input_file', help="Input file")
    parser.add_argument("-o", default = None, dest = 'output_file', help="Output file")
    args = parser.parse_args()

    main(**vars(args))

# run the command line interface only when executed as a script
if __name__ == '__main__':
parse()
Loading

0 comments on commit 7d97183

Please sign in to comment.