diff --git a/Dockerfile b/Dockerfile index b9545b0a..84c2fec1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -76,6 +76,7 @@ RUN conda update -y pyopenssl WORKDIR / ADD resources.tar.gz / +RUN cp -a /resources/* / && rm -rf /resources/ RUN conda install -c defaults -y numpy RUN pip install https://github.com/bioinform/breakseq2/archive/2.2.tar.gz diff --git a/README.md b/README.md index fe7e5e7f..bbaa982f 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ breakseq2 -2.2- has requirement pysam==0.7.7, but you'll have pysam 0.15.1 which ``` > What's going on? -This is a known error message caused by how we currently manage the conflicting pysam versions required for BreakSeq and SVTyper. We are currently working on a more stable solution. +This is a known error message caused by how we currently manage the conflicting pysam versions required for BreakSeq and SVTyper. This issue should be resolved in v0.1.10. ### Tool versions diff --git a/dx_app_code/parliament2/dxapp.json b/dx_app_code/parliament2/dxapp.json index 7b6a43af..151ed506 100644 --- a/dx_app_code/parliament2/dxapp.json +++ b/dx_app_code/parliament2/dxapp.json @@ -7,7 +7,7 @@ "WGS" ], "dxapi": "1.0.0", - "version": "0.1.9", + "version": "0.1.10", "inputSpec": [ { "name": "illumina_bam", diff --git a/dx_app_code/parliament2/parliament2.py b/dx_app_code/parliament2/parliament2.py index f531680b..ab298872 100644 --- a/dx_app_code/parliament2/parliament2.py +++ b/dx_app_code/parliament2/parliament2.py @@ -18,9 +18,8 @@ def main(**job_inputs): else: prefix = job_inputs['prefix'] - # Running Docker image subprocess.check_call(['mkdir', '-p', '/home/dnanexus/in', '/home/dnanexus/out']) - docker_pull = ['docker', 'pull', 'dnanexus/parliament2:v0.1.9-13-g37d63065'] + docker_pull = ['docker', 'pull', 'dnanexus/parliament2:0.1.10'] subprocess.check_call(docker_pull) print "Downloading input files" @@ -33,8 +32,7 @@ def main(**job_inputs): ref_name = "/home/dnanexus/in/{0}".format(ref_genome.name) 
dxpy.download_dxfile(ref_genome.id, ref_name) - docker_call = ['docker', 'run', '-v', '/home/dnanexus/in/:/home/dnanexus/in/', '-v', '/home/dnanexus/out/:/home/dnanexus/out/', 'dnanexus/parliament2:v0.1.9-13-g37d63065', '--bam', bam_name, '-r', ref_name, '--prefix', str(prefix)] - # docker_call = ['dx-docker', 'run', '-v', '/home/dnanexus/in/:/home/dnanexus/in/', '-v', '/home/dnanexus/out/:/home/dnanexus/out/', 'parliament2:0.1.9', '--bam', bam_name, '-r', ref_name, '--prefix', str(prefix)] + docker_call = ['docker', 'run', '-v', '/home/dnanexus/in/:/home/dnanexus/in/', '-v', '/home/dnanexus/out/:/home/dnanexus/out/', 'dnanexus/parliament2:0.1.10', '--bam', bam_name, '-r', ref_name, '--prefix', str(prefix)] if 'illumina_bai' in job_inputs: input_bai = dxpy.DXFile(job_inputs['illumina_bai']) diff --git a/resources/combine_combined.py b/resources/combine_combined.py index 814ebdf1..64fe84ba 100644 --- a/resources/combine_combined.py +++ b/resources/combine_combined.py @@ -1,11 +1,17 @@ import sys +# arg 1: survivor_sorted.vcf (sorted SURVIVOR output file) +# arg 2: "${prefix}" +# arg 3: survivor_inputs (all files generated by SURVIVOR) +# arg 4: /all.phred.txt (phred thresholds of calls by various callers) + def main(): headers = [] written_additional_header = False sample = sys.argv[2] + # get all SV callers used to generate this file with open(sys.argv[3]) as survivor_input_list: for line in survivor_input_list: if "cnvnator" in line: @@ -25,6 +31,7 @@ def main(): quality_mappings = { "lt300": {}, "300to1000": {}, "1kbplus": {}, "all": {}, "ins": {} } + # parse all phred file with open(sys.argv[4]) as all_phred_values: for line in all_phred_values: size_split = line.split("_") @@ -39,6 +46,7 @@ def main(): with open(sys.argv[1]) as survivor_output: for line in survivor_output: + # modify header if line.startswith("##"): if "FORMAT" in line and not written_additional_header: print "##INFO=" @@ -51,27 +59,32 @@ def main(): written_additional_header = True else: 
sys.stdout.write(line) + # add sample to line describing VCF fields elif line[0] == "#" and line[1] != "#": tab_split = line.strip().split("\t") print "\t".join(tab_split[:9]) + "\t%s" % sample + # VCF entries else: tab_split = line.strip().split("\t") position = int(tab_split[1]) end = tab_split[7].replace("CIEND","XXXXX").split("END=")[-1].split(";")[0].split("\t")[0] end_position = int(end) + # possibly same as correct_max_position? if end_position < position: new_end = str(position) new_start = end tab_split[1] = new_start tab_split[7].replace("END=%s" % end, "END=%s" % new_end) + # adds "chr" + if "chr" not in tab_split[0]: + tab_split[0] = "chr" + tab_split[0] + support = "" het = 0 hom = 0 ref = 0 - if "chr" not in tab_split[0]: - tab_split[0] = "chr" + tab_split[0] - + # counts support for het/hom/ref for i in range(len(tab_split[9:])): if "0/1" in tab_split[9+i] or "1/1" in tab_split[9+i] or "./1" in tab_split[9+i]: if "0/1" in tab_split[9+i] or "./1" in tab_split[9+i]: @@ -80,13 +93,16 @@ def main(): hom += 1 if "0/0" in tab_split[9+i]: ref += 1 + # adds SV caller to "support" string if not there already if headers[i] not in support: support += ",%s" % headers[i] + # if caller(s) supports variant, adds this to string at end if len(support) > 0: tab_split[7] += ";CALLERS=%s" % support.lstrip(",") else: support = "." 
+ # parses hom/het/ref into short genotype strings tab_split[8] = "GT:SP" if het == 0 and hom == 0: if ref > 0: @@ -104,6 +120,8 @@ def main(): tab_split[9] += support.lstrip(",") + # adding size range for SVs + # deletions: if "SVTYPE=DEL" in line: #try: size = end_position - position @@ -115,11 +133,12 @@ def main(): size_range = "1kbplus" #except: # size_range = "all" + # insertions: if "SVTYPE=INS" in line: size_range="ins" - - if "SVTYPE=DEL" in line or "SVTYPE=DEL" in line: + # adds quality mappings if deletion + if "SVTYPE=DEL" in line: callers = support.lstrip(",").split(",") callers.sort() while len(callers) > 0: @@ -133,6 +152,7 @@ def main(): if "SVTYPE=DUP" in line and (tab_split[9].split(":")[0] == "0/1" or tab_split[9].split(":")[0] == "1/1"): tab_split[6] = "Unknown" + # prints final line print "\t".join(tab_split[:10]) main() diff --git a/resources/correct_max_position.py b/resources/correct_max_position.py index 30262c6a..5b886d0c 100644 --- a/resources/correct_max_position.py +++ b/resources/correct_max_position.py @@ -1,21 +1,29 @@ import sys for line in sys.stdin: + # line is header; continue if line.startswith('#'): sys.stdout.write(line) continue + # line contains variant else: tab_split = line.strip().split("\t") + # only one item in line if len(tab_split) == 1: continue + # full VCF entry else: + # get position, end, chr1, and chr2 position = int(tab_split[1]) end = int(line.replace("CIEND","XXXXX").split("END=")[-1].split(";")[0].split("\t")[0].split(",")[0]) chr2 = line.split("CHR2=")[-1].split(";")[0].split("\t")[0] chr1 = line.split("\t")[0].split("chr")[-1] + # if chr1 and chr2 are the same, and the start position is greater than the end if end < position and chr1 == chr2: + # set the start position to the end value tab_split[1] = str(end) + # correct the info field to reflect position info_fields = tab_split[7].split(";") for i in range(len(info_fields)): if "END=" in info_fields[i]: