V0.1.10 update (#66)

* Updating to v0.1.10 * Commenting resources files * Updating README to reflect bug fix * Bug fix and updating to be ready for new docker image
dnanexus-archive · Jun 10, 2019 · 5c1fb27 · 5c1fb27
1 parent 4ea10cd
commit 5c1fb27
Show file tree

Hide file tree

Showing 6 changed files with 38 additions and 11 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -76,6 +76,7 @@ RUN conda update -y pyopenssl
 
 WORKDIR /
 ADD resources.tar.gz /
+RUN cp -a /resources/* / && rm -rf /resources/
 
 RUN conda install -c defaults -y numpy
 RUN pip install https://github.com/bioinform/breakseq2/archive/2.2.tar.gz

diff --git a/README.md b/README.md
@@ -154,7 +154,7 @@ breakseq2 -2.2- has requirement pysam==0.7.7, but you'll have pysam 0.15.1 which
 ```
 > What's going on?
 
-This is a known error message caused by how we currently manage the conflicting pysam versions required for BreakSeq and SVTyper. We are currently working on a more stable solution.
+This is a known error message caused by how we currently manage the conflicting pysam versions required for BreakSeq and SVTyper. This issue should be resolved in v0.1.10.
 
 ### Tool versions
 

diff --git a/dx_app_code/parliament2/dxapp.json b/dx_app_code/parliament2/dxapp.json
@@ -7,7 +7,7 @@
     "WGS"
   ],
   "dxapi": "1.0.0",
-  "version": "0.1.9",
+  "version": "0.1.10",
   "inputSpec": [
     {
       "name": "illumina_bam",

diff --git a/dx_app_code/parliament2/parliament2.py b/dx_app_code/parliament2/parliament2.py
@@ -18,9 +18,8 @@ def main(**job_inputs):
     else:
         prefix = job_inputs['prefix']
 
-    # Running Docker image
     subprocess.check_call(['mkdir', '-p', '/home/dnanexus/in', '/home/dnanexus/out'])
-    docker_pull = ['docker', 'pull', 'dnanexus/parliament2:v0.1.9-13-g37d63065']
+    docker_pull = ['docker', 'pull', 'dnanexus/parliament2:0.1.10']
     subprocess.check_call(docker_pull)
 
     print "Downloading input files"
@@ -33,8 +32,7 @@ def main(**job_inputs):
     ref_name = "/home/dnanexus/in/{0}".format(ref_genome.name)
     dxpy.download_dxfile(ref_genome.id, ref_name)
 
-    docker_call = ['docker', 'run', '-v', '/home/dnanexus/in/:/home/dnanexus/in/', '-v', '/home/dnanexus/out/:/home/dnanexus/out/', 'dnanexus/parliament2:v0.1.9-13-g37d63065', '--bam', bam_name, '-r', ref_name, '--prefix', str(prefix)]
-    # docker_call = ['dx-docker', 'run', '-v', '/home/dnanexus/in/:/home/dnanexus/in/', '-v', '/home/dnanexus/out/:/home/dnanexus/out/', 'parliament2:0.1.9', '--bam', bam_name, '-r', ref_name, '--prefix', str(prefix)]
+    docker_call = ['docker', 'run', '-v', '/home/dnanexus/in/:/home/dnanexus/in/', '-v', '/home/dnanexus/out/:/home/dnanexus/out/', 'dnanexus/parliament2:0.1.10', '--bam', bam_name, '-r', ref_name, '--prefix', str(prefix)]
 
     if 'illumina_bai' in job_inputs:
         input_bai = dxpy.DXFile(job_inputs['illumina_bai'])

diff --git a/resources/combine_combined.py b/resources/combine_combined.py
@@ -1,11 +1,17 @@
 import sys
 
+# arg 1: survivor_sorted.vcf (sorted SURVIVOR output file)
+# arg 2: "${prefix}"
+# arg 3: survivor_inputs (all files generated by SURVIVOR)
+# arg 4: /all.phred.txt (phred thresholds of calls by various callers)
+
 def main():
     headers = []
 
     written_additional_header = False
 
     sample = sys.argv[2]
+    # get all SV callers used to generate this file
     with open(sys.argv[3]) as survivor_input_list:
         for line in survivor_input_list:
             if "cnvnator" in line:
@@ -25,6 +31,7 @@ def main():
 
     quality_mappings = { "lt300": {}, "300to1000": {}, "1kbplus": {}, "all": {}, "ins": {} }
 
+    # parse the file of phred thresholds (arg 4)
     with open(sys.argv[4]) as all_phred_values:
         for line in all_phred_values:
             size_split = line.split("_")
@@ -39,6 +46,7 @@ def main():
 
     with open(sys.argv[1]) as survivor_output:
         for line in survivor_output:
+            # modify header
             if line.startswith("##"):
                 if "FORMAT" in line and not written_additional_header:
                     print "##INFO=<ID=SUPP,Number=.,Type=String,Description=\"Number of callers that support an ALT call. This count is based on the presence of a call, whether it could be confirmed by SVTyper. Due to differences in the breakpoints, this number may differ from the sum of all callers in the CALLERS field\">"
@@ -51,27 +59,32 @@ def main():
                     written_additional_header = True
                 else:
                     sys.stdout.write(line)
+            # add sample to line describing VCF fields
             elif line[0] == "#" and line[1] != "#":
                 tab_split = line.strip().split("\t")
                 print "\t".join(tab_split[:9]) + "\t%s" % sample
+            # VCF entries
             else:
                 tab_split = line.strip().split("\t")
                 position = int(tab_split[1])
                 end = tab_split[7].replace("CIEND","XXXXX").split("END=")[-1].split(";")[0].split("\t")[0]
                 end_position = int(end)
+                # possibly same as correct_max_position?
                 if end_position < position:
                     new_end = str(position)
                     new_start = end
                     tab_split[1] = new_start
                     tab_split[7].replace("END=%s" % end, "END=%s" % new_end)
 
+                # adds "chr"
+                if "chr" not in tab_split[0]:
+                    tab_split[0] = "chr" + tab_split[0]
+
                 support = ""
                 het = 0
                 hom = 0
                 ref = 0
-                if "chr" not in tab_split[0]:
-                    tab_split[0] = "chr" + tab_split[0]
-
+                # counts support for het/hom/ref
                 for i in range(len(tab_split[9:])):
                     if "0/1" in tab_split[9+i] or "1/1" in tab_split[9+i] or "./1" in tab_split[9+i]:
                         if "0/1" in tab_split[9+i] or "./1" in tab_split[9+i]:
@@ -80,13 +93,16 @@ def main():
                             hom += 1
                         if "0/0" in tab_split[9+i]:
                             ref += 1
+                        # adds SV caller to "support" string if not there already
                         if headers[i] not in support:
                             support += ",%s" % headers[i]
+                # if caller(s) supports variant, adds this to string at end
                 if len(support) > 0:
                     tab_split[7] += ";CALLERS=%s" % support.lstrip(",")
                 else:
                     support = "."
 
+                # parses hom/het/ref into short genotype strings
                 tab_split[8] = "GT:SP"
                 if het == 0 and hom == 0:
                     if ref > 0:
@@ -104,6 +120,8 @@ def main():
 
                 tab_split[9] += support.lstrip(",")
 
+                # adding size range for SVs
+                # deletions:
                 if "SVTYPE=DEL" in line:
                     #try:
                     size = end_position - position
@@ -115,11 +133,12 @@ def main():
                         size_range = "1kbplus"
                     #except:
                     #    size_range = "all"
+                # insertions:
                 if "SVTYPE=INS" in line:
                     size_range="ins"
 
-
-                if "SVTYPE=DEL" in line or "SVTYPE=DEL" in line:
+                # adds quality mappings if deletion
+                if "SVTYPE=DEL" in line:
                     callers = support.lstrip(",").split(",")
                     callers.sort()
                     while len(callers) > 0:
@@ -133,6 +152,7 @@ def main():
                 if "SVTYPE=DUP" in line and (tab_split[9].split(":")[0] == "0/1" or tab_split[9].split(":")[0] == "1/1"):
                     tab_split[6] = "Unknown"
 
+                # prints final line
                 print "\t".join(tab_split[:10])
 
 main()
diff --git a/resources/correct_max_position.py b/resources/correct_max_position.py
@@ -1,21 +1,29 @@
 import sys
 
 for line in sys.stdin:
+    # line is header; continue
     if line.startswith('#'):
         sys.stdout.write(line)
         continue
+    # line contains variant
     else:
         tab_split = line.strip().split("\t")
+        # only one item in line
         if len(tab_split) == 1:
             continue
+        # full VCF entry
         else:
+            # get position, end, chr1, and chr2
             position = int(tab_split[1])
             end = int(line.replace("CIEND","XXXXX").split("END=")[-1].split(";")[0].split("\t")[0].split(",")[0])
             chr2 = line.split("CHR2=")[-1].split(";")[0].split("\t")[0]
             chr1 = line.split("\t")[0].split("chr")[-1]
 
+            # if chr1 and chr2 are the same, and the end is smaller than the start position
             if end < position and chr1 == chr2:
+                # use the (smaller) end value as the new start position
                 tab_split[1] = str(end)
+                # correct the info field to reflect position
                 info_fields = tab_split[7].split(";")
                 for i in range(len(info_fields)):
                     if "END=" in info_fields[i]: