Read gzip-compressed VCF input (file.vcf.gz) instead of plain-text VCF

indapa · indapa · commit b9fc17f95462 · 2013-06-12T12:46:25.000-04:00
diff --git a/Per_sample_variantEvalGenotypeConcordance.py b/Per_sample_variantEvalGenotypeConcordance.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+import gzip
 from itertools import *
 from VcfFile import *
 from VcfSampleEval import *
@@ -13,7 +14,7 @@
    Briefly, it calculates genotype concordance metrics of an evaluation callset to a comparison callset in a merged VCF file of the two """
 
 def main():
-    usage = "usage: %prog [options] file.vcf \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
+    usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
     parser = OptionParser(usage)
     
     parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
@@ -22,7 +23,7 @@ def main():
     (options, args)=parser.parse_args()
 
     vcfilename=args[0]
-    basename=os.path.splitext(vcfilename)[0]
+    basename=os.path.splitext(os.path.splitext(vcfilename)[0])[0]
     """ row is eval, column is comparison 
         make a numpy matrix to represent genotype concordance matrix """
     
@@ -47,7 +48,7 @@ def main():
     fieldsfh=open(fieldslog, 'w')
     fieldsfh.write('set'+"\n")
     vcfobj=VcfFile(vcfilename)
-    vcfh=open(vcfilename,'r')
+    vcfh=gzip.open(vcfilename,'r')
 
     vcfobj.parseMetaAndHeaderLines(vcfh)
     header=vcfobj.returnHeader() +"\n"
diff --git a/variantEvalGenotypeConcordance.py b/variantEvalGenotypeConcordance.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+import gzip
 from itertools import *
 from VcfFile import *
 from VcfSampleEval import *
@@ -13,7 +14,7 @@
    Briefly, it calculates genotype concordance metrics of an evaluation callset to a comparison callset in a merged VCF file of the two """
 
 def main():
-    usage = "usage: %prog [options] file.vcf \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
+    usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
     parser = OptionParser(usage)
     parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
     parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
@@ -23,7 +24,8 @@ def main():
 
 
     vcfilename=args[0]
-    basename=os.path.splitext(vcfilename)[0]
+    #basename=os.path.splitext(vcfilename)[0]
+    basename=os.path.splitext(os.path.splitext(vcfilename)[0])[0]
     """ row is eval, column is comparison 
         make a numpy matrix to represent genotype concordance matrix """
     
@@ -50,7 +52,7 @@ def main():
     fieldsfh=open(fieldslog, 'w')
     fieldsfh.write('set'+"\n")
     vcfobj=VcfFile(vcfilename)
-    vcfh=open(vcfilename,'r')
+    vcfh=gzip.open(vcfilename,'r')
 
     vcfobj.parseMetaAndHeaderLines(vcfh)
     header=vcfobj.returnHeader() +"\n"
diff --git a/vcf_gt-filter.py b/vcf_gt-filter.py
@@ -1,21 +1,21 @@
 #!/usr/bin/env python
-
+import gzip
 import sys
 from optparse import OptionParser
 from collections import defaultdict
 from VcfFile import *
 import argparse
 
-""" filter records to match genotypes according to a pattern
+""" Select vcf data lines based on genotype criteria
 Specify genotypes with the -gt option: -gt <sample><single-space><genotype string>
 i.e. -gt "sampleOne 0/0 """
 
 def main():
-    usage = "usage: %prog [options] file.vcf "
-    parser = argparse.ArgumentParser(description='filter records based on genotypes')
+    usage = "usage: %prog [options] file.vcf.gz "
+    parser = argparse.ArgumentParser(description='filter records  based on genotypes')
    
     parser.add_argument('vcf', metavar='vcf', type=str,
-                   help='vcf file')
+                   help='vcf.gz file')
     """ http://stackoverflow.com/a/15008806/1735942 """
     parser.add_argument('--no-header',dest='header',action='store_false')
     parser.add_argument('-gt', metavar='gt', type=str, nargs='*', action='append',
@@ -39,7 +39,7 @@ def main():
     
     
     
-    vcfh=open(args.vcf,'r')
+    vcfh=gzip.open(args.vcf,'r')
     vcfobj=VcfFile(args.vcf)
     vcfobj.parseMetaAndHeaderLines(vcfh)
     header=vcfobj.returnHeader()
@@ -70,7 +70,7 @@ def main():
                         genotypes_toFilter.append(True)
                     else:genotypes_toFilter.append(False)
                 
-        #print genotypes_toFilter
+        # every -gt filter must evaluate to True for the record to be printed
         if all(item == True for item in genotypes_toFilter):
             print vrec.toStringwithGenotypes()
                 
diff --git a/vcf_pysam_allele_pileup.py b/vcf_pysam_allele_pileup.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+import gzip
 from VcfFile import *
 from VcfMetaLines import FormatLine
 from optparse import OptionParser
@@ -8,11 +9,11 @@
 
 def main():
     
-    """ given a VCF file and bam file containing the sample(s) in the VCF this willl print out 
-    a pileup count of the ref and alt allele that is in the VCF file """
+    """ given a VCF file and bam file containing the sample(s) in the VCF this will add INFO and FORMAT tags 
+    to indicate the count of reference and alt alleles observed in total and per-sample and print out a new VCF"""
 
 
-    usage = "usage: %prog [option] file.vcf"
+    usage = "usage: %prog [option] file.vcf.gz"
     parser =OptionParser(usage)
     parser.add_option("--bam", type="string", dest="bam", default=None, help="bam file to perform pileup on")
     parser.add_option("--mapq", type="float", dest="mapq", default=0., help="Exclude alignments from analysis if they have a mapping less than mapq (default is 0)")
@@ -24,7 +25,7 @@ def main():
         sys.exit(1)
     
     vcfilename=args[0]
-    basename=os.path.splitext(vcfilename)[0]
+    
     bamfilename=options.bam
     
     ra_formatline=FormatLine("RA", number='1', type='Integer', description='number of reference alleles observed')
@@ -36,7 +37,7 @@ def main():
         
     vcfobj=VcfFile(vcfilename)
     
-    vcfh=open(vcfilename,'r')
+    vcfh=gzip.open(vcfilename,'r')
 
     vcfobj.parseMetaAndHeaderLines(vcfh)
     vcfobj.addMetaFormatHeader(ra_formatline)
diff --git a/vcf_removeSamples.py b/vcf_removeSamples.py
@@ -1,18 +1,17 @@
 #!/usr/bin/env python
+import gzip
 from itertools import *
 from VcfFile import *
 from VcfSampleEval import *
 from optparse import OptionParser
-from common import grouper
-from common import typeofGenotype
 import argparse
 import os
 
 
 def main():
     
     """  remove samples from a vcf file """
-    usage = "usage: %prog [options] file.vcf "
+    usage = "usage: %prog [options] file.vcf.gz "
     #parser = OptionParser(usage)
     parser = argparse.ArgumentParser(description='remove samples from vcf file')
     parser.add_argument('removesamples', metavar='sample', type=str, nargs='+',
@@ -24,7 +23,7 @@ def main():
     #print args.vcfile
     
     
-    vcfh=open(args.vcfile,'r')
+    vcfh=gzip.open(args.vcfile,'r')
     vcfobj=VcfFile(args.vcfile)
     
     vcfobj.parseMetaAndHeaderLines(vcfh)