Skip to content

Commit b9fc17f

Browse files
committed
read the gzipped vcf instead
1 parent d6fe987 commit b9fc17f

5 files changed

+25
-22
lines changed

Per_sample_variantEvalGenotypeConcordance.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python
2+
import gzip
23
from itertools import *
34
from VcfFile import *
45
from VcfSampleEval import *
@@ -13,7 +14,7 @@
1314
Briefly, it calculates genotype concordance metrics of an evaluation callset to a comparison callset in a merged VCF file of the two """
1415

1516
def main():
16-
usage = "usage: %prog [options] file.vcf \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
17+
usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
1718
parser = OptionParser(usage)
1819

1920
parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
@@ -22,7 +23,7 @@ def main():
2223
(options, args)=parser.parse_args()
2324

2425
vcfilename=args[0]
25-
basename=os.path.splitext(vcfilename)[0]
26+
basename=os.path.splitext(os.path.splitext(vcfilename)[0])[0]
2627
""" row is eval, column is comparison
2728
make a numpy matrix to represent genotype concordance matrix """
2829

@@ -47,7 +48,7 @@ def main():
4748
fieldsfh=open(fieldslog, 'w')
4849
fieldsfh.write('set'+"\n")
4950
vcfobj=VcfFile(vcfilename)
50-
vcfh=open(vcfilename,'r')
51+
vcfh=gzip.open(vcfilename,'r')
5152

5253
vcfobj.parseMetaAndHeaderLines(vcfh)
5354
header=vcfobj.returnHeader() +"\n"

variantEvalGenotypeConcordance.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python
2+
import gzip
23
from itertools import *
34
from VcfFile import *
45
from VcfSampleEval import *
@@ -13,7 +14,7 @@
1314
Briefly, it calculates genotype concordance metrics of an evaluation callset to a comparison callset in a merged VCF file of the two """
1415

1516
def main():
16-
usage = "usage: %prog [options] file.vcf \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
17+
usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
1718
parser = OptionParser(usage)
1819
parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
1920
parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
@@ -23,7 +24,8 @@ def main():
2324

2425

2526
vcfilename=args[0]
26-
basename=os.path.splitext(vcfilename)[0]
27+
#basename=os.path.splitext(vcfilename)[0]
28+
basename=os.path.splitext(os.path.splitext(vcfilename)[0])[0]
2729
""" row is eval, column is comparison
2830
make a numpy matrix to represent genotype concordance matrix """
2931

@@ -50,7 +52,7 @@ def main():
5052
fieldsfh=open(fieldslog, 'w')
5153
fieldsfh.write('set'+"\n")
5254
vcfobj=VcfFile(vcfilename)
53-
vcfh=open(vcfilename,'r')
55+
vcfh=gzip.open(vcfilename,'r')
5456

5557
vcfobj.parseMetaAndHeaderLines(vcfh)
5658
header=vcfobj.returnHeader() +"\n"

vcf_gt-filter.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
#!/usr/bin/env python
2-
2+
import gzip
33
import sys
44
from optparse import OptionParser
55
from collections import defaultdict
66
from VcfFile import *
77
import argparse
88

9-
""" filter records to match genotypes according to a pattern
9+
""" Select vcf data lines based on genotype criteria
1010
Specify genotypes with the -gt option: -gt <sample><single-space><genotype string>
1111
e.g. -gt "sampleOne 0/0" """
1212

1313
def main():
14-
usage = "usage: %prog [options] file.vcf "
15-
parser = argparse.ArgumentParser(description='filter records based on genotypes')
14+
usage = "usage: %prog [options] file.vcf.gz "
15+
parser = argparse.ArgumentParser(description='filter records based on genotypes')
1616

1717
parser.add_argument('vcf', metavar='vcf', type=str,
18-
help='vcf file')
18+
help='vcf.gz file')
1919
""" http://stackoverflow.com/a/15008806/1735942 """
2020
parser.add_argument('--no-header',dest='header',action='store_false')
2121
parser.add_argument('-gt', metavar='gt', type=str, nargs='*', action='append',
@@ -39,7 +39,7 @@ def main():
3939

4040

4141

42-
vcfh=open(args.vcf,'r')
42+
vcfh=gzip.open(args.vcf,'r')
4343
vcfobj=VcfFile(args.vcf)
4444
vcfobj.parseMetaAndHeaderLines(vcfh)
4545
header=vcfobj.returnHeader()
@@ -70,7 +70,7 @@ def main():
7070
genotypes_toFilter.append(True)
7171
else:genotypes_toFilter.append(False)
7272

73-
#print genotypes_toFilter
73+
# all gt filters must evaluate to True for the record to be printed
7474
if all(item == True for item in genotypes_toFilter):
7575
print vrec.toStringwithGenotypes()
7676

vcf_pysam_allele_pileup.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python
2+
import gzip
23
from VcfFile import *
34
from VcfMetaLines import FormatLine
45
from optparse import OptionParser
@@ -8,11 +9,11 @@
89

910
def main():
1011

11-
""" given a VCF file and bam file containing the sample(s) in the VCF this willl print out
12-
a pileup count of the ref and alt allele that is in the VCF file """
12+
""" given a VCF file and bam file containing the sample(s) in the VCF this will add INFO and FORMAT tags
13+
to indicate the count of reference and alt alleles observed in total and per-sample and print out a new VCF"""
1314

1415

15-
usage = "usage: %prog [option] file.vcf"
16+
usage = "usage: %prog [option] file.vcf.gz"
1617
parser =OptionParser(usage)
1718
parser.add_option("--bam", type="string", dest="bam", default=None, help="bam file to perform pileup on")
1819
parser.add_option("--mapq", type="float", dest="mapq", default=0., help="Exclude alignments from analysis if they have a mapping less than mapq (default is 0)")
@@ -24,7 +25,7 @@ def main():
2425
sys.exit(1)
2526

2627
vcfilename=args[0]
27-
basename=os.path.splitext(vcfilename)[0]
28+
2829
bamfilename=options.bam
2930

3031
ra_formatline=FormatLine("RA", number='1', type='Integer', description='number of reference alleles observed')
@@ -36,7 +37,7 @@ def main():
3637

3738
vcfobj=VcfFile(vcfilename)
3839

39-
vcfh=open(vcfilename,'r')
40+
vcfh=gzip.open(vcfilename,'r')
4041

4142
vcfobj.parseMetaAndHeaderLines(vcfh)
4243
vcfobj.addMetaFormatHeader(ra_formatline)

vcf_removeSamples.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,17 @@
11
#!/usr/bin/env python
2+
import gzip
23
from itertools import *
34
from VcfFile import *
45
from VcfSampleEval import *
56
from optparse import OptionParser
6-
from common import grouper
7-
from common import typeofGenotype
87
import argparse
98
import os
109

1110

1211
def main():
1312

1413
""" remove samples from a vcf file """
15-
usage = "usage: %prog [options] file.vcf "
14+
usage = "usage: %prog [options] file.vcf.gz "
1615
#parser = OptionParser(usage)
1716
parser = argparse.ArgumentParser(description='remove samples from vcf file')
1817
parser.add_argument('removesamples', metavar='sample', type=str, nargs='+',
@@ -24,7 +23,7 @@ def main():
2423
#print args.vcfile
2524

2625

27-
vcfh=open(args.vcfile,'r')
26+
vcfh=gzip.open(args.vcfile,'r')
2827
vcfobj=VcfFile(args.vcfile)
2928

3029
vcfobj.parseMetaAndHeaderLines(vcfh)

0 commit comments

Comments
 (0)