-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf_gt-filter.py
executable file
·81 lines (61 loc) · 2.67 KB
/
vcf_gt-filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python
import gzip
import sys
from optparse import OptionParser
from collections import defaultdict
from VcfFile import *
import argparse
""" Select vcf data lines based on genotype criterions
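Specify genotypes with the -gt option: -gt "<sample><single-space><genotype string>"
e.g. -gt "sampleOne 0/0"
Repeat -gt for one sample to accept any of several genotypes; filters on
different samples must all match for a record to be printed. """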
def _parse_gt_filters(gt_args):
    """Group raw '-gt' option values by sample name.

    gt_args is the value argparse produces for '-gt' (action='append' with
    nargs='*'): a list of lists of "<sample> <genotype>" strings, or None
    when the option was never given.

    Returns a dict mapping each sample name to the list of genotype strings
    requested for it (empty when no filters were supplied).

    Raises ValueError when a value is not exactly two fields separated by a
    single space.
    """
    gt_dict = defaultdict(list)
    # 'or []' covers the option being absent, where argparse leaves None.
    for group in gt_args or []:
        for spec in group:
            fields = spec.split(' ')
            if len(fields) != 2:
                raise ValueError(
                    "invalid -gt value %r: expected '<sample> <genotype>'"
                    % spec)
            sample, genotype = fields
            gt_dict[sample].append(genotype)
    return gt_dict


def _record_passes(genotype_pairs, gt_dict):
    """Return True when every filtered sample carries an accepted genotype.

    genotype_pairs is an iterable of (sample, genotype_object) pairs; each
    genotype_object must provide getFormatVal('GT').  Samples that have no
    entry in gt_dict are ignored.  Several genotypes listed for one sample
    are alternatives (logical OR); filters on different samples must all
    hold (logical AND).  With no filters at all every record passes.
    """
    for sample, genotype in genotype_pairs:
        accepted = gt_dict.get(sample)
        if accepted is None:
            continue
        if genotype.getFormatVal('GT') not in accepted:
            return False
    return True


def main():
    """Print the records of a gzipped VCF that satisfy all '-gt' filters.

    Command line: <vcf.gz> [--no-header] [-gt "<sample> <genotype>" ...]

    The header is written first unless --no-header is given, followed by
    every record for which all genotype filters match.  Exits with status 1,
    reporting on stderr, when a filtered sample is not in the file's sample
    list; a malformed -gt value is reported through argparse's usage error.
    """
    parser = argparse.ArgumentParser(description='filter records based on genotypes')
    parser.add_argument('vcf', metavar='vcf', type=str,
                        help='vcf.gz file')
    # see http://stackoverflow.com/a/15008806/1735942
    parser.add_argument('--no-header', dest='header', action='store_false')
    parser.add_argument('-gt', metavar='gt', type=str, nargs='*', action='append',
                        help='sample 0/0')
    args = parser.parse_args()

    # append + nargs='*' yields a list of lists that has to be flattened,
    # see http://stackoverflow.com/q/12460989/1735942
    try:
        gt_dict = _parse_gt_filters(args.gt)
    except ValueError as err:
        parser.error(str(err))

    # The context manager guarantees the gzip handle is closed, including
    # on the early sys.exit below.
    with gzip.open(args.vcf, 'r') as vcfh:
        vcfobj = VcfFile(args.vcf)
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        samplelist = vcfobj.getSampleList()

        # Validate before emitting anything so a failed run does not leave
        # a stray header on stdout, and keep diagnostics off the VCF stream.
        missing = sorted(s for s in gt_dict if s not in samplelist)
        if missing:
            for sample in missing:
                sys.stderr.write("%s not in samples!\n" % sample)
            sys.exit(1)

        # Single-argument print(...) behaves the same as the Python 2 print
        # statement and the Python 3 print function.
        if args.header:
            print(header)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # zipGenotypes is expected to give [(sample, genotype object), ...]
            # as noted in the original code -- confirm against VcfFile.
            if _record_passes(vrec.zipGenotypes(samplelist), gt_dict):
                print(vrec.toStringwithGenotypes())
if __name__ == "__main__":
    # Run the command-line filter only when this file is executed as a
    # script, so importing the module does not parse arguments or read files.
    main()