-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVcfGenotype.py
146 lines (111 loc) · 4.49 KB
/
VcfGenotype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import sys
import re
import itertools
from collections import OrderedDict
class VcfGenotype(object):
    """Represent a VCF genotype: a FORMAT string plus its genotype string.

    Both strings are split on ':' and paired positionally into an ordered
    mapping (FORMAT id -> value).  The two alleles are parsed from the first
    sub-field of the genotype string, which is taken to be the GT field.

    Attributes:
        gstring: the genotype string, e.g. '0/1:12:99'.
        formatstring: the FORMAT string, e.g. 'GT:DP:GQ'.
        isPhased: 1 if the GT sub-field uses the '|' delimiter, otherwise 0.
        allele1, allele2: allele strings parsed from the GT sub-field.
        gdict: OrderedDict mapping each FORMAT id to its genotype value.
    """

    def __init__(self, formatstring, gstring):
        """Initialize a VcfGenotype.

        Args:
            formatstring: value of the FORMAT column (ids joined by ':').
            gstring: the genotype string (values joined by ':').

        If one of the two strings has fewer ':'-separated fields than the
        other, the missing ids/values are filled with '.'.
        """
        self.gstring = gstring
        self.formatstring = formatstring
        self.isPhased = 0
        self.allele1 = ''
        self.allele2 = ''
        # keys are FORMAT ids, values come from gstring (same order)
        self.gdict = OrderedDict()

        formatids = self.formatstring.split(':')
        gstringvals = self.gstring.split(':')
        # the first format is always the GT (genotype)
        self.parseAlleles(gstringvals[0])

        # izip_longest (Python 2) was renamed zip_longest in Python 3;
        # look the function up so the class works on both.
        zip_longest = getattr(itertools, 'zip_longest', None)
        if zip_longest is None:
            zip_longest = itertools.izip_longest
        for formatid, gstringval in zip_longest(formatids, gstringvals,
                                                fillvalue='.'):
            self.gdict[formatid] = gstringval

    def getGenotypeFormatFields(self):
        """Return the list of FORMAT ids, in order."""
        return list(self.gdict)

    def getFormatVal(self, key):
        """Return the value stored for FORMAT id *key*, or '.' if absent."""
        return self.gdict.get(key, ".")

    def addFormatVal(self, key, value):
        """Add (or overwrite) a FORMAT key/value pair.

        gstring and formatstring are rebuilt from the mapping so that they
        stay in sync with it.  The value is stored as given; it is only
        converted to a string when gstring is rebuilt.
        """
        self.gdict[key] = value
        self.gstring = ":".join(str(val) for val in self.gdict.values())
        self.formatstring = ":".join(self.gdict)

    def addFormat(self, formatstring):
        """Append *formatstring* to the FORMAT string.

        Only formatstring is changed; gdict and gstring are not touched.
        """
        self.formatstring += ":" + formatstring

    def setAlleles(self, allele1, allele2):
        """Set the two alleles of the genotype."""
        self.allele1 = allele1
        self.allele2 = allele2

    def setIsPhased(self):
        """Set the phased flag of the genotype to true (1)."""
        self.isPhased = 1

    def parseAlleles(self, allelefield):
        """Set allele1 and allele2 from a genotype (GT) field.

        '|' marks a phased genotype (the phased flag is set), '/' an
        unphased one, and a bare '.' sets both alleles to '.'.  Any other
        value writes a message to stderr and exits with status 1.

        Raises:
            SystemExit: if no known delimiter is found.
            ValueError: if splitting on the delimiter does not give exactly
                two alleles.
        """
        if '|' in allelefield:
            self.setIsPhased()
            delimiter = '|'
        elif '/' in allelefield:
            delimiter = '/'
        elif allelefield == '.':
            self.allele1 = '.'
            self.allele2 = '.'
            return
        else:
            sys.stderr.write("un-recognized genotype delimiter: "
                             + allelefield + "\n")
            # sys.exit rather than the site-provided exit(), which is not
            # guaranteed to exist (e.g. when run with python -S)
            sys.exit(1)

        (allele1, allele2) = allelefield.split(delimiter)
        self.allele1 = allele1
        self.allele2 = allele2

    def getFormatString(self):
        """Return the FORMAT string."""
        return self.formatstring

    def getAlleles(self):
        """Return the tuple (allele1, allele2)."""
        return (self.allele1, self.allele2)

    def isCalled(self):
        """Return True if the genotype was called (neither allele is '.')."""
        return self.allele1 != '.' and self.allele2 != '.'

    def isSegregating(self):
        """Return True if the genotype has at least one non-ref allele.

        1/1 and 1/0 return True, 0/0 returns False.  Uncalled genotypes
        return False.
        """
        if not self.isCalled():
            return False
        return self.allele1 != '0' or self.allele2 != '0'

    def isNonRefHomz(self):
        """Return True if the genotype is a non-ref homozygote (e.g. 1/1).

        Both alleles must be called, identical, and different from '0'; a
        non-ref heterozygote such as 1/2 returns False.
        """
        if not self.isCalled():
            return False
        return self.allele1 == self.allele2 and self.allele1 != '0'

    def isHet(self):
        """Return True if the genotype is het and both alleles are called."""
        if not self.isCalled():
            return False
        return self.allele1 != self.allele2

    def isHomoz(self):
        """Return True if both alleles are called and both are '0' (0/0).

        NOTE(review): despite the name, a non-ref homozygote such as 1/1
        returns False here (isNonRefHomz covers that case) -- confirm that
        this is the intended contract.
        """
        if not self.isCalled():
            return False
        return self.allele1 == '0' and self.allele2 == '0'

    def checkFormatIds(self, formatlist):
        """Check that every id in the FORMAT string is in *formatlist*.

        Args:
            formatlist: container of the accepted FORMAT ids.

        Raises:
            SystemExit: on the first id missing from formatlist, after
                writing a message to stderr.
        """
        for elem in self.formatstring.split(':'):
            # validate against the caller's list; gdict is built from the
            # FORMAT string itself, so checking against it proves nothing
            if elem not in formatlist:
                sys.stderr.write(elem + " is not in FORMAT column!\n ")
                sys.exit(1)

    def toString(self):
        """Return the genotype string."""
        return self.gstring

    def __str__(self):
        return self.gstring