Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

index samples by sample name for easier access. #2

Closed
wants to merge 6 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 27 additions & 8 deletions vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,6 @@ def __doc__(self):
class VCFReader(object):
'''Read and parse a VCF v 4.0 file'''
def __init__(self, fsock, aggressive=False):
super(VCFReader, self).__init__()
self.aggro = aggressive
self._metadata = None
self._infos = None
Expand Down Expand Up @@ -351,9 +350,9 @@ def _parse_info(self, info_str):
def _parse_samples(self, samples, samp_fmt):
'''Parse a sample entry according to the format specified in the FORMAT
column.'''
samp_data = []
samp_data = {}
samp_fmt = samp_fmt.split(':')
for sample in samples:
for i, sample in enumerate(samples):
sampdict = dict(zip(samp_fmt, sample.split(':')))
for fmt in sampdict:
vals = sampdict[fmt].split(',')
Expand All @@ -372,13 +371,32 @@ def _parse_samples(self, samples, samp_fmt):
elif sampdict[fmt] == './.' and self.aggro:
sampdict[fmt] = None

samp_data.append(sampdict)

for name, data in zip(self.samples, samp_data):
data['name'] = name
sample_name = self.samples[i]
sampdict["name"] = sample_name
samp_data[sample_name] = sampdict

return samp_data

def genotypes(self, rec):
    """
    Return the actual alleles behind each sample's encoded genotype.

    ``rec`` is expected to provide ``REF``, ``ALT`` (a list, so that it
    can be concatenated after ``[REF]``) and ``samples``, a mapping from
    sample name to that sample's parsed FORMAT fields.

    The result is a dict mapping every name in ``self.samples`` to a list
    of alleles, one per allele index in the ``GT`` string.  A sample maps
    to ``None`` when it has no ``GT`` value or when every allele in the
    call is missing (``.``, ``./.``, ``.|.``), whether or not the reader
    already normalised the call to ``None``.  In a partially missing call
    such as ``./1`` only the missing allele becomes ``None``.

    Raises ``IndexError`` if an allele index points past the last ALT.
    """
    # create an array so that the 0/1/2 etc. just index
    # ref/first-ALT/2nd-ALT respectively...
    gts = [rec.REF] + rec.ALT

    genotypes = {}
    for name in self.samples:
        # GT may be absent from the FORMAT column, so do not assume the key.
        sample_genotype = rec.samples[name].get('GT')
        if sample_genotype is None:
            genotypes[name] = None
            continue

        # split on '|' or '/'; a character class in a raw string avoids
        # the invalid '\|' escape of a plain string literal.
        calls = re.split(r'[|/]', sample_genotype)

        # '.' marks a missing allele: it has no index to look up, and
        # int('.') would raise ValueError.
        alleles = [None if call == '.' else gts[int(call)] for call in calls]

        if all(allele is None for allele in alleles):
            # Entirely missing call: report it like an absent genotype.
            genotypes[name] = None
        else:
            genotypes[name] = alleles
    return genotypes

def next(self):
'''Return the next record in the file.'''
if self._samples is None:
Expand Down Expand Up @@ -449,7 +467,8 @@ def main():
vcf_file = VCFReader(sock, aggressive=True)
for record in vcf_file:
print record

print record
print vcf_file.genotypes(record)

if __name__ == '__main__':
main()