-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf_pysam_allele_pileup.py
executable file
·135 lines (97 loc) · 5.32 KB
/
vcf_pysam_allele_pileup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
import gzip
from VcfFile import *
from VcfMetaLines import FormatLine
from optparse import OptionParser
from collections import Counter
import os
import pysam
def main():
    """Annotate a VCF with reference/alternate allele counts from a BAM pileup.

    Command line: ``prog [options] file.vcf.gz`` with a required ``--bam``.
    For every record of the gzipped VCF, the reads of the indexed BAM that
    cover the variant site are piled up and the observed bases are counted.
    The record gains ``RA``/``AA`` INFO entries (totals over all reads) and
    ``RA``/``AA`` FORMAT entries (per sample, resolved through the BAM read
    groups).  The updated header and all records are written to stdout.

    Filters: reads below ``--mapq``, bases below ``--bq`` (Phred+33) and,
    unless ``--includeDuplicates`` is given, duplicate-marked reads are
    ignored.

    Notes:
        * Only single-base alleles can match an observed read base; a
          multi-base or multi-allelic REF/ALT is reported with a count of 0.
        * Records whose site has no pileup column are still written, with
          zero totals and missing per-sample values.

    Exits with status 1 when ``--bam`` or the BAM index is missing, and via
    ``parser.error`` (status 2) when no VCF file is given.
    """
    # ``sys`` is not imported explicitly by the file's visible import list
    # (it may only leak in through ``from VcfFile import *``), so bind it here.
    import sys

    usage = "usage: %prog [option] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--bam", type="string", dest="bam", default=None, help="bam file to perform pileup on")
    parser.add_option("--mapq", type="float", dest="mapq", default=0., help="Exclude alignments from analysis if they have a mapping less than mapq (default is 0)")
    parser.add_option("--bq", type="float", dest="bq", default=0., help="Exclude bases from analysis if their supporting base quality is less that --bq (default is 0)")
    # store_true with an explicit default: duplicates are excluded unless the
    # flag is given (the previous store_false/no-default combination did the
    # opposite of what the option name and help text promise).
    parser.add_option("--includeDuplicates", action="store_true", dest="duplicate", default=False, help="include duplicate marked reads in analysis (turned off by default) ")
    (options, args) = parser.parse_args()

    if options.bam is None:
        sys.stderr.write("please provide a value to --bam option\n")
        sys.exit(1)
    if not args:
        parser.error("please provide a gzipped VCF file")

    vcfilename = args[0]
    bamfilename = options.bam

    # The index may be named either sample.bam.bai or sample.bai.
    index_candidates = (bamfilename + ".bai", os.path.splitext(bamfilename)[0] + ".bai")
    if not any(os.path.exists(path) for path in index_candidates):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        sys.exit(1)

    ra_formatline = FormatLine("RA", number='1', type='Integer', description='number of reference alleles observed')
    aa_formatline = FormatLine("AA", number='1', type='Integer', description='number of alternate alleles observed')

    vcfobj = VcfFile(vcfilename)
    vcfh = gzip.open(vcfilename, 'r')
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        vcfobj.addMetaFormatHeader(ra_formatline)
        vcfobj.addMetaFormatHeader(aa_formatline)
        vcfobj.addMetaInfoHeader("RA", "Integer", "1", "total number of reference alleles observed")
        vcfobj.addMetaInfoHeader("AA", "Integer", "1", "total number of alternate alleles observed")
        # sys.stdout.write keeps the body valid under both Python 2 and 3.
        sys.stdout.write(str(vcfobj.returnHeader()) + "\n")

        samples = vcfobj.getSampleList()
        pybamfile = pysam.Samfile(bamfilename, "rb")
        try:
            readgroup_to_sample = _read_group_samples(pybamfile)
            if not readgroup_to_sample:
                sys.stderr.write("warning: no usable @RG lines in bam header; per-sample counts will be missing\n")
            for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
                # VCF POS is 1-based; pysam coordinates are 0-based.
                site = int(vrec.getPos()) - 1
                total, per_sample = _count_bases_at_site(
                    pybamfile, vrec.getChrom(), site, samples,
                    readgroup_to_sample, options)
                _annotate_record(vrec, samples, total, per_sample)
                sys.stdout.write(str(vrec.toStringwithGenotypes()) + "\n")
        finally:
            pybamfile.close()
    finally:
        vcfh.close()


def _read_group_samples(bam):
    """Return a dict mapping read-group ID -> sample name from the BAM header."""
    try:
        read_groups = bam.header['RG']
    except KeyError:
        return {}
    return dict((rg['ID'], rg['SM']) for rg in read_groups
                if 'ID' in rg and 'SM' in rg)


def _count_bases_at_site(bam, chrom, site, samples, readgroup_to_sample, options):
    """Count read bases at 0-based position ``site`` of ``chrom``.

    Returns ``(total, per_sample)``: a Counter of bases over all reads that
    pass the filters, and a dict of sample name -> Counter of bases.  Both
    are empty when no read covers the site.
    """
    total = Counter()
    per_sample = dict((name, Counter()) for name in samples)
    # pileup() yields a column for every position covered by the reads that
    # overlap [site, site + 1); only the column at the variant site matters.
    for column in bam.pileup(chrom, site, site + 1):
        if column.pos != site:
            continue
        for pileupread in column.pileups:
            if pileupread.is_del:
                # The read has a deletion here, so there is no base to count.
                continue
            aln = pileupread.alignment
            if aln.is_duplicate and not options.duplicate:
                continue
            if aln.mapq < options.mapq:
                continue
            qpos = pileupread.qpos
            # Base qualities are Phred+33 encoded characters.
            if (ord(aln.qual[qpos]) - 33) < options.bq:
                continue
            base = aln.seq[qpos]
            total[base] += 1
            # Reads whose read group is absent or maps to a sample that is
            # not in the VCF only contribute to the totals.
            sample = readgroup_to_sample.get(dict(aln.tags).get('RG'))
            if sample in per_sample:
                per_sample[sample][base] += 1
        break  # the site's column has been processed; skip the rest
    return total, per_sample


def _annotate_record(vrec, samples, total, per_sample):
    """Add the RA/AA INFO and per-sample FORMAT entries to ``vrec`` in place."""
    ref = vrec.getRef()
    alt = vrec.getAlt()
    # Counter lookups return 0 for alleles that were never observed,
    # including multi-base alleles that cannot match a single read base.
    vrec.addInfo("RA=" + str(total[ref]))
    if alt != ".":
        vrec.addInfo("AA=" + str(total[alt]))
    for (sample, vcfgenobj) in vrec.zipGenotypes(samples):
        counts = per_sample[sample]
        if not counts:
            # No usable reads for this sample: register the keys without a
            # value (presumably rendered as missing -- confirm in the
            # genotype class).
            vcfgenobj.addFormat("RA")
            vcfgenobj.addFormat("AA")
            continue
        vcfgenobj.addFormatVal('RA', str(counts[ref]))
        vcfgenobj.addFormatVal("AA", str(counts[alt]))
# Run the annotator only when this file is executed as a script.
if __name__ == "__main__":
    main()