-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf_TwoBitExtract.py
executable file
·77 lines (61 loc) · 2.81 KB
/
vcf_TwoBitExtract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
import sys
import os
import string
import re
from optparse import OptionParser
from VcfFile import *
import bx.seq.twobit
""" given a 2bit file and a VCF extract out sequence from the 2bit file. Please note 2bit file extracton works on zero-based, half-open interval
please note, as of now only SNPs are extracted. Complex variant types like indels, MNPs, SVs are ignored for now...."""
def main():
    """Print the reference sequence at the position of matching VCF records.

    Command line: ``[options] file.vcf``

    Options:
        --tbf   path of the 2bit file the sequence is read from (required).
        --pad   number of flanking bases to report on each side of the
                position; 0 (the default) omits the flanking columns.
        --info  INFO tag that annotates the variant type (default ``TYPE``).
        --type  variant type to keep (default ``snp``).  The value is pasted
                into a regular expression unescaped, so regex syntax in it
                is interpreted as such.

    A record is kept only when its INFO string matches the regular
    expression ``<info>=(<type>)``.  For every kept record one tab-separated
    line is written to stdout:

        chrom, start, end, base

    where ``chrom`` is the VCF chromosome name prefixed with ``chr`` and
    ``[start, end)`` is the zero-based, half-open interval ``[pos-1, pos)``.
    The base is upper-cased.  With a non-zero pad six more columns follow:

        up_start, up_end, up_seq, down_start, down_end, down_seq

    for the intervals ``[pos-1-pad, pos-1)`` and ``[pos, pos+pad)``.  The
    flanking sequences are reported exactly as returned by the 2bit reader
    (they are not upper-cased).  ``up_start`` is clamped to 0 so a window
    that would begin before the first base is shortened instead of turning
    into a negative slice index.

    Progress and error messages go to stderr.  Records whose sequence cannot
    be fetched are reported and skipped.

    Exits with status 2 on a usage error (via ``OptionParser.error``) and
    with status 1 when the 2bit file cannot be opened.
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--tbf", type="string", dest="tbf", help="2bit file")
    parser.add_option("--pad", type="int", dest="pad", default=0, help="extract sequence upstream and downstream of position by pad value")
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default='snp')
    (options, args) = parser.parse_args()

    # Reject unusable command lines before touching any file, so the user
    # gets a usage message instead of an IndexError or a misleading
    # "unable to open" error.
    if options.tbf is None:
        parser.error("a 2bit file must be given with --tbf")
    if not args:
        parser.error("a VCF file must be given")
    if options.pad < 0:
        parser.error("--pad must not be negative")

    # The record filter never changes, so compile it once.
    pattern = re.compile(options.infotag + '=(' + options.variantype + ')')

    # open 2bitfile (a binary format, hence 'rb')
    sys.stderr.write("opening twobitfile...\n")
    try:
        tbfh = open(options.tbf, 'rb')
        twobit = bx.seq.twobit.TwoBitFile(tbfh)
    except Exception as err:
        # Top-level boundary: the reader's exception types are not visible
        # here, so report whatever went wrong and stop.
        sys.stderr.write("unable to open twobit file!\n%s\n" % err)
        sys.exit(1)

    try:
        # open the vcf file
        with open(args[0], 'r') as vcfh:
            vcfobj = VcfFile(vcfh)
            vcfobj.parseMetaAndHeaderLines(vcfh)

            for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
                if pattern.search(vrec.getInfo()) is None:
                    continue

                chrom = 'chr' + vrec.getChrom()
                pos = int(vrec.getPos())
                # VCF positions are 1-based; 2bit slicing is 0-based, half-open.
                start, end = pos - 1, pos

                if start < 0:
                    # A negative index would be interpreted relative to the
                    # end of the sequence and return unrelated bases.
                    sys.stderr.write("skipping %s:%d, position is before the first base\n" % (chrom, pos))
                    continue

                try:
                    chrom_seq = twobit[chrom]
                    fields = [chrom, str(start), str(end), chrom_seq[start:end].upper()]
                    if options.pad != 0:
                        up_start = max(0, start - options.pad)
                        down_end = end + options.pad
                        fields.extend([
                            str(up_start), str(start), chrom_seq[up_start:start],
                            str(end), str(down_end), chrom_seq[end:down_end],
                        ])
                except Exception:
                    # Skip the record rather than emit a line carrying the
                    # sequence of a previously processed record.
                    sys.stderr.write("unable to fetch sequence from 2bit file!\n")
                    sys.stderr.write("skipping %s:%d\n" % (chrom, pos))
                    continue

                sys.stdout.write("\t".join(fields) + "\n")
    finally:
        tbfh.close()
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()