-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf_intersect.py
executable file
·148 lines (108 loc) · 5.02 KB
/
vcf_intersect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/python
import sys
import os
import string
import re
from optparse import OptionParser
from VcfFile import *
from bx.bitset import *
from bx.bitset_builders import *
def binned_bitsets_from_vcffile( vcfilename, chrom_col=0, start_col=1, upstream_pad=0, downstream_pad=0, lens=None ):
    """
    Read a VCF file into a dictionary of bitsets keyed by chromosome name.

    - 'vcfilename' is the path of the VCF file to read.
    - 'chrom_col' and 'start_col' are accepted for signature compatibility
      but are not used: chromosome and position come from the parsed record.
    - 'upstream_pad' / 'downstream_pad' widen every interval by that many
      bases; the start is clamped at 0 and the end at the chromosome size.
    - if 'lens' maps a (chr-prefixed) chromosome name to a size, that size is
      used for its bitset, otherwise the maximum size is assumed.
    - records whose FILTER is neither 'PASS' nor '.' are skipped.
    - chromosome names are always prefixed with "chr" before being used as
      dictionary keys.
    - each record at 1-based position POS becomes the zero-based, half-open
      interval [POS-1, POS) before padding.

    Returns the dictionary mapping chromosome name to BinnedBitSet.
    """
    if lens is None:
        lens = {}
    MAX = 2147483647
    bitsets = dict()
    last_chrom = None
    last_bitset = None
    size = MAX
    vcfobj = VcfFile(vcfilename)
    # The context manager closes the handle even when a record fails to parse.
    with open(vcfilename, 'r') as fh:
        for vrec in vcfobj.yieldVcfRecord(fh):
            filtercode = vrec.getFilter()
            chrom = vrec.getChrom()
            pos = int( vrec.getPos() )
            # Keep only passing records; '.' means no filter was applied.
            if filtercode != 'PASS' and filtercode != '.':
                continue
            chrom = "chr" + chrom
            if chrom != last_chrom:
                # Refresh the size on every chromosome switch, not only when
                # the bitset is first created; otherwise a chromosome seen
                # again later would be clamped with the previous one's size.
                if chrom in lens:
                    size = lens[chrom]
                else:
                    size = MAX
                if chrom not in bitsets:
                    bitsets[chrom] = BinnedBitSet( size )
                last_chrom = chrom
                last_bitset = bitsets[chrom]
            start, end = (pos - 1, pos)
            if upstream_pad:
                start = max( 0, start - upstream_pad )
            if downstream_pad:
                end = min( size, end + downstream_pad )
            if start > end:
                warn( "Interval start after end!" )
            last_bitset.set_range( start, end - start )
    return bitsets
def main():
    """
    Command-line entry point: print the records of the first VCF file that
    overlap (or, with --v, do not overlap) the regions of a second VCF or
    BED file.

    The second file's type is chosen from its extension ('bed' or 'vcf').
    Unless --noheader is given, the header of the first VCF is parsed and
    echoed to stdout before the selected data lines.

    Exits through the option parser (status 2, usage printed to stderr) when
    fewer than two files are given or the second file has an unsupported
    extension.
    """
    usage = "usage: %prog [options] vcf_file_one vcf|bed_file_two\n\nFind regions in the first vcf file that overlap regions of the second vcf or bed file\n"
    parser = OptionParser(usage)
    parser.add_option("--minCols", type="int", dest="mincols", default=1, help="mininum basepair overlap (default is one)")
    parser.add_option("--v", action="store_true", dest="reverse", help="Print regions in first vcf that DO NOT overlap second vcf|bed file")
    parser.add_option("--filter", type="string", dest="filter", default=None, help="intersect records only set with filter (default is None")
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file one has no header line", default=False)
    parser.add_option("--nochrprefix", action="store_false", dest="chrprefix", help="use if the bed doesn't have chr prefix in chrom column", default=True)
    (options, args) = parser.parse_args()

    # Validate the command line up front so that bad input yields a usage
    # message instead of an IndexError / NameError further down.
    if len(args) < 2:
        parser.error("expected two input files: vcf_file_one and vcf|bed_file_two")
    vcf_file_one = args[0]
    in2_fname = args[1]
    in2_fname_ext = os.path.splitext(in2_fname)[1][1:]
    if in2_fname_ext not in ("bed", "vcf"):
        parser.error("second file must have a .bed or .vcf extension: " + in2_fname)

    sys.stderr.write("intersecting two files ...\n")

    if in2_fname_ext == "bed":
        with open(in2_fname) as bedfh:
            bitsets = binned_bitsets_from_file(bedfh)
    else:
        # The --filter value used to be passed here positionally, where it
        # landed in the unused 'chrom_col' parameter; it is dropped because
        # it never influenced the result.
        bitsets = binned_bitsets_from_vcffile(in2_fname)

    # Build the variant-type matcher once rather than once per record.
    type_regex = None
    if options.variantype is not None:
        type_regex = re.compile(options.infotag + '=(' + options.variantype + ')')

    vcfobj = VcfFile(vcf_file_one)
    with open(vcf_file_one, 'r') as vcfh:
        if not options.noheader:
            vcfobj.parseMetaAndHeaderLines(vcfh)
            header = vcfobj.returnHeader()
            # Only echo the header when one was actually parsed.
            sys.stdout.write("%s\n" % (header,))

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split('\t')
            (chrom, pos, _vid, _ref, _alt, _qual, filtercode, info) = fields[0:8]
            # 1-based VCF position -> zero-based, half-open interval.
            start = int(pos) - 1
            end = int(pos)

            # Skip records that do not carry the requested FILTER code.
            if options.filter is not None and filtercode != options.filter:
                continue

            # Skip records that are not of the requested variant type.
            if type_regex is not None and type_regex.search(info) is None:
                continue

            if options.chrprefix:
                chrom = "chr" + chrom

            overlaps = (chrom in bitsets and
                        bitsets[chrom].count_range(start, end - start) >= options.mincols)
            # Normal mode prints overlapping records; --v prints the rest.
            if bool(overlaps) != bool(options.reverse):
                sys.stdout.write(dataline + "\n")
# Run the intersection only when executed as a script, not when imported.
if __name__ == "__main__":
    main()