-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path007resolve_annotation_conflicts.py
75 lines (52 loc) · 2.34 KB
/
007resolve_annotation_conflicts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from utils import *
# Integer codes stored per base position in the per-strand bitmap.
INTERGENIC = 0
EXON = 1
INTRON = 2
EXON_NONCODING = 3

# Region type names written to the output, indexed by the codes above.
rtypes = ['intergenic', 'exon', 'intron', 'exon_noncoding']

OUTPUT_PATH = 'hg19.merged.to.ensg.all.tx.03.18.2011.txt.with.genetypes.final.txt.007noConflicts'

try:
    _positions = xrange  # Python 2: lazy range, no per-region index list is built
except NameError:
    _positions = range   # Python 3: range is already lazy


def _paint_regions(bitmap, regions):
    """Mark every base covered by ``regions`` in ``bitmap`` with a type code.

    Where regions overlap, the precedence is
    exon > exon_noncoding > intron > intergenic. Regions of any other type
    leave the bitmap untouched. ``start`` and ``end`` are inclusive indices
    into ``bitmap``.
    """
    seen = set()
    for reg in regions:
        # Painting is idempotent, so exact duplicates are skipped only to
        # save time. The set is local to one strand, so an identical region
        # on the opposite strand is NOT treated as a duplicate.
        key = (reg['start'], reg['end'], reg['type'])
        if key in seen:
            continue
        seen.add(key)

        span = _positions(reg['start'], reg['end'] + 1)
        if reg['type'] == 'exon':
            for i in span:
                bitmap[i] = EXON
        elif reg['type'] == 'exon_noncoding':
            for i in span:
                if bitmap[i] in (INTERGENIC, INTRON):
                    bitmap[i] = EXON_NONCODING
        elif reg['type'] == 'intron':
            for i in span:
                if bitmap[i] == INTERGENIC:
                    bitmap[i] = INTRON


def _collect_runs(bitmap, chrom, strand):
    """Return one region dict per maximal run of equal non-intergenic codes.

    Each dict has the keys 'type', 'start', 'end' (inclusive), 'strand' and
    'chrom'; the list is ordered by position.
    """
    merged = []
    size = len(bitmap)
    pos = 0
    while pos < size:
        code = bitmap[pos]
        run_start = pos
        while pos < size and bitmap[pos] == code:
            pos += 1
        if code != INTERGENIC:
            merged.append({'type': rtypes[code], 'start': run_start, 'end': pos - 1,
                           'strand': strand, 'chrom': chrom})
    return merged


def resolve_chromosome(regions, size, chrom):
    """Resolve overlapping annotations of one chromosome, strand by strand.

    regions -- iterable of dicts with 'start', 'end', 'type' and 'strand';
               only strands '+' and '-' are processed, others are ignored.
    size    -- bitmap length; must be greater than every region's 'end'
               (an IndexError is raised otherwise).
    chrom   -- chromosome name copied into every output region.

    Returns a list of non-overlapping region dicts (see ``_collect_runs``),
    all regions of one strand followed by all regions of the other.
    """
    # Same literal key order as before, so strand iteration order is kept.
    by_strand = {'+': [], '-': []}
    for reg in regions:
        if reg['strand'] in by_strand:
            by_strand[reg['strand']].append(reg)

    merged = []
    for strand in by_strand:
        # One byte per base instead of one list slot per base, allocated
        # fresh (all INTERGENIC) instead of being reset position by position.
        bitmap = bytearray([INTERGENIC]) * size
        # No sorting needed: the precedence rules make painting
        # independent of the order in which regions are applied.
        _paint_regions(bitmap, by_strand[strand])
        merged.extend(_collect_runs(bitmap, chrom, strand))
        del bitmap  # release before the next strand allocates its own
    return merged


def main():
    """Read the original annotation, resolve conflicts, dump the result as JSON.

    Relies on ``read_annotation_original``, ``elapsed`` and ``json`` coming
    from ``utils``. The annotation is used as a mapping from chromosome name
    to a list of region dicts.
    """
    anno = read_annotation_original()

    # Largest 'end' over all chromosomes; a plain loop so that a chromosome
    # without regions does not raise as max() on an empty sequence would.
    maxl = 0
    for chrom in anno:
        for reg in anno[chrom]:
            if reg['end'] > maxl:
                maxl = reg['end']
    size = maxl + 1
    elapsed('reading annotation')
    print('bitmap size: %s' % size)

    new_anno = {}
    for chrom in anno:
        new_anno[chrom] = resolve_chromosome(anno[chrom], size, chrom)
        elapsed(chrom)

    # Context manager guarantees the output is flushed and closed.
    with open(OUTPUT_PATH, 'w') as out:
        json.dump(new_anno, out, indent=1)
    elapsed('dump')


if __name__ == '__main__':
    main()