extsam.py
import sys
import collections
import bisect as bs

import numpy as np
import pysam


def mapToRefSite(refSiteList, mappedPos):
    """Find the range [left, right] of restriction-site indices matched by a
    mapped fragment [start, end], using a 10 bp tolerance heuristic."""
    # Use bisect to quickly find a candidate index for the fragment start
    pos = bs.bisect_left(refSiteList, mappedPos)
    refLen = len(refSiteList) - 1
    # Don't bother beyond the last position in the list
    pos = min(pos, refLen)
    # Move the start and end indices outward to match the 10 bp heuristic
    left = pos
    right = pos
    while refSiteList[left][0] >= mappedPos[0] + 10 and left > 0:
        left -= 1
    while refSiteList[right][1] <= mappedPos[1] - 10 and right < refLen:
        right += 1
    return [left, right]
# Small self-test for mapToRefSite (kept commented out):
# reflist = [[100, 104], [110, 120], [140, 144],
#            [200, 204], [210, 220], [240, 244],
#            [300, 304], [310, 320], [340, 344]]
# placelist = [[130, 200], [205, 255], [295, 360], [270, 280]]
# for thisplace in placelist:
#     print(mapToRefSite(reflist, thisplace))
# exit()
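# Note: the data-structure assumptions below are inferred from how this script
# indexes its inputs, not from separate documentation. The 'restrsites' entry in
# the .npz given as sys.argv[2] is expected to be a dict of per-chromosome,
# position-sorted restriction-site coordinates, e.g.
#     {'chr1': [[100, 104], [110, 120], ...], 'chr2': [...]}
# and each alignment's query name is expected to carry the mother-read ID as a
# semicolon-separated "key:value" field named RD, e.g. '...;RD:42;...'.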
print('Loading restrsites, this takes a while...')
# The restriction-site dict is stored as a pickled object inside the .npz,
# so unpack it with .item() (allow_pickle is required on recent numpy)
restrefs = np.load(sys.argv[2], allow_pickle=True)['restrsites'].item()
print('Finished loading, moving on')

insam = sys.argv[1]
samfile = pysam.AlignmentFile(insam, "rb")

# Prime the loop with the first alignment; it only serves as the initial
# comparison read and is not itself mapped to restriction sites
prevRead = next(samfile)
prevResult = [-1, -1]
prevID = ''
curID = ''
curStack = []
byReads = collections.defaultdict(list)
for read in samfile:
    if read.is_unmapped:
        continue
    if read.reference_name not in restrefs:
        continue

    # Map the aligned fragment onto a range of restriction-site indices
    result = mapToRefSite(restrefs[read.reference_name],
                          [read.reference_start,
                           read.reference_start + read.infer_query_length(always=True)])
    # The mother-read ID is stored in the query name as a ';'-separated "RD:<id>" field
    curID = int(dict(item.split(":") for item in read.query_name.split(";"))['RD'])

    # If two subsequent reads:
    #   were mapped to the same chromosome,
    #   and to the same strand,
    #   and with at least one overlapping restriction site,
    #   and have the same mother read...
    if prevRead.reference_id == read.reference_id \
            and prevRead.is_reverse == read.is_reverse \
            and result[0] <= prevResult[1] and result[1] >= prevResult[0] \
            and curID == prevID:
        # ...extend the current stack of fragments
        curStack.append((read, result))
    else:
        # Otherwise flush the previous stack: register its mother-read ID on
        # every restriction site the stacked fragments span
        if curStack:
            for i in range(min(x[1][0] for x in curStack),
                           max(x[1][1] for x in curStack) + 1):
                restrefs[prevRead.reference_name][i].append(prevID)
        curStack = [(read, result)]

    prevRead = read
    prevResult = result
    prevID = curID

# Flush the stack left over after the last alignment
if curStack:
    for i in range(min(x[1][0] for x in curStack),
                   max(x[1][1] for x in curStack) + 1):
        restrefs[prevRead.reference_name][i].append(prevID)
for key in restrefs:
    curList = restrefs[key]
    keyLen = len(curList) - 1
    # Turn restriction-site locations into regions: region i runs from the
    # midpoint of site i to the midpoint of site i+1
    for i in range(keyLen):
        curList[i][0] = sum(curList[i][:2]) // 2
        curList[i][1] = sum(curList[i + 1][:2]) // 2
    # Drop regions that collected no read IDs; keep the rest as
    # ((start, end), [read IDs])
    for x in range(keyLen, -1, -1):
        if len(curList[x]) <= 2:
            curList.pop(x)
        else:
            curList[x] = ((curList[x][0], curList[x][1]), curList[x][2:])
    # Make links from read IDs back to (chromosome, region index)
    for i, val in enumerate(curList):
        for x in val[1]:
            byReads[x].append((key, i))

np.savez_compressed(sys.argv[3], byregion=restrefs, byread=dict(byReads))
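
# A minimal sketch (commented out) of how the saved .npz could be read back.
# The path is hypothetical, and the .item() unpacking mirrors how this script
# loads its own input above; it is an assumption, not a documented interface.
#
# out = np.load('path/to/output.npz', allow_pickle=True)  # hypothetical path
# byregion = out['byregion'].item()  # {chrom: [((start, end), [read IDs]), ...]}
# byread = out['byread'].item()      # {read ID: [(chrom, region index), ...]}
# for readID, regions in list(byread.items())[:5]:
#     print(readID, regions)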