-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlocalAlignment.py
139 lines (100 loc) · 3.96 KB
/
localAlignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import sys
filedata = open(sys.argv[1]).read().split()
v = filedata[0]
w = filedata[1]
sigma = 5
del filedata
def getScoreMat(filename):
#Generate the scoring matrix dict
filedata = open(filename).readlines()
scoringMatrix = {}
refs = filedata[0].strip().split()
for line in filedata[1:]:
line_data = line.strip().split()
for i,ref in enumerate(refs):
pair = (ref,line_data[0])
scoringMatrix[pair] = int(line_data[i+1])
return scoringMatrix
def LCS(v,w,sigma,scoremat):
#generate the scores to use for the backtracking
scores = {}
#initialize the scores dict where i or j == 0
i_j0 = dict([((i,0),-i*sigma) for i in range(len(v)+1)])
i0_j = dict([((0,j),-j*sigma) for j in range(len(w)+1)])
scores.update(i_j0)
scores.update(i0_j)
max_score = (0,0,0) #initalize the maximum score (i,j,highest score)
for i in range(1,len(v)+1):
for j in range(1,len(w)+1):
mu = scoremat[(v[i-1],w[j-1])]
max_val = max([scores[(i-1,j)]-sigma,scores[(i,j-1)]-sigma,scores[(i-1,j-1)] + mu,0])
scores[(i,j)] = max_val
#if max_val is greater than the max value stored in max_score then
#set (i,j,max_val) as the new max_score variable
if max_val > max_score[2]:
max_score = (i,j,max_val)
return max_score,scores
def findConserved(v,w,max_score,backtracking):
#Find the conserved regions in v and w
i = max_score[0]
j = max_score[1]
#get the v and w strings up to the indices of the max scoring node
v_partial = v[:i]
w_partial = w[:j]
#keep adjusting the values of i and j until either one of them equals zero
#or the source node is reached
while i*j != 0 and backtracking[(i,j)] != 0:
if backtracking[(i,j)] == backtracking[(i-1,j)]-sigma:
i -= 1
elif backtracking[(i,j)] == backtracking[(i,j-1)]-sigma:
j -= 1
else:
j -= 1
i -= 1
#get the v and w strings from the adjusted i and j index values
v_conserved = v_partial[i:]
w_conserved = w_partial[j:]
return v_conserved,w_conserved
def alignment(v,w,max_score,backtracking):
#Adjust the strings v and w according to the backtracking dict
i = max_score[0]
j = max_score[1]
#i_diff and j_diff contain the differences between the indices of the max
#scoring node and the length of their respective strings
i_diff = i-len(v)
j_diff = j-len(w)
v_temp = list(v)
w_temp = list(w)
#keep adjusting the values of i and j until one of them equals zero
while i*j != 0:
if backtracking[(i,j)] == backtracking[(i-1,j)]-sigma:
i -= 1
w_temp.insert(j-j_diff,'-')
elif backtracking[(i,j)] == backtracking[(i,j-1)]-sigma:
j -= 1
v_temp.insert(i-i_diff,'-')
else:
j -= 1
i -= 1
#add any dashes to the front of the v and w strings if necessary
v_with_dashes = ['-' for dash in range(j-j_diff)]
w_with_dashes = ['-' for dash in range(i-i_diff)]
v_with_dashes.extend(v_temp)
w_with_dashes.extend(w_temp)
return ''.join(v_temp),''.join(w_temp)
def localAlignment(v,w,sigma,scoremat):
#Generate the global alignment of v and w using sigma and the input scoring matrix
#max_score contains the indices and score of the highest scoring node
max_score,backtracking = LCS(v,w,sigma,scoremat)
v_conserved,w_conserved = findConserved(v,w,max_score,backtracking)
v_aligned,w_aligned = alignment(v_conserved,w_conserved,max_score,backtracking)
return '\n'.join([str(max_score[2]),''.join(v_aligned),''.join(w_aligned)])
scoremat = getScoreMat('PAM250.txt')
output = localAlignment(v,w,sigma,scoremat)
#write output to file and open
filename = 'ANS_'+sys.argv[1]
fh = open(filename,'w')
fh.write(output)
fh.close()
import webbrowser
webbrowser.open(filename)