-
Notifications
You must be signed in to change notification settings - Fork 0
/
solver.h
executable file
·363 lines (277 loc) · 8.02 KB
/
solver.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#pragma once
#include <string>
#include <iostream>
using namespace std;
/*
This class receive two DNA sequences with arbitrary length,
and align them by Smith-Waterman algorithm.
The parameters needed in the algorithm are also needed in parameter table
of constructor, check the constructor for more information.
This class also detect illegal character within the sequence and
record useful statistics about the alignment.
*/
class solver
{
private:
//For memory concern, we require that one instance of solver
// are used only once.
//This will be assigned with true in method solve()
bool computed;
protected:
//the two DNA sequence to align
char * seq_a;
char * seq_b;
//similarity matrix
double ** sim;
//used in penalty fucntion
double pen_a;
double pen_b;
//scoring matrix
double ** H;
//final result
char * aliSeq_a;
char * aliSeq_b;
//record the maximum of the scoring matrix while running ComputeH()
int mx;
int my;
double maxScore;
//convert acid to number for indexing
int acid2int(char acid);
//penalty function: return gap penalty of k-gap
double pen(int k);
//this function computer H(scoring matrix) following the Smith-Waterman algorithm
//check the detail in the README file
void ComputeH();
//This function return a signal for every step of BackTrack as direction
char TrackStep(int x, int y);
//This function implement the stage of tracking-back
void BackTrack();
public:
//Strings of aligned sequences, asigned by aliSeq_a and aliSeq_b after alignment
//The pointers to aligned sequences are otherwise set in public module of this class
// to avoid memory leaks and exceptions in deconstractor.
char * result_a;
char * result_b;
//length of aligned sequences, supposed to be the same for both of them
int aliLen;
//the number of matched pairs in aligned sequences
int matchCount;
//length of the two unaligned DNA
int length_a;
int length_b;
//constructor, receiving parameters:
// squence_a: char* pointer to sequence A.
// squence_b: char* pointer to sequence B.
// similarity: char** pointer to a 4x4 matrix of similarity of 4 base.
// penalty_opening: parameter of gap penalty, see method pen().
// penalty_extension: parameter of gap penalty, see method pen().
solver(char* sequence_a, char* sequence_b, double ** similarity, double penalty_opening, double penalty_extension);
//deconstructor, delete H, aliSeq_a and aliSeq_b
~solver();
//start the process of alignment, this method is called only once
void solve();
};
int solver::acid2int(char acid)
{
switch (acid)
{
case 'A':return 0;
case 'C':return 1;
case 'G':return 2;
case 'T':return 3;
default:throw("runtime error: find illegal character in DNA sequence");
}
}
double solver::pen(int k)
{
if (k <= 0)
throw("runtime error: gap penalty function non-positive receive parameter.");
return k*pen_a + pen_b;
}
void solver::ComputeH()
{
for (int i = 1; i<length_a; i++)
for (int j = 1; j < length_b; j++)
{
double * scrArr = new double[i + j + 1];
//this array contains i+j+1 prospective maximum
// that will be assigned to H[i][j]
//calculate the i+j+1 values of scrArr
for (int k = 1; k <= i; k++)
{
scrArr[k] = H[i - k][j] - pen(k);
}
for (int k = 1; k <= j; k++)
{
scrArr[k + i] = H[i][j - k] - pen(k);
}
double similarity = sim[acid2int(seq_a[i - 1])][acid2int(seq_b[j - 1])];
scrArr[0] = H[i - 1][j - 1] + similarity;
//select the maximum of scrArr
double maxScrArr = scrArr[0];
for (int k = 1; k < i + j + 1; k++)
{
if (scrArr[k] >= maxScrArr)
maxScrArr = scrArr[k];
}
//assign H[i][j] with maximum of scrArr
// and record the current maximum of H for the later BackTrack
H[i][j] = maxScrArr;
if (maxScrArr > maxScore)
{
mx = i;
my = j;
maxScore = maxScrArr;
}
delete[] scrArr;
}
}
void solver::solve()
{
//As an instance of solve is for once-only-use,
//the second call on solve() will be terminated here.
if (computed)
{
return;
}
else
{
computed = true;
}
ComputeH();
BackTrack();
result_a = aliSeq_a;
result_b = aliSeq_b;
}
solver::~solver()
{
//We don't delete double ** sim here
// because its memory is allocated outside this class
//delete scoring matrix
for (int i = 0; i < length_a; i++)
delete[] H[i];
delete[] H;
//try this part as it's possible that user have deleted the aligned sequence strings
// by char * result_a or char * result_b before deconstruction
try
{
delete[] aliSeq_a;
delete[] aliSeq_b;
}
catch (exception)
{
throw("runtime error: the aligned sequence strings are already delete before solver's deconstruction");
}
}
solver::solver(char* sequence_a, char* sequence_b, double ** similarity, double penalty_opening, double penalty_extension)
{
//check parameters
length_a = strlen(sequence_a);
length_b = strlen(sequence_b);
if (length_a <= 0 || length_b <= 0 || penalty_extension <= 0 || penalty_opening < 0)
throw("initilization error: parameter is wrongly set!");
//initialize class attributes
computed = false;
aliLen = 0;
matchCount = 0;
//It's unlikely to validate the char ** similarity.
//Exception should be catched, if any
sim = similarity;
if (sequence_a == nullptr || sequence_b == nullptr)
throw("initilization error: pointer is null!");
seq_a = sequence_a;
seq_b = sequence_b;
pen_a = penalty_extension;
pen_b = penalty_opening;
//The lengths increase by one for convenience of constructing the scoring matrix and
length_a += 1;
length_b += 1;
mx = 0;
my = 0;
maxScore = 0;
H = new double *[length_a];
for (int i = 0; i < length_a; i++)
{
H[i] = new double[length_b];
memset(H[i], 0, length_b * sizeof(double));
}
aliSeq_a = new char[length_a + length_b];
aliSeq_b = new char[length_a + length_b];
memset(aliSeq_a, 0, (length_a + length_b) * sizeof(char));
memset(aliSeq_b, 0, (length_a + length_b) * sizeof(char));
/*
we set the length of aligned sequence to the sum of two DNA,
to handle the worst situation where no pairs are matched.
*/
result_a = nullptr;
result_b = nullptr;
}
void solver::BackTrack()
{
//load the maximum and its coordinate of H
// x,y are dynamical in the process of BackTrack
int x = mx;
int y = my;
// i pointing to current blank position of aliSeq_a and aliSeq_b
int i = 0;
//step dynamically denotes the direction of next step
char step = 'D';
//The process of BackTrack
while (step!='E')
{
step = TrackStep(x, y);
switch (step)
{
case 'D':aliSeq_a[i] = seq_a[x - 1];
aliSeq_b[i] = seq_b[y - 1];
i++;
x--; y--;
break;
case 'L':aliSeq_a[i] = '-';
aliSeq_b[i] = seq_b[y - 1];
i++;
y--;
break;
case 'U':aliSeq_a[i] = seq_a[x - 1];
aliSeq_b[i] = '-';
i++;
x--;
break;
case 'E':aliSeq_a[i] = seq_a[x - 1];
aliSeq_b[i] = seq_b[y - 1];
i++;
break;
}
}
aliSeq_a[i] = '\0';
aliSeq_b[i] = '\0';
aliLen = i;
//The length of aligned sequences are determined by current i
i--;
//make i pointing to the last character of aligned sequences
//reverse the two aligned strings and calculate matchCount
for (int j = 0; j <= i; j++, i--)
{
if (aliSeq_a[j] == aliSeq_b[j] && i != j)
matchCount++;
if (aliSeq_a[i] == aliSeq_b[i])
matchCount++;
swap(aliSeq_a[j], aliSeq_a[i]);
swap(aliSeq_b[j], aliSeq_b[i]);
}
}
char solver::TrackStep(int x, int y)
{
if (H[x-1][y-1]==0|| H[x][y - 1] == 0 || H[x - 1][y] == 0)
return 'E';
//'E' means ending the process of BackTrak as it comes a 0 in the neighbourhood
if (H[x - 1][y - 1] >= H[x - 1][y] && H[x - 1][y - 1] >= H[x][y - 1])
return 'D';
//'D' means go upper-left (diagonally)
if (H[x - 1][y] > H[x][y - 1])
return 'U';
//'U' means go up
else
return 'L';
//'L' means go left
}