-
Notifications
You must be signed in to change notification settings - Fork 0
/
bicgstabBLASD.cpp
122 lines (100 loc) · 3.4 KB
/
bicgstabBLASD.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
/**************************************************************************
BiCGSTABBLASD - double precision
Algorithm 12, Parameter-free iterative linear solver by R. Weiss, 1996
system of linear equations: aa(i,j)*x(j) = b(i)
Version using BLAS library on GPU
TWS, March 2011
**************************************************************************/
#include <cuda_runtime.h>
#include <cublas.h>
#include <stdio.h>
double bicgstabBLASD(double **a, double *b, double *x, int n, double eps, int itmax)
{
extern int useGPU;
extern double *h_x,*h_b,*h_a,*h_rs;
extern double *d_a, *d_res, *d_x, *d_b;
extern double *d_r, *d_rs, *d_v, *d_s, *d_t, *d_p, *d_er;
const int nmem0 = sizeof(double); //needed for memcpy
const int nmem1 = n*sizeof(double);
const int nmem2 = n*n*sizeof(double);
int i,j,kk,ierr;
double lu,lunew,beta,delta,gamma1,t1,t2,err;
for(j=0; j<n; j++){
for(i=0; i<n; i++) h_a[i*n + j] = a[i+1][j+1];
h_x[j] = x[j+1];
h_b[j] = b[j+1];
h_rs[j] = 1.;
}
cudaSetDevice( useGPU-1 );
cudaMemcpy(d_a, h_a, nmem2, cudaMemcpyHostToDevice);//copy variables to GPU
cudaMemcpy(d_x, h_x, nmem1, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, nmem1, cudaMemcpyHostToDevice);
cudaMemcpy(d_rs, h_rs, nmem1, cudaMemcpyHostToDevice);
// r[i] += a[i][j]*x[j];
cublasDgemv('T', n, n, 1.f, d_a, n, d_x, 1, 0.f, d_r, 1);
// r[i] -= b[i];
cublasDaxpy (n, -1.f, d_b, 1, d_r, 1);
// p[i] = r[i];
cublasDcopy (n, d_r, 1, d_p, 1);
// lu += r[i]*rs[i];
lu = cublasDdot (n, d_r, 1, d_rs, 1);
kk = 1;
do{
// v[i] += a[i][j]*p[j];
cublasDgemv('T', n, n, 1.f, d_a, n, d_p, 1, 0.f, d_v, 1);
// t1 += v[i]*rs[i];
t1 = cublasDdot (n, d_v, 1, d_rs, 1);
if(t1 == 0){
printf("t1 = 0, reset to 1e-12\n");
t1 = 1.e-12; //added January 2012
}
delta = -lu/t1;
// s[i] = r[i] + delta*v[i];
cublasDcopy (n, d_r, 1, d_s, 1);
cublasDaxpy (n, delta, d_v, 1, d_s, 1);
// t[i] += a[i][j]*s[j];
cublasDgemv('T', n, n, 1.f, d_a, n, d_s, 1, 0.f, d_t, 1);
// t1 += s[i]*t[i];
t1 = cublasDdot (n, d_t, 1, d_s, 1);
// t2 += t[i]*t[i];
t2 = cublasDdot (n, d_t, 1, d_t, 1);
if(t2 == 0){
printf("t2 = 0, reset to 1e-12\n");
t2 = 1.e-12; //added January 2012
}
gamma1 = -t1/t2;
// r[i] = s[i] + gamma1*t[i];
cublasDcopy (n, d_s, 1, d_r, 1);
cublasDaxpy (n, gamma1, d_t, 1, d_r, 1);
// er[i] = delta*p[i] + gamma1*s[i];
cublasDcopy (n, d_s, 1, d_er, 1);
cublasDscal (n, gamma1, d_er, 1);
cublasDaxpy (n, delta, d_p, 1, d_er, 1);
// x[i] += er[i];
cublasDaxpy (n, 1., d_er, 1, d_x, 1);
// lunew += r[i]*rs[i];
lunew = cublasDdot (n, d_r, 1, d_rs, 1);
if(lunew == 0){
printf("lunew = 0, reset to 1e-12\n");
lunew = 1.e-12; //added January 2012
}
if(gamma1 == 0){
printf("gamma1 = 0, reset to 1e-12\n");
gamma1 = 1.e-12; //added January 2012
}
beta = lunew*delta/(lu*gamma1);
lu = lunew;
// p[i] = r[i] + beta*(p[i]+gamma1*v[i]);
cublasDaxpy (n, gamma1, d_v, 1, d_p, 1);
cublasDscal (n, beta, d_p, 1);
cublasDaxpy (n, 1., d_r, 1, d_p, 1);
ierr = cublasIdamax (n, d_er, 1); //Find the maximum value in the array
cudaMemcpy(&err, d_er+ierr-1, nmem0, cudaMemcpyDeviceToHost);
kk++;
}
while(kk < itmax && fabs(err) > eps);
cudaMemcpy(h_x, d_x, nmem1, cudaMemcpyDeviceToHost);//bring back x for final result
for(i=0; i<n; i++) x[i+1] = h_x[i];
if(fabs(err) > eps) printf("*** Warning: linear solution using BICGSTB not converged, err = %e\n",err);
return err;
}