-
Notifications
You must be signed in to change notification settings - Fork 2
/
cudafunctions.py
209 lines (177 loc) · 8.17 KB
/
cudafunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 21 17:16:33 2018
@author: Ki Wai Chau
"""
import math
import numpy as np
from numba import float64, guvectorize, cuda
@guvectorize([(float64[:], float64, float64[:], float64[:, :], float64[:], float64[:])], '(n), (), (n), (n, m), (m)->(n)', target='cuda')
def cuda_step_euler(last, dt, drift, volatility, noise, res):
""" Approximate SDE in one time step with Euler scheme with cuda u-funcs"""
for i in range(last.shape[0]):
res[i] = last[i]
res[i] += drift[i] * dt
for j in range(volatility.shape[1]):
res[i] += volatility[i, j] * noise[j] * math.sqrt(dt)
@cuda.jit
def cuda_jit_step_euler(last, dt, drift, volatility, normdist):
""" Approximate SDE in one time step with Euler scheme with cuda jit"""
i = cuda.grid(1)
noise = normdist[i * volatility.shape[2]: (i + 1) * volatility.shape[2]]
if i < last.shape[0]:
for k in range(last.shape[1]):
last[i, k] += drift[i, k] * dt
for j in range(volatility.shape[2]):
last[i, k] += volatility[i, k, j] * math.sqrt(dt) * noise[j]
@cuda.jit(device=True)
def device_rsolv(a, n, d, b):
""" Solves Rx = b, based upon QR decomposition
For a linear equations system with upper tranagular matrix R, and constant term b,
this function calculate x = R^-1 * b inside GPU and return it in place of b. Notice
that the matrix R is seprate into 2 parts and store in the upper trangular part of a
and array d. This setting is used to save the memory usage of the whole QR algorithm.
:param a: This matrix contains the upper triangular matrix R minus the diagonal
Only the upper half of the matrix is used
:param n: The dimension of the matrix
:param d: The diagonal array for the upper trangular matrix R
:param b: The constant term for the linear system
:type a: numpy array
:type n: int
:type d: numpy array
:type b: numpy array
"""
temp = cuda.local.array(1, dtype=float64)
b[-1] /= d[-1]
for i in range(n-2, -1, -1):
temp[0] = 0
for j in range(i+1, n):
temp[0] += a[i, j] * b[j]
b[i] = (b[i] -temp[0])/d[i]
return b
@cuda.jit(device=True)
def device_qrsolv(a, m, n, c, d, b):
""" Solves Ax=b, based upon QR decomposition
For a linear equations system with QR decomposable matrix A, and constant term b,
this function calculate x = A^-1 * b inside GPU and return in in place of b. Notice
that an alternative matrix a, with the upper trangular part being the R part of A
(minus the diagonal) and the lower trangular part contains the orthogonal basis of Q.
:param a: This matrix contains the QR decomposition of A in condensed form
:param m: The row dimension of the matrix
:param n: The column dimension of the matrix
:param c: This vector contains v^Tv/2 for all v in Q.
:param d: The diagonal array for the upper trangular matrix R times scalling factor
:param b:The constant term for the linear system
:type a: numpy array
:type m: int
:type n: int
:type c: numpy array
:type d: numpy array
:type b: numpy array
"""
# Creat temporaory local storage
temp = cuda.local.array(2, dtype=float64) #(sum, type)
for j in range(n):
# Calculate sum = v_j^T*b
temp[0] = 0.
for i in range(j, m):
temp[0] += a[i, j] * b[i]
# Calculate beta * sum = (2/v_j^T*v_j) (v_j^T*b)
temp[1] = temp[0]/c[j]
# b = (I + beta v_j* v_j^T) * b
for i in range(j, m):
b[i] -= temp[1] * a[i,j]
device_rsolv(a, n, d, b)
return b
@cuda.jit(device=True)
def device_qrdcmp(a, m, n, c, d):
""" Proform QR decoposition
This function takes in a matrix a, and uses Householder reflections to perform QR
decomposition. It returns in place of a a mixed matrix with the upper half being
the R portion of the decoposition and the lower half the Q portion. The array d is
the diagonal element of R and c is a array of scaling factor.
:param a: This matrix for decomposition
:param m: The row dimension of the matrix
:param n: The column dimension of the matrix
:param c: This vector contains v^Tv/2 for all v in Q.
:param d: The diagonal array for the upper trangular matrix R times scalling factor
:type a: numpy array
:type m: int
:type n: int
:type c: numpy array
:type d: numpy array
"""
temp = cuda.local.array(4, dtype=float64) #(scale, sum, sigma, tau)
for k in range(n):
temp[0] = 0.
for i in range(k, m):
temp[0] = max(temp[0], math.fabs(a[i, k]))
if temp[0] == 0:
c[k] = d[k] = 0.0
else:
for i in range(k, m):
a[i, k] /= temp[0]
temp[1] = 0.0
for i in range(k, m):
temp[1] += a[i, k] ** 2
temp[2] = math.copysign(math.sqrt(temp[1]), a[k, k])
a[k, k] += temp[2]
c[k] = temp[2] * a[k, k]
d[k] = -temp[0] * temp[2]
for j in range(k+1, n):
temp[1] = 0.0
for i in range(k, m):
temp[1] += a[i, k] * a[i, j]
temp[3] = temp[1]/c[k]
for i in range(k, m):
a[i, j] -= temp[3] * a[i, k]
return a, c, d
@cuda.jit
def cuda_qrdcmp(bundles_num, a, m, n, c, d):
""" Proform QR decoposition in parallel with GPU"""
i = cuda.grid(1)
if i < bundles_num:
device_qrdcmp(a[i], m, n, c[i], d[i])
@cuda.jit
def cuda_qrsolv(bundle_num, index_num, a, m, n, c, d, b):
""" Solves Ax=b, based upon QR decomposition in parallel with GPU"""
i = cuda.grid(1)
if i < bundle_num*index_num:
device_qrsolv(a[int(i/index_num)], m, n, c[int(i/index_num)], d[int(i/index_num)], b[int(i/index_num), i%index_num])
def cuda_regression(basis_order, no_of_regression, c_bundles_num, bundle_range, sorted_regression_unknown, sorted_basis, regression_coeff):
"""Perform Regression in parallel for different bundle and regression target"""
blksz = 256
gridsz = int(math.ceil( c_bundles_num * no_of_regression / blksz))
regression_matrix = np.empty((c_bundles_num, basis_order, basis_order), dtype=np.double)
regression_vector = np.empty((c_bundles_num, no_of_regression, basis_order), dtype=np.double)
for b in range(c_bundles_num):
regression_matrix[b] = np.dot(np.transpose(sorted_basis[bundle_range[b, 0]: bundle_range[b, 1]]), \
sorted_basis[bundle_range[b, 0]: bundle_range[b, 1]])
for i in range(no_of_regression):
regression_vector[b, i] = np.dot(np.transpose(sorted_basis[bundle_range[b, 0]: bundle_range[b, 1]]),\
sorted_regression_unknown[bundle_range[b, 0]: bundle_range[b, 1], i])
c = cuda.device_array((c_bundles_num, basis_order), dtype=np.double)
d = cuda.device_array((c_bundles_num, basis_order), dtype=np.double)
d_regression_matrix = cuda.to_device(regression_matrix)
d_regression_vector = cuda.to_device(regression_vector)
cuda_qrdcmp[gridsz, blksz](c_bundles_num, d_regression_matrix, basis_order, basis_order, c, d)
cuda_qrsolv[gridsz, blksz](c_bundles_num, no_of_regression, d_regression_matrix, basis_order, basis_order, c, d, d_regression_vector)
regression_coeff[:, :, :] = d_regression_vector.copy_to_host()
return regression_coeff
def test():
"""Test the CUDA QR decomposition and solver"""
a = np.array([[[3., 5., 2.],[-1., 4., 2.],[1., 0., 1.]]])
c = cuda.device_array((2, a.shape[1]), dtype=np.double)
d = cuda.device_array((2, a.shape[1]), dtype=np.double)
b = np.array([[[5, 12., 1.], [-1., 3., 0.]]])
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
blksz = 256
gridsz = int(1024 / blksz)
cuda_qrdcmp[gridsz, blksz](1, d_a, a.shape[1], a.shape[2], c, d)
cuda_qrsolv[gridsz, blksz](1, 2, d_a, a.shape[1], a.shape[2], c, d, d_b)
b = d_b.copy_to_host()
print(b)
if __name__ == '__main__':
test()