-
Notifications
You must be signed in to change notification settings - Fork 356
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Developing GPU CUPY backend * Don't import aligned!! * Adding cupy FFT to scheme * Don't understand why copy here?? * Get right memory pointer * Add CUPY scheme to CLI * Typo in scheme.py * Remove warning * Adding some more array stuff * The i is kind of important for a phase shift!! * Add device num support * Set device number * Add prototype spa_tmplt_cupy * Adding new parts of CUPY backend * Typo in scheme.py * Need to force numpy in PSD estimate * Add SPATmplt in waveform.py * Starting to understand RawKernel and CUDA * Remove print statement * Force numpy returns * Gareth's comments * Try to fix test suite * Refactor LAL constants * Probably need BBHx * Move cupy array decleration to when needed * Typo * Don't want time-reversed waveforms! * Add FIXME * Testing chisq optimization - DO NOT MERGE * Revert "Testing chisq optimization - DO NOT MERGE" This reverts commit 62a59e5. * Add deprecated warning
- Loading branch information
Showing
14 changed files
with
1,120 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,259 @@ | ||
# Copyright (C) 2012 Alex Nitz | ||
# This program is free software; you can redistribute it and/or modify it | ||
# under the terms of the GNU General Public License as published by the | ||
# Free Software Foundation; either version 3 of the License, or (at your | ||
# option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, but | ||
# WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General | ||
# Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License along | ||
# with this program; if not, write to the Free Software Foundation, Inc., | ||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
|
||
|
||
# | ||
# ============================================================================= | ||
# | ||
# Preamble | ||
# | ||
# ============================================================================= | ||
# | ||
|
||
import cupy as cp | ||
import functools | ||
import mako.template | ||
from .eventmgr import _BaseThresholdCluster | ||
import pycbc.scheme | ||
|
||
val = None | ||
loc = None | ||
|
||
# https://stackoverflow.com/questions/77798014/cupy-rawkernel-cuda-error-not-found-named-symbol-not-found-cupy | ||
|
||
tkernel1 = mako.template.Template(""" | ||
extern "C" __global__ void threshold_and_cluster(float2* in, float2* outv, int* outl, int window, float threshold){ | ||
int s = window * blockIdx.x; | ||
int e = s + window; | ||
// shared memory for chuck size candidates | ||
__shared__ float svr[${chunk}]; | ||
__shared__ float svi[${chunk}]; | ||
__shared__ int sl[${chunk}]; | ||
// shared memory for the warp size candidates | ||
__shared__ float svv[32]; | ||
__shared__ int idx[32]; | ||
int ml = -1; | ||
float mvr = 0; | ||
float mvi = 0; | ||
float re; | ||
float im; | ||
// Iterate trought the entire window size chunk and find blockDim.x number | ||
// of candidates | ||
for (int i = s + threadIdx.x; i < e; i += blockDim.x){ | ||
re = in[i].x; | ||
im = in[i].y; | ||
if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){ | ||
mvr = re; | ||
mvi = im; | ||
ml = i; | ||
} | ||
} | ||
// Save the candidate from this thread to shared memory | ||
svr[threadIdx.x] = mvr; | ||
svi[threadIdx.x] = mvi; | ||
sl[threadIdx.x] = ml; | ||
__syncthreads(); | ||
if (threadIdx.x < 32){ | ||
int tl = threadIdx.x; | ||
// Now that we have all the candiates for this chunk in shared memory | ||
// Iterate through in the warp size to reduce to 32 candidates | ||
for (int i = threadIdx.x; i < ${chunk}; i += 32){ | ||
re = svr[i]; | ||
im = svi[i]; | ||
if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){ | ||
tl = i; | ||
mvr = re; | ||
mvi = im; | ||
} | ||
} | ||
// Store the 32 candidates into shared memory | ||
svv[threadIdx.x] = svr[tl] * svr[tl] + svi[tl] * svi[tl]; | ||
idx[threadIdx.x] = tl; | ||
// Find the 1 candidate we are looking for using a manual log algorithm | ||
if ((threadIdx.x < 16) && (svv[threadIdx.x] < svv[threadIdx.x + 16])){ | ||
svv[threadIdx.x] = svv[threadIdx.x + 16]; | ||
idx[threadIdx.x] = idx[threadIdx.x + 16]; | ||
} | ||
if ((threadIdx.x < 8) && (svv[threadIdx.x] < svv[threadIdx.x + 8])){ | ||
svv[threadIdx.x] = svv[threadIdx.x + 8]; | ||
idx[threadIdx.x] = idx[threadIdx.x + 8]; | ||
} | ||
if ((threadIdx.x < 4) && (svv[threadIdx.x] < svv[threadIdx.x + 4])){ | ||
svv[threadIdx.x] = svv[threadIdx.x + 4]; | ||
idx[threadIdx.x] = idx[threadIdx.x + 4]; | ||
} | ||
if ((threadIdx.x < 2) && (svv[threadIdx.x] < svv[threadIdx.x + 2])){ | ||
svv[threadIdx.x] = svv[threadIdx.x + 2]; | ||
idx[threadIdx.x] = idx[threadIdx.x + 2]; | ||
} | ||
// Save the 1 candidate maximum and location to the output vectors | ||
if (threadIdx.x == 0){ | ||
if (svv[threadIdx.x] < svv[threadIdx.x + 1]){ | ||
idx[0] = idx[1]; | ||
svv[0] = svv[1]; | ||
} | ||
if (svv[0] > threshold){ | ||
tl = idx[0]; | ||
outv[blockIdx.x].x = svr[tl]; | ||
outv[blockIdx.x].y = svi[tl]; | ||
outl[blockIdx.x] = sl[tl]; | ||
} else{ | ||
outl[blockIdx.x] = -1; | ||
} | ||
} | ||
} | ||
} | ||
""") | ||
|
||
tkernel2 = mako.template.Template(""" | ||
extern "C" __global__ void threshold_and_cluster2(float2* outv, int* outl, float threshold, int window){ | ||
__shared__ int loc[${blocks}]; | ||
__shared__ float val[${blocks}]; | ||
int i = threadIdx.x; | ||
int l = outl[i]; | ||
loc[i] = l; | ||
if (l == -1) | ||
return; | ||
val[i] = outv[i].x * outv[i].x + outv[i].y * outv[i].y; | ||
// Check right | ||
if ( (i < (${blocks} - 1)) && (val[i + 1] > val[i]) ){ | ||
outl[i] = -1; | ||
return; | ||
} | ||
// Check left | ||
if ( (i > 0) && (val[i - 1] > val[i]) ){ | ||
outl[i] = -1; | ||
return; | ||
} | ||
} | ||
""") | ||
|
||
@functools.lru_cache(maxsize=None) | ||
def get_tkernel(slen, window): | ||
if window < 32: | ||
raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples") | ||
|
||
elif window <= 4096: | ||
nt = 128 | ||
elif window <= 16384: | ||
nt = 256 | ||
elif window <= 32768: | ||
nt = 512 | ||
else: | ||
nt = 1024 | ||
|
||
nb = int(cp.ceil(slen / float(window))) | ||
|
||
if nb > 1024: | ||
raise ValueError("More than 1024 blocks not supported yet") | ||
|
||
fn = cp.RawKernel( | ||
tkernel1.render(chunk=nt), | ||
'threshold_and_cluster', | ||
backend='nvcc' | ||
) | ||
fn2 = cp.RawKernel( | ||
tkernel2.render(blocks=nb), | ||
'threshold_and_cluster2', | ||
backend='nvcc' | ||
) | ||
return (fn, fn2), nt, nb | ||
|
||
def threshold_and_cluster(series, threshold, window): | ||
global val | ||
global loc | ||
if val is None: | ||
val = cp.zeros(4096*256, dtype=cp.complex64) | ||
if loc is None: | ||
loc = cp.zeros(4096*256, cp.int32) | ||
|
||
outl = loc | ||
outv = val | ||
slen = len(series) | ||
series = series.data | ||
(fn, fn2), nt, nb = get_tkernel(slen, window) | ||
threshold = cp.float32(threshold * threshold) | ||
window = cp.int32(window) | ||
|
||
cl = loc[0:nb] | ||
cv = val[0:nb] | ||
|
||
fn((nb,), (nt,), (series.data, outv, outl, window, threshold)) | ||
fn2((1,), (nb,), (outv, outl, threshold, window)) | ||
w = (cl != -1) | ||
return cv[w], cl[w] | ||
|
||
class CUDAThresholdCluster(_BaseThresholdCluster): | ||
def __init__(self, series): | ||
self.series = series | ||
|
||
global val | ||
global loc | ||
if val is None: | ||
val = cp.zeros(4096*256, dtype=cp.complex64) | ||
if loc is None: | ||
loc = cp.zeros(4096*256, cp.int32) | ||
|
||
self.outl = loc | ||
self.outv = val | ||
self.slen = len(series) | ||
|
||
def threshold_and_cluster(self, threshold, window): | ||
threshold = cp.float32(threshold * threshold) | ||
window = cp.int32(window) | ||
|
||
(fn, fn2), nt, nb = get_tkernel(self.slen, window) | ||
cl = loc[0:nb] | ||
cv = val[0:nb] | ||
|
||
fn( | ||
(nt, 1, 1), | ||
(nb, 1), | ||
(self.series.data, self.outv, self.outl, window, threshold) | ||
) | ||
fn2( | ||
(nb, 1, 1), | ||
(1, 1), | ||
(self.outv, self.outl, threshold, window) | ||
) | ||
w = (cl != -1) | ||
return cp.asnumpy(cv[w]), cp.asnumpy(cl[w]) | ||
|
||
def _threshold_cluster_factory(series): | ||
return CUDAThresholdCluster | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Copyright (C) 2024 Y Ddraig Goch | ||
# | ||
# This program is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation; either version 2 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with with program; see the file COPYING. If not, write to the | ||
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, | ||
# MA 02111-1307 USA | ||
|
||
from .core import _list_available | ||
|
||
_backend_dict = {'cupy' : 'cupyfft'} | ||
_backend_list = ['cupy'] | ||
|
||
_alist, _adict = _list_available(_backend_list, _backend_dict) | ||
|
||
cupy_backend = None | ||
|
||
def set_backend(backend_list): | ||
global cupy_backend | ||
for backend in backend_list: | ||
if backend in _alist: | ||
cupy_backend = backend | ||
break | ||
|
||
def get_backend(): | ||
return _adict[cupy_backend] | ||
|
||
set_backend(_backend_list) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
# Copyright (C) 2024 Y Ddraig Goch | ||
# | ||
# This program is free software; you can redistribute it and/or modify it | ||
# under the terms of the GNU General Public License as published by the | ||
# Free Software Foundation; either version 3 of the License, or (at your | ||
# option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, but | ||
# WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General | ||
# Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License along | ||
# with this program; if not, write to the Free Software Foundation, Inc., | ||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
|
||
# | ||
# ============================================================================= | ||
# | ||
# Preamble | ||
# | ||
# ============================================================================= | ||
# | ||
""" | ||
This module provides the cupy backend of the fast Fourier transform | ||
for the PyCBC package. | ||
""" | ||
|
||
import logging | ||
import cupy.fft | ||
from .core import _check_fft_args | ||
from .core import _BaseFFT, _BaseIFFT | ||
|
||
_INV_FFT_MSG = ("I cannot perform an {} between data with an input type of " | ||
"{} and an output type of {}") | ||
|
||
def fft(invec, outvec, _, itype, otype): | ||
if invec.ptr == outvec.ptr: | ||
raise NotImplementedError("cupy backend of pycbc.fft does not " | ||
"support in-place transforms") | ||
if itype == 'complex' and otype == 'complex': | ||
outvec.data[:] = cupy.asarray(cupy.fft.fft(invec.data), | ||
dtype=outvec.dtype) | ||
elif itype == 'real' and otype == 'complex': | ||
outvec.data[:] = cupy.asarray(cupy.fft.rfft(invec.data), | ||
dtype=outvec.dtype) | ||
else: | ||
raise ValueError(_INV_FFT_MSG.format("FFT", itype, otype)) | ||
|
||
|
||
def ifft(invec, outvec, _, itype, otype): | ||
if invec.ptr == outvec.ptr: | ||
raise NotImplementedError("cupy backend of pycbc.fft does not " | ||
"support in-place transforms") | ||
if itype == 'complex' and otype == 'complex': | ||
outvec.data[:] = cupy.asarray(cupy.fft.ifft(invec.data), | ||
dtype=outvec.dtype) | ||
outvec *= len(outvec) | ||
elif itype == 'complex' and otype == 'real': | ||
outvec.data[:] = cupy.asarray(cupy.fft.irfft(invec.data,len(outvec)), | ||
dtype=outvec.dtype) | ||
outvec *= len(outvec) | ||
else: | ||
raise ValueError(_INV_FFT_MSG.format("IFFT", itype, otype)) | ||
|
||
|
||
class FFT(_BaseFFT): | ||
""" | ||
Class for performing FFTs via the cupy interface. | ||
""" | ||
def __init__(self, invec, outvec, nbatch=1, size=None): | ||
super(FFT, self).__init__(invec, outvec, nbatch, size) | ||
self.prec, self.itype, self.otype = _check_fft_args(invec, outvec) | ||
|
||
def execute(self): | ||
fft(self.invec, self.outvec, self.prec, self.itype, self.otype) | ||
|
||
|
||
class IFFT(_BaseIFFT): | ||
""" | ||
Class for performing IFFTs via the cupy interface. | ||
""" | ||
def __init__(self, invec, outvec, nbatch=1, size=None): | ||
super(IFFT, self).__init__(invec, outvec, nbatch, size) | ||
self.prec, self.itype, self.otype = _check_fft_args(invec, outvec) | ||
|
||
def execute(self): | ||
ifft(self.invec, self.outvec, self.prec, self.itype, self.otype) |
Oops, something went wrong.