Prototype cupy backend (#4952)

* Developing GPU CUPY backend * Don't import aligned!! * Adding cupy FFT to scheme * Don't understand why copy here?? * Get right memory pointer * Add CUPY scheme to CLI * Typo in scheme.py * Remove warning * Adding some more array stuff * The i is kind of important for a phase shift!! * Add device num support * Set device number * Add prototype spa_tmplt_cupy * Adding new parts of CUPY backend * Typo in scheme.py * Need to force numpy in PSD estimate * Add SPATmplt in waveform.py * Starting to understand RawKernel and CUDA * Remove print statement * Force numpy returns * Gareth's comments * Try to fix test suite * Refactor LAL constants * Probably need BBHx * Move cupy array decleration to when needed * Typo * Don't want time-reversed waveforms! * Add FIXME * Testing chisq optimization - DO NOT MERGE * Revert "Testing chisq optimization - DO NOT MERGE" This reverts commit 62a59e5. * Add deprecated warning
gwastro · Dec 3, 2024 · c06f60a · c06f60a
1 parent b60abb0
commit c06f60a
Show file tree

Hide file tree

Showing 14 changed files with 1,120 additions and 6 deletions.
diff --git a/pycbc/events/threshold_cupy.py b/pycbc/events/threshold_cupy.py
@@ -0,0 +1,259 @@
+# Copyright (C) 2012  Alex Nitz
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+
+#
+# =============================================================================
+#
+#                                   Preamble
+#
+# =============================================================================
+#
+
+import cupy as cp
+import functools
+import mako.template
+from .eventmgr import _BaseThresholdCluster
+import pycbc.scheme
+
+val = None
+loc = None
+
+# https://stackoverflow.com/questions/77798014/cupy-rawkernel-cuda-error-not-found-named-symbol-not-found-cupy
+
+tkernel1 = mako.template.Template("""
+extern "C" __global__ void threshold_and_cluster(float2* in, float2* outv, int* outl, int window, float threshold){
+    int s = window * blockIdx.x;
+    int e = s + window;
+
+    // shared memory for chuck size candidates
+    __shared__ float svr[${chunk}];
+    __shared__ float svi[${chunk}];
+    __shared__ int sl[${chunk}];
+
+    // shared memory for the warp size candidates
+    __shared__ float svv[32];
+    __shared__ int idx[32];
+
+    int ml = -1;
+    float mvr = 0;
+    float mvi = 0;
+    float re;
+    float im;
+
+    // Iterate trought the entire window size chunk and find blockDim.x number
+    // of candidates
+    for (int i = s + threadIdx.x; i < e; i += blockDim.x){
+        re = in[i].x;
+        im = in[i].y;
+        if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){
+            mvr = re;
+            mvi = im;
+            ml = i;
+        }
+    }
+
+    // Save the candidate from this thread to shared memory
+    svr[threadIdx.x] = mvr;
+    svi[threadIdx.x] = mvi;
+    sl[threadIdx.x] = ml;
+
+    __syncthreads();
+
+    if (threadIdx.x < 32){
+        int tl = threadIdx.x;
+
+        // Now that we have all the candiates for this chunk in shared memory
+        // Iterate through in the warp size to reduce to 32 candidates
+        for (int i = threadIdx.x; i < ${chunk}; i += 32){
+            re = svr[i];
+            im = svi[i];
+            if ((re * re + im * im) > (mvr * mvr + mvi * mvi)){
+                tl = i;
+                mvr = re;
+                mvi = im;
+            }
+        }
+
+        // Store the 32 candidates into shared memory
+        svv[threadIdx.x] = svr[tl] * svr[tl] + svi[tl] * svi[tl];
+        idx[threadIdx.x] = tl;
+
+        // Find the 1 candidate we are looking for using a manual log algorithm
+        if ((threadIdx.x < 16) && (svv[threadIdx.x] < svv[threadIdx.x + 16])){
+            svv[threadIdx.x] = svv[threadIdx.x + 16];
+            idx[threadIdx.x] = idx[threadIdx.x + 16];
+        }
+
+        if ((threadIdx.x < 8) && (svv[threadIdx.x] < svv[threadIdx.x + 8])){
+            svv[threadIdx.x] = svv[threadIdx.x + 8];
+            idx[threadIdx.x] = idx[threadIdx.x + 8];
+        }
+
+        if ((threadIdx.x < 4) && (svv[threadIdx.x] < svv[threadIdx.x + 4])){
+            svv[threadIdx.x] = svv[threadIdx.x + 4];
+            idx[threadIdx.x] = idx[threadIdx.x + 4];
+        }
+
+        if ((threadIdx.x < 2) && (svv[threadIdx.x] < svv[threadIdx.x + 2])){
+            svv[threadIdx.x] = svv[threadIdx.x + 2];
+            idx[threadIdx.x] = idx[threadIdx.x + 2];
+        }
+
+
+        // Save the 1 candidate maximum and location to the output vectors
+        if (threadIdx.x == 0){
+            if (svv[threadIdx.x] < svv[threadIdx.x + 1]){
+                idx[0] = idx[1];
+                svv[0] = svv[1];
+            }
+
+            if (svv[0] > threshold){
+                tl = idx[0];
+                outv[blockIdx.x].x = svr[tl];
+                outv[blockIdx.x].y = svi[tl];
+                outl[blockIdx.x] = sl[tl];
+            } else{
+                outl[blockIdx.x] = -1;
+            }
+        }
+    }
+}
+""")
+
+tkernel2 = mako.template.Template("""
+extern "C" __global__ void threshold_and_cluster2(float2* outv, int* outl, float threshold, int window){
+    __shared__ int loc[${blocks}];
+    __shared__ float val[${blocks}];
+
+    int i = threadIdx.x;
+
+    int l = outl[i];
+    loc[i] = l;
+
+    if (l == -1)
+        return;
+
+    val[i] = outv[i].x * outv[i].x + outv[i].y * outv[i].y;
+
+
+    // Check right
+    if ( (i < (${blocks} - 1)) && (val[i + 1] > val[i]) ){
+        outl[i] = -1;
+        return;
+    }
+
+    // Check left
+    if ( (i > 0) && (val[i - 1] > val[i]) ){
+        outl[i] = -1;
+        return;
+    }
+}
+""")
+
+@functools.lru_cache(maxsize=None)
+def get_tkernel(slen, window):
+    if window < 32:
+        raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples")
+
+    elif window <= 4096:
+        nt = 128
+    elif window <= 16384:
+        nt = 256
+    elif window <= 32768:
+        nt = 512
+    else:
+        nt = 1024
+
+    nb = int(cp.ceil(slen / float(window)))
+
+    if nb > 1024:
+        raise ValueError("More than 1024 blocks not supported yet")
+
+    fn = cp.RawKernel(
+        tkernel1.render(chunk=nt),
+        'threshold_and_cluster',
+        backend='nvcc'
+    )
+    fn2 = cp.RawKernel(
+        tkernel2.render(blocks=nb),
+        'threshold_and_cluster2',
+        backend='nvcc'
+    )
+    return (fn, fn2), nt, nb
+
+def threshold_and_cluster(series, threshold, window):
+    global val
+    global loc
+    if val is None:
+        val = cp.zeros(4096*256, dtype=cp.complex64)
+    if loc is None:
+        loc = cp.zeros(4096*256, cp.int32)
+
+    outl = loc
+    outv = val
+    slen = len(series)
+    series = series.data
+    (fn, fn2), nt, nb = get_tkernel(slen, window)
+    threshold = cp.float32(threshold * threshold)
+    window = cp.int32(window)
+
+    cl = loc[0:nb]
+    cv = val[0:nb]
+
+    fn((nb,), (nt,), (series.data, outv, outl, window, threshold))
+    fn2((1,), (nb,), (outv, outl, threshold, window))
+    w = (cl != -1)
+    return cv[w], cl[w]
+
+class CUDAThresholdCluster(_BaseThresholdCluster):
+    def __init__(self, series):
+        self.series = series
+
+        global val
+        global loc
+        if val is None:
+            val = cp.zeros(4096*256, dtype=cp.complex64)
+        if loc is None:
+            loc = cp.zeros(4096*256, cp.int32)
+
+        self.outl = loc
+        self.outv = val
+        self.slen = len(series)
+
+    def threshold_and_cluster(self, threshold, window):
+        threshold = cp.float32(threshold * threshold)
+        window = cp.int32(window)
+
+        (fn, fn2), nt, nb = get_tkernel(self.slen, window)
+        cl = loc[0:nb]
+        cv = val[0:nb]
+
+        fn(
+            (nt, 1, 1),
+            (nb, 1),
+            (self.series.data, self.outv, self.outl, window, threshold)
+        )
+        fn2(
+            (nb, 1, 1),
+            (1, 1),
+            (self.outv, self.outl, threshold, window)
+        )
+        w = (cl != -1)
+        return cp.asnumpy(cv[w]), cp.asnumpy(cl[w])
+
+def _threshold_cluster_factory(series):
+    return CUDAThresholdCluster
+
diff --git a/pycbc/fft/backend_cupy.py b/pycbc/fft/backend_cupy.py
@@ -0,0 +1,37 @@
+#  Copyright (C) 2024 Y Ddraig Goch
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with with program; see the file COPYING. If not, write to the
+#  Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+#  MA  02111-1307  USA
+
+from .core import _list_available
+
+_backend_dict = {'cupy' : 'cupyfft'}
+_backend_list = ['cupy']
+
+_alist, _adict = _list_available(_backend_list, _backend_dict)
+
+cupy_backend = None
+
+def set_backend(backend_list):
+    global cupy_backend
+    for backend in backend_list:
+        if backend in _alist:
+            cupy_backend = backend
+            break
+
+def get_backend():
+    return _adict[cupy_backend]
+
+set_backend(_backend_list)
diff --git a/pycbc/fft/backend_support.py b/pycbc/fft/backend_support.py
@@ -72,7 +72,7 @@ def get_backend():
 
 # Import all scheme-dependent backends, to get _all_backends accurate:
 
-for scheme_name in ["cpu", "mkl", "cuda"]:
+for scheme_name in ["cpu", "mkl", "cuda", "cupy"]:
     try:
         mod = __import__('pycbc.fft.backend_' + scheme_name, fromlist = ['_alist', '_adict'])
         _alist = getattr(mod, "_alist")

diff --git a/pycbc/fft/cupyfft.py b/pycbc/fft/cupyfft.py
@@ -0,0 +1,88 @@
+# Copyright (C) 2024 Y Ddraig Goch
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+#
+# =============================================================================
+#
+#                                   Preamble
+#
+# =============================================================================
+#
+"""
+This module provides the cupy backend of the fast Fourier transform
+for the PyCBC package.
+"""
+
+import logging
+import cupy.fft
+from .core import _check_fft_args
+from .core import _BaseFFT, _BaseIFFT
+
+_INV_FFT_MSG = ("I cannot perform an {} between data with an input type of "
+                "{} and an output type of {}")
+
+def fft(invec, outvec, _, itype, otype):
+    if invec.ptr == outvec.ptr:
+        raise NotImplementedError("cupy backend of pycbc.fft does not "
+                                  "support in-place transforms")
+    if itype == 'complex' and otype == 'complex':
+        outvec.data[:] = cupy.asarray(cupy.fft.fft(invec.data),
+                                       dtype=outvec.dtype)
+    elif itype == 'real' and otype == 'complex':
+        outvec.data[:] = cupy.asarray(cupy.fft.rfft(invec.data),
+                                       dtype=outvec.dtype)
+    else:
+        raise ValueError(_INV_FFT_MSG.format("FFT", itype, otype))
+
+
+def ifft(invec, outvec, _, itype, otype):
+    if invec.ptr == outvec.ptr:
+        raise NotImplementedError("cupy backend of pycbc.fft does not "
+                                  "support in-place transforms")
+    if itype == 'complex' and otype == 'complex':
+        outvec.data[:] = cupy.asarray(cupy.fft.ifft(invec.data),
+                                       dtype=outvec.dtype)
+        outvec *= len(outvec)
+    elif itype == 'complex' and otype == 'real':
+        outvec.data[:] = cupy.asarray(cupy.fft.irfft(invec.data,len(outvec)),
+                                       dtype=outvec.dtype)
+        outvec *= len(outvec)
+    else:
+        raise ValueError(_INV_FFT_MSG.format("IFFT", itype, otype))
+
+
+class FFT(_BaseFFT):
+    """
+    Class for performing FFTs via the cupy interface.
+    """
+    def __init__(self, invec, outvec, nbatch=1, size=None):
+        super(FFT, self).__init__(invec, outvec, nbatch, size)
+        self.prec, self.itype, self.otype = _check_fft_args(invec, outvec)
+
+    def execute(self):
+        fft(self.invec, self.outvec, self.prec, self.itype, self.otype)
+
+
+class IFFT(_BaseIFFT):
+    """
+    Class for performing IFFTs via the cupy interface.
+    """
+    def __init__(self, invec, outvec, nbatch=1, size=None):
+        super(IFFT, self).__init__(invec, outvec, nbatch, size)
+        self.prec, self.itype, self.otype = _check_fft_args(invec, outvec)
+
+    def execute(self):
+        ifft(self.invec, self.outvec, self.prec, self.itype, self.otype)