diff --git a/few/amplitude/ampinterp2d.py b/few/amplitude/ampinterp2d.py index 52b3c443..455a21cd 100644 --- a/few/amplitude/ampinterp2d.py +++ b/few/amplitude/ampinterp2d.py @@ -28,7 +28,11 @@ # Cython/C++ imports # Python imports -from ..utils.baseclasses import SchwarzschildEccentric, ParallelModuleBase, KerrEccentricEquatorial +from ..utils.baseclasses import ( + SchwarzschildEccentric, + ParallelModuleBase, + KerrEccentricEquatorial, +) from .base import AmplitudeBase from ..utils.utility import check_for_file_download from ..utils.citations import * @@ -37,20 +41,21 @@ # check for cupy and GPU version of pymatmul try: # Cython/C++ imports - from pyAmpInterp2D import interp2D as interp2D_gpu + from ..cutils.pyAmpInterp2D import interp2D as interp2D_gpu + # Python imports import cupy as cp except (ImportError, ModuleNotFoundError) as e: import numpy as np -from pyAmpInterp2D_cpu import interp2D as interp2D_cpu +from ..cutils.pyAmpInterp2D_cpu import interp2D as interp2D_cpu # get path to this file dir_path = os.path.dirname(os.path.realpath(__file__)) -#TODO: handle multiple waveform models +# TODO: handle multiple waveform models _DEFAULT_SPINS = [ -0.99, -0.95, @@ -63,7 +68,7 @@ -0.3, -0.2, -0.1, - 0., + 0.0, 0.1, 0.2, 0.3, @@ -74,13 +79,14 @@ 0.8, 0.9, 0.95, - 0.99 + 0.99, ] _DEFAULT_AMPLITUDE_FILENAMES = [ f"KerrEqEccAmpCoeffs_a{spin:.3f}.h5" for spin in _DEFAULT_SPINS ] + class AmpInterp2D(AmplitudeBase, ParallelModuleBase): """Calculate Teukolsky amplitudes with a ROMAN. @@ -168,14 +174,14 @@ def __init__(self, fp, l_arr, m_arr, n_arr, file_directory=None, **kwargs): self.file_dir = dir_path + "/../../few/files/" else: self.file_dir = file_directory - + # check if user has the necessary data # if not, the data will automatically download check_for_file_download(fp, self.file_dir) - + mystery_file = h5py.File(os.path.join(self.file_dir, fp)) try: - is_coeffs = mystery_file.attrs['is_coefficients'] + is_coeffs = mystery_file.attrs["is_coefficients"] except KeyError: is_coeffs = False @@ -183,30 +189,40 @@ def __init__(self, fp, l_arr, m_arr, n_arr, file_directory=None, **kwargs): coefficients = mystery_file else: print(fp, "is not a spline coefficients file. 
Attempting to convert...") - spline_fp = _spline_coefficients_to_file(fp, self.l_arr, self.m_arr, self.n_arr, file_directory=self.file_dir) + spline_fp = _spline_coefficients_to_file( + fp, self.l_arr, self.m_arr, self.n_arr, file_directory=self.file_dir + ) coefficients = h5py.File(os.path.join(self.file_dir, spline_fp)) - self.a_val_store = coefficients.attrs['signed_spin'] + self.a_val_store = coefficients.attrs["signed_spin"] - self.num_teuk_modes = coefficients.attrs['num_teuk_modes'] + self.num_teuk_modes = coefficients.attrs["num_teuk_modes"] self.tck = [ - self.xp.asarray(coefficients['x1']), - self.xp.asarray(coefficients['x2']), - self.xp.asarray(coefficients['c']) + self.xp.asarray(coefficients["x1"]), + self.xp.asarray(coefficients["x2"]), + self.xp.asarray(coefficients["c"]), ] - self.degrees = coefficients.attrs['spline_degree_x'], coefficients.attrs['spline_degree_y'] - self.len_indiv_c = coefficients.attrs['points_per_modegrid'] + self.degrees = ( + coefficients.attrs["spline_degree_x"], + coefficients.attrs["spline_degree_y"], + ) + self.len_indiv_c = coefficients.attrs["points_per_modegrid"] @property def interp2D(self) -> callable: """GPU or CPU interp2D""" interp2D = interp2D_cpu if not self.use_gpu else interp2D_gpu return interp2D - + @property def citation(self): """Return citations for this module""" - return romannet_citation + larger_few_citation + few_citation + few_software_citation + return ( + romannet_citation + + larger_few_citation + + few_citation + + few_software_citation + ) @property def gpu_capability(self): @@ -228,13 +244,18 @@ def __call__(self, a, p, e, xI, *args, specific_modes=None, **kwargs): sorted to increasing order. Note that the axis ordering is inverted relative to the output of meshgrid. - + """ grid = False try: - a_cpu, p_cpu, e_cpu, xI_cpu = a.get().copy(), p.get().copy(), e.get().copy(), xI.get().copy() + a_cpu, p_cpu, e_cpu, xI_cpu = ( + a.get().copy(), + p.get().copy(), + e.get().copy(), + xI.get().copy(), + ) except AttributeError: a_cpu, p_cpu, e_cpu, xI_cpu = a.copy(), p.copy(), e.copy(), xI.copy() @@ -252,7 +273,7 @@ def __call__(self, a, p, e, xI, *args, specific_modes=None, **kwargs): tw, tu, c = self.tck[:3] kw, ku = self.degrees - + # standard Numpy broadcasting if w.shape != u.shape: w, u = np.broadcast_arrays(w, u) @@ -275,36 +296,47 @@ def __call__(self, a, p, e, xI, *args, specific_modes=None, **kwargs): if specific_modes is None: mode_indexes = self.xp.arange(self.num_teuk_modes) - + else: if isinstance(specific_modes, self.xp.ndarray): mode_indexes = specific_modes - elif isinstance(specific_modes, list): # the following is slow and kills efficiency + elif isinstance( + specific_modes, list + ): # the following is slow and kills efficiency mode_indexes = self.xp.zeros(len(specific_modes), dtype=self.xp.int32) for i, (l, m, n) in enumerate(specific_modes): try: - mode_indexes[i] = np.where((self.l_arr == l) & (self.m_arr == abs(m)) & (self.n_arr == n))[0] + mode_indexes[i] = np.where( + (self.l_arr == l) + & (self.m_arr == abs(m)) + & (self.n_arr == n) + )[0] except: raise Exception(f"Could not find mode index ({l},{m},{n}).") # TODO: perform this in the kernel c_in = c[mode_indexes].flatten() - num_indiv_c = 2*len(mode_indexes) # Re and Im + num_indiv_c = 2 * len(mode_indexes) # Re and Im len_indiv_c = self.len_indiv_c z = self.xp.zeros((num_indiv_c * mw)) - - self.interp2D(z, tw, nw, tu, nu, c_in, kw, ku, w, mw, u, mu, num_indiv_c, len_indiv_c) - #check = np.asarray([[spl.ev(e.get(), y.get()) for spl in spl1] for spl1 
in self.spl2D.values()]).transpose(2, 1, 0) + self.interp2D( + z, tw, nw, tu, nu, c_in, kw, ku, w, mw, u, mu, num_indiv_c, len_indiv_c + ) + + # check = np.asarray([[spl.ev(e.get(), y.get()) for spl in spl1] for spl1 in self.spl2D.values()]).transpose(2, 1, 0) - z = z.reshape(num_indiv_c//2, 2, mw).transpose(2, 1, 0) + z = z.reshape(num_indiv_c // 2, 2, mw).transpose(2, 1, 0) z = z[:, 0] + 1j * z[:, 1] return z - + def __reduce__(self): - return (self.__class__, (self.fp, self.l_arr, self.m_arr, self.n_arr, self.file_dir)) + return ( + self.__class__, + (self.fp, self.l_arr, self.m_arr, self.n_arr, self.file_dir), + ) class AmpInterpKerrEqEcc(AmplitudeBase, KerrEccentricEquatorial, ParallelModuleBase): @@ -324,24 +356,43 @@ def __init__(self, file_directory=None, filenames=None, **kwargs): else: self.filenames = filenames - self.spin_information_holder_unsorted = [None for _ in range(len(self.filenames))] + self.spin_information_holder_unsorted = [ + None for _ in range(len(self.filenames)) + ] for i, fp in enumerate(self.filenames): - self.spin_information_holder_unsorted[i] = AmpInterp2D(fp, self.l_arr, self.m_arr, self.n_arr, file_directory=self.file_dir, use_gpu=self.use_gpu) - - spin_values_unsorted = [sh.a_val_store for sh in self.spin_information_holder_unsorted] + self.spin_information_holder_unsorted[i] = AmpInterp2D( + fp, + self.l_arr, + self.m_arr, + self.n_arr, + file_directory=self.file_dir, + use_gpu=self.use_gpu, + ) + + spin_values_unsorted = [ + sh.a_val_store for sh in self.spin_information_holder_unsorted + ] rearrange_inds = np.argsort(spin_values_unsorted) self.spin_values = np.asarray(spin_values_unsorted)[rearrange_inds] - self.spin_information_holder = [self.spin_information_holder_unsorted[i] for i in rearrange_inds] - + self.spin_information_holder = [ + self.spin_information_holder_unsorted[i] for i in rearrange_inds + ] + pos_neg_n_swap_inds = [] if self.use_gpu: - for l,m,n in zip(self.l_arr_no_mask.get(),self.m_arr_no_mask.get(),self.n_arr_no_mask.get()): - pos_neg_n_swap_inds.append(self.special_index_map[(l,m,-n)]) + for l, m, n in zip( + self.l_arr_no_mask.get(), + self.m_arr_no_mask.get(), + self.n_arr_no_mask.get(), + ): + pos_neg_n_swap_inds.append(self.special_index_map[(l, m, -n)]) else: - for l,m,n in zip(self.l_arr_no_mask,self.m_arr_no_mask,self.n_arr_no_mask): - pos_neg_n_swap_inds.append(self.special_index_map[(l,m,-n)]) - + for l, m, n in zip( + self.l_arr_no_mask, self.m_arr_no_mask, self.n_arr_no_mask + ): + pos_neg_n_swap_inds.append(self.special_index_map[(l, m, -n)]) + self.pos_neg_n_swap_inds = self.xp.asarray(pos_neg_n_swap_inds) def get_amplitudes(self, a, p, e, xI, specific_modes=None): @@ -350,18 +401,22 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): # retrograde: spin pos, xI neg - > spin neg, xI pos assert isinstance(a, float) - assert np.all(xI == 1.0) or np.all(xI == -1.0) # either all prograde or all retrograde - xI_in = np.ones_like(p)*xI - + assert np.all(xI == 1.0) or np.all( + xI == -1.0 + ) # either all prograde or all retrograde + xI_in = np.ones_like(p) * xI + signed_spin = a * xI_in[0].item() if signed_spin in self.spin_values: ind_1 = np.where(self.spin_values == signed_spin)[0][0] a_in = np.full_like(p, signed_spin) - z = self.spin_information_holder[ind_1](a_in, p, e, xI_in, specific_modes=specific_modes) - if xI_in[0] == -1 and signed_spin != 0.: # retrograde needs mode flip - z = self.xp.conj(z[:,self.pos_neg_n_swap_inds]) + z = self.spin_information_holder[ind_1]( + a_in, p, e, xI_in, 
specific_modes=specific_modes + ) + if xI_in[0] == -1 and signed_spin != 0.0: # retrograde needs mode flip + z = self.xp.conj(z[:, self.pos_neg_n_swap_inds]) else: ind_above = np.where(self.spin_values > signed_spin)[0][0] @@ -387,8 +442,8 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): specific_modes_below = self.pos_neg_n_swap_inds[specific_modes] elif isinstance(specific_modes, list): specific_modes_below = [] - for (l, m, n) in specific_modes: - specific_modes_below.append((l,m,-n)) + for l, m, n in specific_modes: + specific_modes_below.append((l, m, -n)) else: apply_conjugate_below = False specific_modes_below = specific_modes @@ -399,27 +454,35 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): else: apply_conjugate_above = False specific_modes_above = specific_modes - - if apply_conjugate_above and apply_conjugate_below: # combine the flags to save a conj call if both retrograde + + if ( + apply_conjugate_above and apply_conjugate_below + ): # combine the flags to save a conj call if both retrograde apply_conjugate_total = True apply_conjugate_above = False apply_conjugate_below = False else: apply_conjugate_total = False - z_above = self.spin_information_holder[ind_above](a_above, p, e, xI_in, specific_modes=specific_modes_above) - z_below = self.spin_information_holder[ind_below](a_below, p, e, xI_in, specific_modes=specific_modes_below) + z_above = self.spin_information_holder[ind_above]( + a_above, p, e, xI_in, specific_modes=specific_modes_above + ) + z_below = self.spin_information_holder[ind_below]( + a_below, p, e, xI_in, specific_modes=specific_modes_below + ) if apply_conjugate_below: z_below = z_below.conj() if apply_conjugate_above: z_above = z_above.conj() - z = ((z_above - z_below) / (a_above_single - a_below_single)) * (signed_spin - a_below_single) + z_below + z = ((z_above - z_below) / (a_above_single - a_below_single)) * ( + signed_spin - a_below_single + ) + z_below if apply_conjugate_total: z = z.conj() if not isinstance(specific_modes, list): return z - + # dict containing requested modes else: temp = {} @@ -436,8 +499,9 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): class AmpInterpSchwarzEcc(AmplitudeBase, SchwarzschildEccentric, ParallelModuleBase): """ - A legacy class for compatibility with the old Schwarzschild waveform structure. + A legacy class for compatibility with the old Schwarzschild waveform structure. 
""" + def __init__(self, file_directory=None, filenames=None, **kwargs): ParallelModuleBase.__init__(self, **kwargs) @@ -453,13 +517,13 @@ def __init__(self, file_directory=None, filenames=None, **kwargs): self.filename = "Teuk_amps_a0.0_lmax_10_nmax_30_new.h5" else: if isinstance(filenames, list): - assert (len(filenames) == 1) + assert len(filenames) == 1 self.filename = filenames - + # check if user has the necessary data # if not, the data will automatically download check_for_file_download(self.filename, self.file_dir) - + data = {} with h5py.File(os.path.join(self.file_dir, self.filename), "r") as f: # load attributes in the right order for correct mode sorting later @@ -469,22 +533,22 @@ def __init__(self, file_directory=None, filenames=None, **kwargs): grid = f["grid"][:] for l, m, n in zip(self.l_arr, self.m_arr, self.n_arr): if m >= 0: - key1 = format_string1.format(l,m) + key1 = format_string1.format(l, m) key2 = format_string2.format(n) - tmp = f[key1+'/'+key2][:] + tmp = f[key1 + "/" + key2][:] tmp2 = tmp[:, 0] + 1j * tmp[:, 1] - data[savestring.format(l,m,n)] = tmp2.T + data[savestring.format(l, m, n)] = tmp2.T # create the coefficients file # adjust the grid p = grid.T[1].copy() e = grid.T[2].copy() - u = np.round(p_to_y(p, e, use_gpu=False),8) + u = np.round(p_to_y(p, e, use_gpu=False), 8) w = e.copy() grid_size = p.shape[0] - + unique_u = np.unique(u) unique_w = np.unique(w) num_u = len(unique_u) @@ -498,12 +562,13 @@ def __init__(self, file_directory=None, filenames=None, **kwargs): data = {name: val[:, ::-1] for name, val in data.items()} - spl2D = {name: - [ - RectBivariateSpline(unique_w, unique_u, val.real, kx=3, ky=3), - RectBivariateSpline(unique_w, unique_u, val.imag, kx=3, ky=3) + spl2D = { + name: [ + RectBivariateSpline(unique_w, unique_u, val.real, kx=3, ky=3), + RectBivariateSpline(unique_w, unique_u, val.imag, kx=3, ky=3), ] - for name, val in data.items()} + for name, val in data.items() + } mode_keys = list(data.keys()) num_teuk_modes = len(mode_keys) @@ -514,18 +579,18 @@ def __init__(self, file_directory=None, filenames=None, **kwargs): for i, mode in enumerate(mode_keys): tck_last_entry[i, 0] = spl2D[mode][0].tck[2] tck_last_entry[i, 1] = spl2D[mode][1].tck[2] - + self.tck = [ self.xp.asarray(example_spl.tck[0]), self.xp.asarray(example_spl.tck[1]), - self.xp.asarray(tck_last_entry.copy()) + self.xp.asarray(tck_last_entry.copy()), ] - + self.num_teuk_modes = num_teuk_modes self.degrees = example_spl.degrees self.len_indiv_c = tck_last_entry.shape[-1] - + @property def interp2D(self) -> callable: """GPU or CPU interp2D""" @@ -534,10 +599,10 @@ def interp2D(self) -> callable: def get_amplitudes(self, a, p, e, xI, specific_modes=None): - assert (a == 0.) 
+ assert a == 0.0 assert np.all(xI == 1.0) - + try: p_cpu, e_cpu = p.get().copy(), e.get().copy() except AttributeError: @@ -553,7 +618,7 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): tw, tu, c = self.tck[:3] kw, ku = self.degrees - + # standard Numpy broadcasting if w.shape != u.shape: w, u = np.broadcast_arrays(w, u) @@ -576,35 +641,43 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): if specific_modes is None: mode_indexes = self.xp.arange(self.num_teuk_modes) - + else: if isinstance(specific_modes, self.xp.ndarray): mode_indexes = specific_modes - elif isinstance(specific_modes, list): # the following is slow and kills efficiency + elif isinstance( + specific_modes, list + ): # the following is slow and kills efficiency mode_indexes = self.xp.zeros(len(specific_modes), dtype=self.xp.int32) for i, (l, m, n) in enumerate(specific_modes): try: - mode_indexes[i] = np.where((self.l_arr == l) & (self.m_arr == abs(m)) & (self.n_arr == n))[0] + mode_indexes[i] = np.where( + (self.l_arr == l) + & (self.m_arr == abs(m)) + & (self.n_arr == n) + )[0] except: raise Exception(f"Could not find mode index ({l},{m},{n}).") # TODO: perform this in the kernel c_in = c[mode_indexes].flatten() - num_indiv_c = 2*len(mode_indexes) # Re and Im + num_indiv_c = 2 * len(mode_indexes) # Re and Im len_indiv_c = self.len_indiv_c z = self.xp.zeros((num_indiv_c * mw)) - - self.interp2D(z, tw, nw, tu, nu, c_in, kw, ku, w, mw, u, mu, num_indiv_c, len_indiv_c) - z = z.reshape(num_indiv_c//2, 2, mw).transpose(2, 1, 0) + self.interp2D( + z, tw, nw, tu, nu, c_in, kw, ku, w, mw, u, mu, num_indiv_c, len_indiv_c + ) + + z = z.reshape(num_indiv_c // 2, 2, mw).transpose(2, 1, 0) z = z[:, 0] + 1j * z[:, 1] if not isinstance(specific_modes, list): return z - + # dict containing requested modes else: temp = {} @@ -621,6 +694,7 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): def __reduce__(self): return (self.__class__, (self.file_dir, self.filename)) + def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): data = {} # get information about this specific model from the file @@ -630,7 +704,7 @@ def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): grid = f["grid"][:] for l, m, n in zip(l_arr, m_arr, n_arr): if m >= 0: - key1 = kerr_format_string.format(l,m,n) + key1 = kerr_format_string.format(l, m, n) tmp = f[key1][:] tmp2 = tmp[:, 0] + 1j * tmp[:, 1] data[key1] = tmp2 @@ -642,7 +716,7 @@ def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): p = grid.T[1].copy() e = grid.T[2].copy() xI = grid.T[3].copy() - u = np.round(grid.T[4].copy(),8) # fix rounding errors in the files + u = np.round(grid.T[4].copy(), 8) # fix rounding errors in the files sep = grid.T[5].copy() w = grid.T[6].copy() @@ -652,14 +726,14 @@ def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): # retrograde needs sign flip to be applied to a a *= xI a_val_store = a[0] - + out_fp = f"KerrEqEccAmpCoeffs_a{a_val_store:.3f}.h5" - outfile = h5py.File(os.path.join(file_directory, out_fp),"w") - outfile.attrs['signed_spin'] = a_val_store - outfile.attrs['is_coefficients'] = True + outfile = h5py.File(os.path.join(file_directory, out_fp), "w") + outfile.attrs["signed_spin"] = a_val_store + outfile.attrs["is_coefficients"] = True grid_size = p.shape[0] - + unique_u = np.unique(u) unique_w = np.unique(w) num_u = len(unique_u) @@ -673,17 +747,18 @@ def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): data 
= {name: val[:, ::-1] for name, val in data.items()} - spl2D = {name: - [ - RectBivariateSpline(unique_w, unique_u, val.real, kx=3, ky=3), - RectBivariateSpline(unique_w, unique_u, val.imag, kx=3, ky=3) + spl2D = { + name: [ + RectBivariateSpline(unique_w, unique_u, val.real, kx=3, ky=3), + RectBivariateSpline(unique_w, unique_u, val.imag, kx=3, ky=3), ] - for name, val in data.items()} + for name, val in data.items() + } mode_keys = list(data.keys()) num_teuk_modes = len(mode_keys) - outfile.attrs['num_teuk_modes'] = num_teuk_modes + outfile.attrs["num_teuk_modes"] = num_teuk_modes first_key = list(spl2D.keys())[0] example_spl = spl2D[first_key][0] @@ -696,37 +771,38 @@ def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): len_indiv_c = tck_last_entry.shape[-1] - outfile.attrs['spline_degree_x'] = degrees[0] - outfile.attrs['spline_degree_y'] = degrees[1] - outfile.attrs['points_per_modegrid'] = len_indiv_c + outfile.attrs["spline_degree_x"] = degrees[0] + outfile.attrs["spline_degree_y"] = degrees[1] + outfile.attrs["points_per_modegrid"] = len_indiv_c - outfile.create_dataset('x1', data=example_spl.tck[0]) - outfile.create_dataset('x2', data=example_spl.tck[1]) - outfile.create_dataset('c', data=tck_last_entry.copy()) + outfile.create_dataset("x1", data=example_spl.tck[0]) + outfile.create_dataset("x2", data=example_spl.tck[1]) + outfile.create_dataset("c", data=tck_last_entry.copy()) outfile.close() return out_fp + if __name__ == "__main__": # try and instantiate the amplitude class - spin_values = np.r_[np.linspace(0.,0.9,10),0.95,0.99] - spin_values = np.r_[-np.flip(spin_values)[:-1],spin_values] + spin_values = np.r_[np.linspace(0.0, 0.9, 10), 0.95, 0.99] + spin_values = np.r_[-np.flip(spin_values)[:-1], spin_values] base_path = "Teuk_amps_a{:.2f}_{}lmax_10_nmax_50_new_m+.h5" filepaths = [] for spin in spin_values: part1 = abs(spin) if spin < 0: - part2 = 'r_' + part2 = "r_" elif spin > 0: - part2 = 'p_' + part2 = "p_" elif spin == 0: - part2 = '' + part2 = "" filepaths.append(base_path.format(part1, part2)) - #running this should auto-produce coefficients files + # running this should auto-produce coefficients files AmpInterpKerrEqEcc(filenames=filepaths, file_directory="../../processed_amplitudes") amp = AmpInterpKerrEqEcc() - print(amp(0., np.array([10.]), np.array([0.3]), np.array([1.]))) + print(amp(0.0, np.array([10.0]), np.array([0.3]), np.array([1.0]))) diff --git a/few/amplitude/romannet.py b/few/amplitude/romannet.py index d9c46c93..9e3c8e56 100644 --- a/few/amplitude/romannet.py +++ b/few/amplitude/romannet.py @@ -23,8 +23,8 @@ import h5py # Cython/C++ imports -from pymatmul_cpu import neural_layer_wrap as neural_layer_wrap_cpu -from pymatmul_cpu import transform_output_wrap as transform_output_wrap_cpu +from ..cutils.pymatmul_cpu import neural_layer_wrap as neural_layer_wrap_cpu +from ..cutils.pymatmul_cpu import transform_output_wrap as transform_output_wrap_cpu # Python imports from ..utils.baseclasses import ( @@ -40,7 +40,7 @@ # check for cupy and GPU version of pymatmul try: # Cython/C++ imports - from pymatmul import neural_layer_wrap, transform_output_wrap + from ..cutils.pymatmul import neural_layer_wrap, transform_output_wrap # Python imports import cupy as cp @@ -205,7 +205,7 @@ def _initialize_weights(self): self.num_layers = 0 # extract all necessary information from the file - with h5py.File(os.path.join(self.file_dir,self.data_file), "r") as fp: + with h5py.File(os.path.join(self.file_dir, self.data_file), "r") as fp: for key, 
value in fp.items(): if key == "reduced_basis": continue @@ -283,8 +283,12 @@ def get_amplitudes(self, a, p, e, xI, *args, specific_modes=None, **kwargs): self.max_init_len = input_len self.temp_mats = [ - self.xp.zeros((self.max_num * self.max_init_len,), dtype=self.xp.float64), - self.xp.zeros((self.max_num * self.max_init_len,), dtype=self.xp.float64), + self.xp.zeros( + (self.max_num * self.max_init_len,), dtype=self.xp.float64 + ), + self.xp.zeros( + (self.max_num * self.max_init_len,), dtype=self.xp.float64 + ), ] # the input is (y, e) @@ -298,10 +302,14 @@ def get_amplitudes(self, a, p, e, xI, *args, specific_modes=None, **kwargs): # setup arrays # teukolsky mode (final output) - teuk_modes = self.xp.zeros((input_len * self.num_teuk_modes,), dtype=self.xp.complex128) + teuk_modes = self.xp.zeros( + (input_len * self.num_teuk_modes,), dtype=self.xp.complex128 + ) # neural network output - nn_out_mat = self.xp.zeros((input_len * self.break_index,), dtype=self.xp.complex128) + nn_out_mat = self.xp.zeros( + (input_len * self.break_index,), dtype=self.xp.complex128 + ) # run the neural network for i, (weight, bias, run_relu) in enumerate( diff --git a/few/summation/aakwave.py b/few/summation/aakwave.py index d4586c20..26e2fd5f 100644 --- a/few/summation/aakwave.py +++ b/few/summation/aakwave.py @@ -30,11 +30,11 @@ import numpy as np # Cython/C++ imports -from pycpuAAK import pyWaveform as pyWaveform_cpu +from ..cutils.pycpuAAK import pyWaveform as pyWaveform_cpu # Attempt Cython imports of GPU functions try: - from pygpuAAK import pyWaveform as pyWaveform_gpu + from ..cutils.pygpuAAK import pyWaveform as pyWaveform_gpu except (ImportError, ModuleNotFoundError) as e: pass @@ -214,7 +214,7 @@ def sum( # convert to gpu if desired interp_coeffs_in = self.xp.transpose( self.xp.asarray(interp_coeffs), [2, 0, 1] - ).flatten() + ).flatten() # generator the waveform self.waveform_generator( @@ -238,6 +238,7 @@ def sum( return + class KerrAAKSummation(SummationBase, Pn5AAK, ParallelModuleBase): """Calculate an AAK waveform from an input trajectory. @@ -382,7 +383,9 @@ def sum( xI = Y_to_xI(a, p.copy(), e.copy(), Y.copy()) # these are dimensionless and in radians - OmegaPhi, OmegaTheta, OmegaR = get_fundamental_frequencies(a, p.copy(), e.copy(), xI.copy()) + OmegaPhi, OmegaTheta, OmegaR = get_fundamental_frequencies( + a, p.copy(), e.copy(), xI.copy() + ) # Set theta trajectories equal to eachother. 
OmegaTheta = OmegaPhi @@ -450,4 +453,3 @@ def sum( ) return - diff --git a/few/summation/fdinterp.py b/few/summation/fdinterp.py index 6487dc93..8e43e9a9 100644 --- a/few/summation/fdinterp.py +++ b/few/summation/fdinterp.py @@ -27,7 +27,7 @@ import numpy as np # Cython imports -from pyinterp_cpu import ( +from ..cutils.pyinterp_cpu import ( get_waveform_generic_fd_wrap as get_waveform_generic_fd_wrap_cpu, ) @@ -44,7 +44,9 @@ # Attempt Cython imports of GPU functions try: - from pyinterp import get_waveform_generic_fd_wrap as get_waveform_generic_fd_wrap_gpu + from ..cutils.pyinterp import ( + get_waveform_generic_fd_wrap as get_waveform_generic_fd_wrap_gpu, + ) except (ImportError, ModuleNotFoundError) as e: pass @@ -114,7 +116,8 @@ def searchsorted2d_vec(a, b, batch_size=-1, xp=None, **kwargs): m, n = a_temp.shape max_num = ( - self.xp.maximum(a_temp.max() - a_temp.min(), b_temp.max() - b_temp.min()) + 1 + self.xp.maximum(a_temp.max() - a_temp.min(), b_temp.max() - b_temp.min()) + + 1 ) r = max_num * self.xp.arange(a_temp.shape[0])[:, None] p = self.xp.searchsorted( @@ -146,7 +149,11 @@ def __init__(self, *args, **kwargs): @property def get_waveform_fd(self) -> callable: """GPU or CPU waveform generation.""" - return get_waveform_generic_fd_wrap_cpu if not self.use_gpu else get_waveform_generic_fd_wrap_gpu + return ( + get_waveform_generic_fd_wrap_cpu + if not self.use_gpu + else get_waveform_generic_fd_wrap_gpu + ) def attributes_FDInterpolatedModeSum(self): """ @@ -170,7 +177,7 @@ def sum( teuk_modes, ylms, phase_interp_t, - phase_interp_coeffs, + phase_interp_coeffs, m_arr, n_arr, M, @@ -251,12 +258,17 @@ def sum( # get fundamental frequencies across trajectory Omega_phi, Omega_theta, Omega_r = get_fundamental_frequencies( - a, p, e, xI, + a, + p, + e, + xI, ) # convert from dimensionless frequencies f_phi, f_r = ( - abs(self.xp.asarray(Omega_phi / (2 * np.pi * M * MTSUN_SI))), # positive frequency to be consistent with amplitude generator for retrograde inspirals # TODO get to the bottom of this! + abs( + self.xp.asarray(Omega_phi / (2 * np.pi * M * MTSUN_SI)) + ), # positive frequency to be consistent with amplitude generator for retrograde inspirals # TODO get to the bottom of this! 
self.xp.asarray(Omega_r / (2 * np.pi * M * MTSUN_SI)), ) @@ -406,9 +418,9 @@ def sum( axis=-1, ) - tmp_freqs_base_sorted_segs[ - check_turnover, fix_turnover_seg_ind - ] = tmp_segs_sorted_turnover[:, np.array([0, 2])] + tmp_freqs_base_sorted_segs[check_turnover, fix_turnover_seg_ind] = ( + tmp_segs_sorted_turnover[:, np.array([0, 2])] + ) except ValueError: pass @@ -440,9 +452,9 @@ def sum( df = self.frequency[1] - self.frequency[0] # figures out where in self.frequency each segment frequency falls - inds_check = self.xp.abs((tmp_freqs_base_sorted_segs - first_frequency) / df).astype( - int - ) + inds_check = self.xp.abs( + (tmp_freqs_base_sorted_segs - first_frequency) / df + ).astype(int) # start frequency index of each segment start_inds = (inds_check[:, :, 0].copy() + 1).astype(int) @@ -490,8 +502,8 @@ def sum( phase_interp_coeffs_in = self.xp.transpose( self.xp.asarray(phase_interp_coeffs), [2, 1, 0] - ).flatten() - + ).flatten() + # run GPU kernel self.get_waveform_fd( self.waveform, diff --git a/few/summation/interpolatedmodesum.py b/few/summation/interpolatedmodesum.py index dc205672..a392c9b5 100644 --- a/few/summation/interpolatedmodesum.py +++ b/few/summation/interpolatedmodesum.py @@ -27,8 +27,8 @@ import numpy as np # Cython imports -from pyinterp_cpu import interpolate_arrays_wrap as interpolate_arrays_wrap_cpu -from pyinterp_cpu import get_waveform_wrap as get_waveform_wrap_cpu +from ..cutils.pyinterp_cpu import interpolate_arrays_wrap as interpolate_arrays_wrap_cpu +from ..cutils.pyinterp_cpu import get_waveform_wrap as get_waveform_wrap_cpu # Python imports from ..utils.baseclasses import ( @@ -42,8 +42,8 @@ # Attempt Cython imports of GPU functions try: - from pyinterp import interpolate_arrays_wrap as interpolate_arrays_wrap_gpu - from pyinterp import get_waveform_wrap as get_waveform_wrap_gpu + from ..cutils.pyinterp import interpolate_arrays_wrap as interpolate_arrays_wrap_gpu + from ..cutils.pyinterp import get_waveform_wrap as get_waveform_wrap_gpu except (ImportError, ModuleNotFoundError) as e: pass @@ -151,11 +151,15 @@ def attributes_CubicSplineInterpolate(self): (4, length, ninterps). The 4 is the 4 spline coefficients. 
""" + @property def interpolate_arrays(self) -> callable: """GPU or CPU waveform generation.""" - return interpolate_arrays_wrap_cpu if not self.use_gpu else interpolate_arrays_wrap_gpu - + return ( + interpolate_arrays_wrap_cpu + if not self.use_gpu + else interpolate_arrays_wrap_gpu + ) @property def gpu_capability(self): @@ -419,7 +423,7 @@ def sum( phase_interp_coeffs_in = self.xp.transpose( self.xp.asarray(phase_interp_coeffs), [2, 0, 1] - ).flatten() + ).flatten() self.get_waveform( self.waveform, diff --git a/setup.py b/setup.py index 36cb7e41..e62b552b 100644 --- a/setup.py +++ b/setup.py @@ -258,21 +258,25 @@ def build_extensions(self): ) matmul_ext = Extension( - "pymatmul", sources=["src/matmul.cu", "src/pymatmul.pyx"], **gpu_extension + "few.cutils.pymatmul", + sources=["src/matmul.cu", "src/pymatmul.pyx"], + **gpu_extension, ) interp_ext = Extension( - "pyinterp", sources=["src/interpolate.cu", "src/pyinterp.pyx"], **gpu_extension + "few.cutils.pyinterp", + sources=["src/interpolate.cu", "src/pyinterp.pyx"], + **gpu_extension, ) gpuAAK_ext = Extension( - "pygpuAAK", + "few.cutils.pygpuAAK", sources=["src/gpuAAK.cu", "src/gpuAAKWrap.pyx"], **gpu_extension, ) gpu_amp_interp_2d_ext = Extension( - "pyAmpInterp2D", + "few.cutils.pyAmpInterp2D", sources=["src/AmpInterp2D.cu", "src/pyampinterp2D.pyx"], **gpu_extension, ) @@ -314,23 +318,25 @@ def build_extensions(self): cpu_extension["include_dirs"] += gsl_include matmul_cpu_ext = Extension( - "pymatmul_cpu", sources=["src/matmul.cpp", "src/pymatmul_cpu.pyx"], **cpu_extension + "few.cutils.pymatmul_cpu", + sources=["src/matmul.cpp", "src/pymatmul_cpu.pyx"], + **cpu_extension, ) interp_cpu_ext = Extension( - "pyinterp_cpu", + "few.cutils.pyinterp_cpu", sources=["src/interpolate.cpp", "src/pyinterp_cpu.pyx"], **cpu_extension, ) AAK_cpu_ext = Extension( - "pycpuAAK", + "few.cutils.pycpuAAK", sources=["src/gpuAAK.cpp", "src/gpuAAKWrap_cpu.pyx"], **cpu_extension, ) amp_interp_2d_ext = Extension( - "pyAmpInterp2D_cpu", + "few.cutils.pyAmpInterp2D_cpu", sources=["src/AmpInterp2D.cpp", "src/pyampinterp2D_cpu.pyx"], **cpu_extension, ) @@ -361,7 +367,16 @@ def build_extensions(self): version="1.5.5", url="https://github.com/mikekatz04/FastEMRIWaveforms", ext_modules=extensions, - packages=["few", "few.utils", "few.trajectory", "few.trajectory.ode", "few.amplitude", "few.summation", "few.waveform"], + packages=[ + "few", + "few.utils", + "few.cutils", + "few.trajectory", + "few.trajectory.ode", + "few.amplitude", + "few.summation", + "few.waveform", + ], classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: GNU General Public License (GPL)", @@ -377,5 +392,3 @@ def build_extensions(self): zip_safe=False, python_requires=">=3.6", ) - - diff --git a/src/matmul.cu b/src/matmul.cu index cd0de58e..3e60831e 100644 --- a/src/matmul.cu +++ b/src/matmul.cu @@ -28,17 +28,40 @@ using namespace std::chrono; // adjust imports for CUDA #ifdef __CUDACC__ #include "cublas_v2.h" -#else -#include #endif #define NUM_THREADS 256 +// matmul sub for cblas for backward compatibility +// FORTRAN STYLE COLUMN MAJOR (UGH) +template +CUDA_CALLABLE_MEMBER void our_cblas_gemm( + int m, + int n, + int k, + T *a, + T *b, + T *c) +{ + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + c[j * m + i] = 0.0; + for (int l = 0; l < k; l++) + { + c[j * m + i] += a[l * m + i] * b[j * k + l]; + } + } + } +} + // activation function // fixed 0.2 in leaky end -CUDA_CALLABLE_MEMBER double LeakyReLU(double x){ - double out = (x >= 0.0) ? 
x : 0.2*x; - return out; +CUDA_CALLABLE_MEMBER double LeakyReLU(double x) +{ + double out = (x >= 0.0) ? x : 0.2 * x; + return out; } // funciton for adding bias and then passing through activation @@ -46,8 +69,8 @@ CUDA_KERNEL void add_bias_relu(double *C, double *bias, int input_len, int dim2) { - // adjust loop boundaries in CUDA - #ifdef __CUDACC__ +// adjust loop boundaries in CUDA +#ifdef __CUDACC__ int start1 = blockIdx.x * blockDim.x + threadIdx.x; int end1 = input_len; int diff1 = blockDim.x * gridDim.x; @@ -56,7 +79,7 @@ void add_bias_relu(double *C, double *bias, int input_len, int dim2) int end2 = dim2; int diff2 = blockDim.y * gridDim.y; - #else +#else int start1 = 0; int end1 = input_len; @@ -66,30 +89,28 @@ void add_bias_relu(double *C, double *bias, int input_len, int dim2) int end2 = dim2; int diff2 = 1; - - #endif +#endif for (int i = start1; i < end1; i += diff1) { for (int j = start2; - j < end2; - j += diff2) + j < end2; + j += diff2) { - C[input_len*j + i] = LeakyReLU(C[input_len*j + i] + bias[j]); - + C[input_len * j + i] = LeakyReLU(C[input_len * j + i] + bias[j]); } } } // funciton for adding bias and WITHOUT passing through activation CUDA_KERNEL -void add_bias(double *C, double *bias, int input_len, int dim2){ - +void add_bias(double *C, double *bias, int input_len, int dim2) +{ - #ifdef __CUDACC__ +#ifdef __CUDACC__ int start1 = blockIdx.x * blockDim.x + threadIdx.x; int end1 = input_len; int diff1 = blockDim.x * gridDim.x; @@ -98,7 +119,7 @@ void add_bias(double *C, double *bias, int input_len, int dim2){ int end2 = dim2; int diff2 = blockDim.y * gridDim.y; - #else +#else int start1 = 0; int end1 = input_len; @@ -108,19 +129,18 @@ void add_bias(double *C, double *bias, int input_len, int dim2){ int end2 = dim2; int diff2 = 1; - - #endif +#endif for (int i = start1; i < end1; i += diff1) { for (int j = start2; - j < end2; - j += diff2) + j < end2; + j += diff2) { - C[input_len*j + i] = C[input_len*j + i] + bias[j]; + C[input_len * j + i] = C[input_len * j + i] + bias[j]; } } } @@ -128,169 +148,183 @@ void add_bias(double *C, double *bias, int input_len, int dim2){ // perform matrix calculations in blas for a neural network layer void neural_layer(double *mat_out, double *mat_in, double *weight, double *bias, int m, int k, int n, int run_relu) { - #ifdef __CUDACC__ - cublasHandle_t handle; - - char * status; - cublasStatus_t stat; - double alpha = 1.0; - double beta = 0.0; - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - exit(0); - } - - // matrix multiplication - stat = cublasDgemm(handle, - CUBLAS_OP_N, CUBLAS_OP_N, - m, n, k, - &alpha, - mat_in, m, - weight, k, - &beta, - mat_out, m); - - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - exit(0); - } - - stat = cublasDestroy(handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - exit(0); - } +#ifdef __CUDACC__ + cublasHandle_t handle; + + char *status; + cublasStatus_t stat; + double alpha = 1.0; + double beta = 0.0; + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + { + printf("CUBLAS initialization failed\n"); + exit(0); + } + + // matrix multiplication + stat = cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + m, n, k, + &alpha, + mat_in, m, + weight, k, + &beta, + mat_out, m); + + if (stat != CUBLAS_STATUS_SUCCESS) + { + printf("CUBLAS initialization failed\n"); + exit(0); + } + + stat = cublasDestroy(handle); + if (stat != CUBLAS_STATUS_SUCCESS) 
+ { + printf("CUBLAS initialization failed\n"); + exit(0); + } // Add the bias and activate, except in last layer do not activate - int num_threads = 256; - int num_blocks = std::ceil((m + num_threads -1)/num_threads); - dim3 gridDim(num_blocks, n); + int num_threads = 256; + int num_blocks = std::ceil((m + num_threads - 1) / num_threads); + dim3 gridDim(num_blocks, n); - if (run_relu){ - add_bias_relu<<>>(mat_out, bias, m, n); - } else { - add_bias<<>>(mat_out, bias, m, n); - } - cudaDeviceSynchronize(); - gpuErrchk(cudaGetLastError()); + if (run_relu) + { + add_bias_relu<<>>(mat_out, bias, m, n); + } + else + { + add_bias<<>>(mat_out, bias, m, n); + } + cudaDeviceSynchronize(); + gpuErrchk(cudaGetLastError()); - #else +#else // perform calculations in cblas - cblas_dgemm (CblasColMajor, - CblasNoTrans, CblasNoTrans, m, n, k, - 1.0, mat_in, m, weight, k, 0.0, mat_out, m); + our_cblas_gemm(m, n, k, + mat_in, weight, mat_out); - if (run_relu){ + if (run_relu) + { add_bias_relu(mat_out, bias, m, n); - } else { + } + else + { add_bias(mat_out, bias, m, n); } - #endif +#endif } // take the output of the neural net and conver it from (re_1,..,re_n, im_1, ..., im_n) // to imaginary CUDA_KERNEL void form_complex_output(cmplx *complex_output, double *nn_output, int input_len, int break_index, - double transform_factor_inv){ - - cmplx temp(0.0, 0.0); + double transform_factor_inv) +{ - #ifdef __CUDACC__ - int start1 = blockIdx.x * blockDim.x + threadIdx.x; - int end1 = input_len; - int diff1 = blockDim.x * gridDim.x; + cmplx temp(0.0, 0.0); - int start2 = blockIdx.y * blockDim.y + threadIdx.y; - int end2 = break_index; - int diff2 = blockDim.y * gridDim.y; +#ifdef __CUDACC__ + int start1 = blockIdx.x * blockDim.x + threadIdx.x; + int end1 = input_len; + int diff1 = blockDim.x * gridDim.x; - #else + int start2 = blockIdx.y * blockDim.y + threadIdx.y; + int end2 = break_index; + int diff2 = blockDim.y * gridDim.y; - int start1 = 0; - int end1 = input_len; - int diff1 = 1; +#else - int start2 = 0; - int end2 = break_index; - int diff2 = 1; + int start1 = 0; + int end1 = input_len; + int diff1 = 1; + int start2 = 0; + int end2 = break_index; + int diff2 = 1; - #endif - for (int i = start1; - i < end1; - i += diff1){ +#endif + for (int i = start1; + i < end1; + i += diff1) + { - for (int ind = start2; - ind < end2; - ind += diff2){ + for (int ind = start2; + ind < end2; + ind += diff2) + { // break index tells how many real entries or imaginary entries - temp = cmplx(nn_output[ind*input_len + i], nn_output[(break_index+ind)*input_len + i]); - complex_output[ind*input_len + i] = temp*transform_factor_inv; - } - } + temp = cmplx(nn_output[ind * input_len + i], nn_output[(break_index + ind) * input_len + i]); + complex_output[ind * input_len + i] = temp * transform_factor_inv; + } + } } // post neural net transform from reduced basis back to full amplitude basis void transform_output(cmplx *teuk_modes, cmplx *transform_matrix, cmplx *nn_output_mat, double *C, int input_len, int break_index, double transform_factor_inv, - int num_teuk_modes){ - - int m=input_len, k=break_index, n=num_teuk_modes; - #ifdef __CUDACC__ - int num_blocks = std::ceil((input_len + NUM_THREADS -1)/NUM_THREADS); - dim3 gridDim(num_blocks, break_index); - - // form the complex array of neural net outputs - form_complex_output<<>>(nn_output_mat, C, input_len, break_index, transform_factor_inv); - cudaDeviceSynchronize(); - gpuErrchk(cudaGetLastError()); - - - char * status; - cublasHandle_t handle; - cublasStatus_t stat; - 
cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); - cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0); - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - exit(0); - } - - // project back onto amplitude basis - stat = cublasZgemm(handle, - CUBLAS_OP_N, CUBLAS_OP_N, - m, n, k, - &alpha, - (cuDoubleComplex*)nn_output_mat, m, - (cuDoubleComplex*)transform_matrix, k, - &beta, - (cuDoubleComplex*)teuk_modes, m); - - status = _cudaGetErrorEnum(stat); + int num_teuk_modes) +{ + + int m = input_len, k = break_index, n = num_teuk_modes; +#ifdef __CUDACC__ + int num_blocks = std::ceil((input_len + NUM_THREADS - 1) / NUM_THREADS); + dim3 gridDim(num_blocks, break_index); + + // form the complex array of neural net outputs + form_complex_output<<>>(nn_output_mat, C, input_len, break_index, transform_factor_inv); + cudaDeviceSynchronize(); + gpuErrchk(cudaGetLastError()); + + char *status; + cublasHandle_t handle; + cublasStatus_t stat; + cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); + cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0); + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + { + printf("CUBLAS initialization failed\n"); + exit(0); + } + + // project back onto amplitude basis + stat = cublasZgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + m, n, k, + &alpha, + (cuDoubleComplex *)nn_output_mat, m, + (cuDoubleComplex *)transform_matrix, k, + &beta, + (cuDoubleComplex *)teuk_modes, m); + + status = _cudaGetErrorEnum(stat); cudaDeviceSynchronize(); stat = cublasDestroy(handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - exit(0); - } + if (stat != CUBLAS_STATUS_SUCCESS) + { + exit(0); + } - #else +#else - const cmplx alpha(1.0, 0.0); - const cmplx beta(0.0, 0.0); + const cmplx alpha(1.0, 0.0); + const cmplx beta(0.0, 0.0); // form the complex array of neural net outputs - form_complex_output(nn_output_mat, C, input_len, break_index, transform_factor_inv); + form_complex_output(nn_output_mat, C, input_len, break_index, transform_factor_inv); - // transform to amplitude basis - cblas_zgemm (CblasColMajor, - CblasNoTrans, CblasNoTrans, m, n, k, - (void*)&alpha, (void*)nn_output_mat, m, (void*)transform_matrix, k, (void*)&beta, (void*)teuk_modes, m); - #endif + // transform to amplitude basis + our_cblas_gemm(m, n, k, nn_output_mat, transform_matrix, teuk_modes); + // cblas_zgemm(CblasColMajor, + // CblasNoTrans, CblasNoTrans, m, n, k, + // (void *)&alpha, (void *)nn_output_mat, m, (void *)transform_matrix, k, (void *)&beta, (void *)teuk_modes, m); + +#endif }
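
Note on the import changes above: every compiled Cython/CUDA extension is now built into the `few.cutils` subpackage (see the new `Extension` names in `setup.py`) and imported relatively, e.g. `from ..cutils.pymatmul_cpu import ...`, with the GPU variant still guarded by try/except so CPU-only installs keep working. Below is a minimal sketch of that pattern from outside the package; the `pick_backend` helper is illustrative only (not part of the diff) and it assumes `few` is installed with its compiled extensions.

# Minimal sketch of the GPU/CPU backend-selection pattern used throughout this diff.
# Module paths follow the new "few.cutils.*" Extension names in setup.py; the helper
# function is an illustration only, not part of the package.
from few.cutils.pymatmul_cpu import neural_layer_wrap as neural_layer_wrap_cpu

try:
    # GPU backend is only importable when the CUDA extensions and cupy are present.
    from few.cutils.pymatmul import neural_layer_wrap as neural_layer_wrap_gpu
    import cupy as cp
except (ImportError, ModuleNotFoundError):
    neural_layer_wrap_gpu = None

def pick_backend(use_gpu: bool = False):
    """Return the GPU kernel when requested and available, otherwise the CPU one."""
    if use_gpu and neural_layer_wrap_gpu is not None:
        return neural_layer_wrap_gpu
    return neural_layer_wrap_cpu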
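
The `our_cblas_gemm` template added to `src/matmul.cu` replaces the `cblas_dgemm`/`cblas_zgemm` calls with a plain column-major (FORTRAN-ordered) product C(m x n) = A(m x k) * B(k x n), using the indexing `c[j*m + i] = sum_l a[l*m + i] * b[j*k + l]`. The following NumPy reference reproduces the same indexing; it is only an external sanity check for the kernel, not part of the package.

import numpy as np

def our_cblas_gemm_reference(m, n, k, a_flat, b_flat):
    """Column-major reference for our_cblas_gemm in src/matmul.cu:
    c[j*m + i] = sum over l of a[l*m + i] * b[j*k + l]."""
    a = a_flat.reshape(k, m).T   # flat column-major storage -> (m x k) matrix
    b = b_flat.reshape(n, k).T   # flat column-major storage -> (k x n) matrix
    return (a @ b).T.ravel()     # (m x n) product back to flat column-major

# self-check against the explicit index arithmetic used in the C++ kernel
m, n, k = 3, 4, 2
rng = np.random.default_rng(0)
a_flat, b_flat = rng.standard_normal(m * k), rng.standard_normal(k * n)
c_ref = our_cblas_gemm_reference(m, n, k, a_flat, b_flat)
c_loop = np.zeros(m * n)
for i in range(m):
    for j in range(n):
        c_loop[j * m + i] = sum(a_flat[l * m + i] * b_flat[j * k + l] for l in range(k))
assert np.allclose(c_ref, c_loop)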