diff --git a/few/amplitude/ampinterp2d.py b/few/amplitude/ampinterp2d.py index 52b3c443..455a21cd 100644 --- a/few/amplitude/ampinterp2d.py +++ b/few/amplitude/ampinterp2d.py @@ -28,7 +28,11 @@ # Cython/C++ imports # Python imports -from ..utils.baseclasses import SchwarzschildEccentric, ParallelModuleBase, KerrEccentricEquatorial +from ..utils.baseclasses import ( + SchwarzschildEccentric, + ParallelModuleBase, + KerrEccentricEquatorial, +) from .base import AmplitudeBase from ..utils.utility import check_for_file_download from ..utils.citations import * @@ -37,20 +41,21 @@ # check for cupy and GPU version of pymatmul try: # Cython/C++ imports - from pyAmpInterp2D import interp2D as interp2D_gpu + from ..cutils.pyAmpInterp2D import interp2D as interp2D_gpu + # Python imports import cupy as cp except (ImportError, ModuleNotFoundError) as e: import numpy as np -from pyAmpInterp2D_cpu import interp2D as interp2D_cpu +from ..cutils.pyAmpInterp2D_cpu import interp2D as interp2D_cpu # get path to this file dir_path = os.path.dirname(os.path.realpath(__file__)) -#TODO: handle multiple waveform models +# TODO: handle multiple waveform models _DEFAULT_SPINS = [ -0.99, -0.95, @@ -63,7 +68,7 @@ -0.3, -0.2, -0.1, - 0., + 0.0, 0.1, 0.2, 0.3, @@ -74,13 +79,14 @@ 0.8, 0.9, 0.95, - 0.99 + 0.99, ] _DEFAULT_AMPLITUDE_FILENAMES = [ f"KerrEqEccAmpCoeffs_a{spin:.3f}.h5" for spin in _DEFAULT_SPINS ] + class AmpInterp2D(AmplitudeBase, ParallelModuleBase): """Calculate Teukolsky amplitudes with a ROMAN. @@ -168,14 +174,14 @@ def __init__(self, fp, l_arr, m_arr, n_arr, file_directory=None, **kwargs): self.file_dir = dir_path + "/../../few/files/" else: self.file_dir = file_directory - + # check if user has the necessary data # if not, the data will automatically download check_for_file_download(fp, self.file_dir) - + mystery_file = h5py.File(os.path.join(self.file_dir, fp)) try: - is_coeffs = mystery_file.attrs['is_coefficients'] + is_coeffs = mystery_file.attrs["is_coefficients"] except KeyError: is_coeffs = False @@ -183,30 +189,40 @@ def __init__(self, fp, l_arr, m_arr, n_arr, file_directory=None, **kwargs): coefficients = mystery_file else: print(fp, "is not a spline coefficients file. 
Attempting to convert...") - spline_fp = _spline_coefficients_to_file(fp, self.l_arr, self.m_arr, self.n_arr, file_directory=self.file_dir) + spline_fp = _spline_coefficients_to_file( + fp, self.l_arr, self.m_arr, self.n_arr, file_directory=self.file_dir + ) coefficients = h5py.File(os.path.join(self.file_dir, spline_fp)) - self.a_val_store = coefficients.attrs['signed_spin'] + self.a_val_store = coefficients.attrs["signed_spin"] - self.num_teuk_modes = coefficients.attrs['num_teuk_modes'] + self.num_teuk_modes = coefficients.attrs["num_teuk_modes"] self.tck = [ - self.xp.asarray(coefficients['x1']), - self.xp.asarray(coefficients['x2']), - self.xp.asarray(coefficients['c']) + self.xp.asarray(coefficients["x1"]), + self.xp.asarray(coefficients["x2"]), + self.xp.asarray(coefficients["c"]), ] - self.degrees = coefficients.attrs['spline_degree_x'], coefficients.attrs['spline_degree_y'] - self.len_indiv_c = coefficients.attrs['points_per_modegrid'] + self.degrees = ( + coefficients.attrs["spline_degree_x"], + coefficients.attrs["spline_degree_y"], + ) + self.len_indiv_c = coefficients.attrs["points_per_modegrid"] @property def interp2D(self) -> callable: """GPU or CPU interp2D""" interp2D = interp2D_cpu if not self.use_gpu else interp2D_gpu return interp2D - + @property def citation(self): """Return citations for this module""" - return romannet_citation + larger_few_citation + few_citation + few_software_citation + return ( + romannet_citation + + larger_few_citation + + few_citation + + few_software_citation + ) @property def gpu_capability(self): @@ -228,13 +244,18 @@ def __call__(self, a, p, e, xI, *args, specific_modes=None, **kwargs): sorted to increasing order. Note that the axis ordering is inverted relative to the output of meshgrid. - + """ grid = False try: - a_cpu, p_cpu, e_cpu, xI_cpu = a.get().copy(), p.get().copy(), e.get().copy(), xI.get().copy() + a_cpu, p_cpu, e_cpu, xI_cpu = ( + a.get().copy(), + p.get().copy(), + e.get().copy(), + xI.get().copy(), + ) except AttributeError: a_cpu, p_cpu, e_cpu, xI_cpu = a.copy(), p.copy(), e.copy(), xI.copy() @@ -252,7 +273,7 @@ def __call__(self, a, p, e, xI, *args, specific_modes=None, **kwargs): tw, tu, c = self.tck[:3] kw, ku = self.degrees - + # standard Numpy broadcasting if w.shape != u.shape: w, u = np.broadcast_arrays(w, u) @@ -275,36 +296,47 @@ def __call__(self, a, p, e, xI, *args, specific_modes=None, **kwargs): if specific_modes is None: mode_indexes = self.xp.arange(self.num_teuk_modes) - + else: if isinstance(specific_modes, self.xp.ndarray): mode_indexes = specific_modes - elif isinstance(specific_modes, list): # the following is slow and kills efficiency + elif isinstance( + specific_modes, list + ): # the following is slow and kills efficiency mode_indexes = self.xp.zeros(len(specific_modes), dtype=self.xp.int32) for i, (l, m, n) in enumerate(specific_modes): try: - mode_indexes[i] = np.where((self.l_arr == l) & (self.m_arr == abs(m)) & (self.n_arr == n))[0] + mode_indexes[i] = np.where( + (self.l_arr == l) + & (self.m_arr == abs(m)) + & (self.n_arr == n) + )[0] except: raise Exception(f"Could not find mode index ({l},{m},{n}).") # TODO: perform this in the kernel c_in = c[mode_indexes].flatten() - num_indiv_c = 2*len(mode_indexes) # Re and Im + num_indiv_c = 2 * len(mode_indexes) # Re and Im len_indiv_c = self.len_indiv_c z = self.xp.zeros((num_indiv_c * mw)) - - self.interp2D(z, tw, nw, tu, nu, c_in, kw, ku, w, mw, u, mu, num_indiv_c, len_indiv_c) - #check = np.asarray([[spl.ev(e.get(), y.get()) for spl in spl1] for spl1 
in self.spl2D.values()]).transpose(2, 1, 0) + self.interp2D( + z, tw, nw, tu, nu, c_in, kw, ku, w, mw, u, mu, num_indiv_c, len_indiv_c + ) + + # check = np.asarray([[spl.ev(e.get(), y.get()) for spl in spl1] for spl1 in self.spl2D.values()]).transpose(2, 1, 0) - z = z.reshape(num_indiv_c//2, 2, mw).transpose(2, 1, 0) + z = z.reshape(num_indiv_c // 2, 2, mw).transpose(2, 1, 0) z = z[:, 0] + 1j * z[:, 1] return z - + def __reduce__(self): - return (self.__class__, (self.fp, self.l_arr, self.m_arr, self.n_arr, self.file_dir)) + return ( + self.__class__, + (self.fp, self.l_arr, self.m_arr, self.n_arr, self.file_dir), + ) class AmpInterpKerrEqEcc(AmplitudeBase, KerrEccentricEquatorial, ParallelModuleBase): @@ -324,24 +356,43 @@ def __init__(self, file_directory=None, filenames=None, **kwargs): else: self.filenames = filenames - self.spin_information_holder_unsorted = [None for _ in range(len(self.filenames))] + self.spin_information_holder_unsorted = [ + None for _ in range(len(self.filenames)) + ] for i, fp in enumerate(self.filenames): - self.spin_information_holder_unsorted[i] = AmpInterp2D(fp, self.l_arr, self.m_arr, self.n_arr, file_directory=self.file_dir, use_gpu=self.use_gpu) - - spin_values_unsorted = [sh.a_val_store for sh in self.spin_information_holder_unsorted] + self.spin_information_holder_unsorted[i] = AmpInterp2D( + fp, + self.l_arr, + self.m_arr, + self.n_arr, + file_directory=self.file_dir, + use_gpu=self.use_gpu, + ) + + spin_values_unsorted = [ + sh.a_val_store for sh in self.spin_information_holder_unsorted + ] rearrange_inds = np.argsort(spin_values_unsorted) self.spin_values = np.asarray(spin_values_unsorted)[rearrange_inds] - self.spin_information_holder = [self.spin_information_holder_unsorted[i] for i in rearrange_inds] - + self.spin_information_holder = [ + self.spin_information_holder_unsorted[i] for i in rearrange_inds + ] + pos_neg_n_swap_inds = [] if self.use_gpu: - for l,m,n in zip(self.l_arr_no_mask.get(),self.m_arr_no_mask.get(),self.n_arr_no_mask.get()): - pos_neg_n_swap_inds.append(self.special_index_map[(l,m,-n)]) + for l, m, n in zip( + self.l_arr_no_mask.get(), + self.m_arr_no_mask.get(), + self.n_arr_no_mask.get(), + ): + pos_neg_n_swap_inds.append(self.special_index_map[(l, m, -n)]) else: - for l,m,n in zip(self.l_arr_no_mask,self.m_arr_no_mask,self.n_arr_no_mask): - pos_neg_n_swap_inds.append(self.special_index_map[(l,m,-n)]) - + for l, m, n in zip( + self.l_arr_no_mask, self.m_arr_no_mask, self.n_arr_no_mask + ): + pos_neg_n_swap_inds.append(self.special_index_map[(l, m, -n)]) + self.pos_neg_n_swap_inds = self.xp.asarray(pos_neg_n_swap_inds) def get_amplitudes(self, a, p, e, xI, specific_modes=None): @@ -350,18 +401,22 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): # retrograde: spin pos, xI neg - > spin neg, xI pos assert isinstance(a, float) - assert np.all(xI == 1.0) or np.all(xI == -1.0) # either all prograde or all retrograde - xI_in = np.ones_like(p)*xI - + assert np.all(xI == 1.0) or np.all( + xI == -1.0 + ) # either all prograde or all retrograde + xI_in = np.ones_like(p) * xI + signed_spin = a * xI_in[0].item() if signed_spin in self.spin_values: ind_1 = np.where(self.spin_values == signed_spin)[0][0] a_in = np.full_like(p, signed_spin) - z = self.spin_information_holder[ind_1](a_in, p, e, xI_in, specific_modes=specific_modes) - if xI_in[0] == -1 and signed_spin != 0.: # retrograde needs mode flip - z = self.xp.conj(z[:,self.pos_neg_n_swap_inds]) + z = self.spin_information_holder[ind_1]( + a_in, p, e, xI_in, 
specific_modes=specific_modes + ) + if xI_in[0] == -1 and signed_spin != 0.0: # retrograde needs mode flip + z = self.xp.conj(z[:, self.pos_neg_n_swap_inds]) else: ind_above = np.where(self.spin_values > signed_spin)[0][0] @@ -387,8 +442,8 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): specific_modes_below = self.pos_neg_n_swap_inds[specific_modes] elif isinstance(specific_modes, list): specific_modes_below = [] - for (l, m, n) in specific_modes: - specific_modes_below.append((l,m,-n)) + for l, m, n in specific_modes: + specific_modes_below.append((l, m, -n)) else: apply_conjugate_below = False specific_modes_below = specific_modes @@ -399,27 +454,35 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): else: apply_conjugate_above = False specific_modes_above = specific_modes - - if apply_conjugate_above and apply_conjugate_below: # combine the flags to save a conj call if both retrograde + + if ( + apply_conjugate_above and apply_conjugate_below + ): # combine the flags to save a conj call if both retrograde apply_conjugate_total = True apply_conjugate_above = False apply_conjugate_below = False else: apply_conjugate_total = False - z_above = self.spin_information_holder[ind_above](a_above, p, e, xI_in, specific_modes=specific_modes_above) - z_below = self.spin_information_holder[ind_below](a_below, p, e, xI_in, specific_modes=specific_modes_below) + z_above = self.spin_information_holder[ind_above]( + a_above, p, e, xI_in, specific_modes=specific_modes_above + ) + z_below = self.spin_information_holder[ind_below]( + a_below, p, e, xI_in, specific_modes=specific_modes_below + ) if apply_conjugate_below: z_below = z_below.conj() if apply_conjugate_above: z_above = z_above.conj() - z = ((z_above - z_below) / (a_above_single - a_below_single)) * (signed_spin - a_below_single) + z_below + z = ((z_above - z_below) / (a_above_single - a_below_single)) * ( + signed_spin - a_below_single + ) + z_below if apply_conjugate_total: z = z.conj() if not isinstance(specific_modes, list): return z - + # dict containing requested modes else: temp = {} @@ -436,8 +499,9 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): class AmpInterpSchwarzEcc(AmplitudeBase, SchwarzschildEccentric, ParallelModuleBase): """ - A legacy class for compatibility with the old Schwarzschild waveform structure. + A legacy class for compatibility with the old Schwarzschild waveform structure. 
""" + def __init__(self, file_directory=None, filenames=None, **kwargs): ParallelModuleBase.__init__(self, **kwargs) @@ -453,13 +517,13 @@ def __init__(self, file_directory=None, filenames=None, **kwargs): self.filename = "Teuk_amps_a0.0_lmax_10_nmax_30_new.h5" else: if isinstance(filenames, list): - assert (len(filenames) == 1) + assert len(filenames) == 1 self.filename = filenames - + # check if user has the necessary data # if not, the data will automatically download check_for_file_download(self.filename, self.file_dir) - + data = {} with h5py.File(os.path.join(self.file_dir, self.filename), "r") as f: # load attributes in the right order for correct mode sorting later @@ -469,22 +533,22 @@ def __init__(self, file_directory=None, filenames=None, **kwargs): grid = f["grid"][:] for l, m, n in zip(self.l_arr, self.m_arr, self.n_arr): if m >= 0: - key1 = format_string1.format(l,m) + key1 = format_string1.format(l, m) key2 = format_string2.format(n) - tmp = f[key1+'/'+key2][:] + tmp = f[key1 + "/" + key2][:] tmp2 = tmp[:, 0] + 1j * tmp[:, 1] - data[savestring.format(l,m,n)] = tmp2.T + data[savestring.format(l, m, n)] = tmp2.T # create the coefficients file # adjust the grid p = grid.T[1].copy() e = grid.T[2].copy() - u = np.round(p_to_y(p, e, use_gpu=False),8) + u = np.round(p_to_y(p, e, use_gpu=False), 8) w = e.copy() grid_size = p.shape[0] - + unique_u = np.unique(u) unique_w = np.unique(w) num_u = len(unique_u) @@ -498,12 +562,13 @@ def __init__(self, file_directory=None, filenames=None, **kwargs): data = {name: val[:, ::-1] for name, val in data.items()} - spl2D = {name: - [ - RectBivariateSpline(unique_w, unique_u, val.real, kx=3, ky=3), - RectBivariateSpline(unique_w, unique_u, val.imag, kx=3, ky=3) + spl2D = { + name: [ + RectBivariateSpline(unique_w, unique_u, val.real, kx=3, ky=3), + RectBivariateSpline(unique_w, unique_u, val.imag, kx=3, ky=3), ] - for name, val in data.items()} + for name, val in data.items() + } mode_keys = list(data.keys()) num_teuk_modes = len(mode_keys) @@ -514,18 +579,18 @@ def __init__(self, file_directory=None, filenames=None, **kwargs): for i, mode in enumerate(mode_keys): tck_last_entry[i, 0] = spl2D[mode][0].tck[2] tck_last_entry[i, 1] = spl2D[mode][1].tck[2] - + self.tck = [ self.xp.asarray(example_spl.tck[0]), self.xp.asarray(example_spl.tck[1]), - self.xp.asarray(tck_last_entry.copy()) + self.xp.asarray(tck_last_entry.copy()), ] - + self.num_teuk_modes = num_teuk_modes self.degrees = example_spl.degrees self.len_indiv_c = tck_last_entry.shape[-1] - + @property def interp2D(self) -> callable: """GPU or CPU interp2D""" @@ -534,10 +599,10 @@ def interp2D(self) -> callable: def get_amplitudes(self, a, p, e, xI, specific_modes=None): - assert (a == 0.) 
+ assert a == 0.0 assert np.all(xI == 1.0) - + try: p_cpu, e_cpu = p.get().copy(), e.get().copy() except AttributeError: @@ -553,7 +618,7 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): tw, tu, c = self.tck[:3] kw, ku = self.degrees - + # standard Numpy broadcasting if w.shape != u.shape: w, u = np.broadcast_arrays(w, u) @@ -576,35 +641,43 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): if specific_modes is None: mode_indexes = self.xp.arange(self.num_teuk_modes) - + else: if isinstance(specific_modes, self.xp.ndarray): mode_indexes = specific_modes - elif isinstance(specific_modes, list): # the following is slow and kills efficiency + elif isinstance( + specific_modes, list + ): # the following is slow and kills efficiency mode_indexes = self.xp.zeros(len(specific_modes), dtype=self.xp.int32) for i, (l, m, n) in enumerate(specific_modes): try: - mode_indexes[i] = np.where((self.l_arr == l) & (self.m_arr == abs(m)) & (self.n_arr == n))[0] + mode_indexes[i] = np.where( + (self.l_arr == l) + & (self.m_arr == abs(m)) + & (self.n_arr == n) + )[0] except: raise Exception(f"Could not find mode index ({l},{m},{n}).") # TODO: perform this in the kernel c_in = c[mode_indexes].flatten() - num_indiv_c = 2*len(mode_indexes) # Re and Im + num_indiv_c = 2 * len(mode_indexes) # Re and Im len_indiv_c = self.len_indiv_c z = self.xp.zeros((num_indiv_c * mw)) - - self.interp2D(z, tw, nw, tu, nu, c_in, kw, ku, w, mw, u, mu, num_indiv_c, len_indiv_c) - z = z.reshape(num_indiv_c//2, 2, mw).transpose(2, 1, 0) + self.interp2D( + z, tw, nw, tu, nu, c_in, kw, ku, w, mw, u, mu, num_indiv_c, len_indiv_c + ) + + z = z.reshape(num_indiv_c // 2, 2, mw).transpose(2, 1, 0) z = z[:, 0] + 1j * z[:, 1] if not isinstance(specific_modes, list): return z - + # dict containing requested modes else: temp = {} @@ -621,6 +694,7 @@ def get_amplitudes(self, a, p, e, xI, specific_modes=None): def __reduce__(self): return (self.__class__, (self.file_dir, self.filename)) + def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): data = {} # get information about this specific model from the file @@ -630,7 +704,7 @@ def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): grid = f["grid"][:] for l, m, n in zip(l_arr, m_arr, n_arr): if m >= 0: - key1 = kerr_format_string.format(l,m,n) + key1 = kerr_format_string.format(l, m, n) tmp = f[key1][:] tmp2 = tmp[:, 0] + 1j * tmp[:, 1] data[key1] = tmp2 @@ -642,7 +716,7 @@ def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): p = grid.T[1].copy() e = grid.T[2].copy() xI = grid.T[3].copy() - u = np.round(grid.T[4].copy(),8) # fix rounding errors in the files + u = np.round(grid.T[4].copy(), 8) # fix rounding errors in the files sep = grid.T[5].copy() w = grid.T[6].copy() @@ -652,14 +726,14 @@ def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): # retrograde needs sign flip to be applied to a a *= xI a_val_store = a[0] - + out_fp = f"KerrEqEccAmpCoeffs_a{a_val_store:.3f}.h5" - outfile = h5py.File(os.path.join(file_directory, out_fp),"w") - outfile.attrs['signed_spin'] = a_val_store - outfile.attrs['is_coefficients'] = True + outfile = h5py.File(os.path.join(file_directory, out_fp), "w") + outfile.attrs["signed_spin"] = a_val_store + outfile.attrs["is_coefficients"] = True grid_size = p.shape[0] - + unique_u = np.unique(u) unique_w = np.unique(w) num_u = len(unique_u) @@ -673,17 +747,18 @@ def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): data 
= {name: val[:, ::-1] for name, val in data.items()} - spl2D = {name: - [ - RectBivariateSpline(unique_w, unique_u, val.real, kx=3, ky=3), - RectBivariateSpline(unique_w, unique_u, val.imag, kx=3, ky=3) + spl2D = { + name: [ + RectBivariateSpline(unique_w, unique_u, val.real, kx=3, ky=3), + RectBivariateSpline(unique_w, unique_u, val.imag, kx=3, ky=3), ] - for name, val in data.items()} + for name, val in data.items() + } mode_keys = list(data.keys()) num_teuk_modes = len(mode_keys) - outfile.attrs['num_teuk_modes'] = num_teuk_modes + outfile.attrs["num_teuk_modes"] = num_teuk_modes first_key = list(spl2D.keys())[0] example_spl = spl2D[first_key][0] @@ -696,37 +771,38 @@ def _spline_coefficients_to_file(fp, l_arr, m_arr, n_arr, file_directory=None): len_indiv_c = tck_last_entry.shape[-1] - outfile.attrs['spline_degree_x'] = degrees[0] - outfile.attrs['spline_degree_y'] = degrees[1] - outfile.attrs['points_per_modegrid'] = len_indiv_c + outfile.attrs["spline_degree_x"] = degrees[0] + outfile.attrs["spline_degree_y"] = degrees[1] + outfile.attrs["points_per_modegrid"] = len_indiv_c - outfile.create_dataset('x1', data=example_spl.tck[0]) - outfile.create_dataset('x2', data=example_spl.tck[1]) - outfile.create_dataset('c', data=tck_last_entry.copy()) + outfile.create_dataset("x1", data=example_spl.tck[0]) + outfile.create_dataset("x2", data=example_spl.tck[1]) + outfile.create_dataset("c", data=tck_last_entry.copy()) outfile.close() return out_fp + if __name__ == "__main__": # try and instantiate the amplitude class - spin_values = np.r_[np.linspace(0.,0.9,10),0.95,0.99] - spin_values = np.r_[-np.flip(spin_values)[:-1],spin_values] + spin_values = np.r_[np.linspace(0.0, 0.9, 10), 0.95, 0.99] + spin_values = np.r_[-np.flip(spin_values)[:-1], spin_values] base_path = "Teuk_amps_a{:.2f}_{}lmax_10_nmax_50_new_m+.h5" filepaths = [] for spin in spin_values: part1 = abs(spin) if spin < 0: - part2 = 'r_' + part2 = "r_" elif spin > 0: - part2 = 'p_' + part2 = "p_" elif spin == 0: - part2 = '' + part2 = "" filepaths.append(base_path.format(part1, part2)) - #running this should auto-produce coefficients files + # running this should auto-produce coefficients files AmpInterpKerrEqEcc(filenames=filepaths, file_directory="../../processed_amplitudes") amp = AmpInterpKerrEqEcc() - print(amp(0., np.array([10.]), np.array([0.3]), np.array([1.]))) + print(amp(0.0, np.array([10.0]), np.array([0.3]), np.array([1.0]))) diff --git a/few/amplitude/romannet.py b/few/amplitude/romannet.py index d9c46c93..9e3c8e56 100644 --- a/few/amplitude/romannet.py +++ b/few/amplitude/romannet.py @@ -23,8 +23,8 @@ import h5py # Cython/C++ imports -from pymatmul_cpu import neural_layer_wrap as neural_layer_wrap_cpu -from pymatmul_cpu import transform_output_wrap as transform_output_wrap_cpu +from ..cutils.pymatmul_cpu import neural_layer_wrap as neural_layer_wrap_cpu +from ..cutils.pymatmul_cpu import transform_output_wrap as transform_output_wrap_cpu # Python imports from ..utils.baseclasses import ( @@ -40,7 +40,7 @@ # check for cupy and GPU version of pymatmul try: # Cython/C++ imports - from pymatmul import neural_layer_wrap, transform_output_wrap + from ..cutils.pymatmul import neural_layer_wrap, transform_output_wrap # Python imports import cupy as cp @@ -205,7 +205,7 @@ def _initialize_weights(self): self.num_layers = 0 # extract all necessary information from the file - with h5py.File(os.path.join(self.file_dir,self.data_file), "r") as fp: + with h5py.File(os.path.join(self.file_dir, self.data_file), "r") as fp: for key, 
value in fp.items(): if key == "reduced_basis": continue @@ -283,8 +283,12 @@ def get_amplitudes(self, a, p, e, xI, *args, specific_modes=None, **kwargs): self.max_init_len = input_len self.temp_mats = [ - self.xp.zeros((self.max_num * self.max_init_len,), dtype=self.xp.float64), - self.xp.zeros((self.max_num * self.max_init_len,), dtype=self.xp.float64), + self.xp.zeros( + (self.max_num * self.max_init_len,), dtype=self.xp.float64 + ), + self.xp.zeros( + (self.max_num * self.max_init_len,), dtype=self.xp.float64 + ), ] # the input is (y, e) @@ -298,10 +302,14 @@ def get_amplitudes(self, a, p, e, xI, *args, specific_modes=None, **kwargs): # setup arrays # teukolsky mode (final output) - teuk_modes = self.xp.zeros((input_len * self.num_teuk_modes,), dtype=self.xp.complex128) + teuk_modes = self.xp.zeros( + (input_len * self.num_teuk_modes,), dtype=self.xp.complex128 + ) # neural network output - nn_out_mat = self.xp.zeros((input_len * self.break_index,), dtype=self.xp.complex128) + nn_out_mat = self.xp.zeros( + (input_len * self.break_index,), dtype=self.xp.complex128 + ) # run the neural network for i, (weight, bias, run_relu) in enumerate( diff --git a/few/summation/aakwave.py b/few/summation/aakwave.py index d4586c20..26e2fd5f 100644 --- a/few/summation/aakwave.py +++ b/few/summation/aakwave.py @@ -30,11 +30,11 @@ import numpy as np # Cython/C++ imports -from pycpuAAK import pyWaveform as pyWaveform_cpu +from ..cutils.pycpuAAK import pyWaveform as pyWaveform_cpu # Attempt Cython imports of GPU functions try: - from pygpuAAK import pyWaveform as pyWaveform_gpu + from ..cutils.pygpuAAK import pyWaveform as pyWaveform_gpu except (ImportError, ModuleNotFoundError) as e: pass @@ -214,7 +214,7 @@ def sum( # convert to gpu if desired interp_coeffs_in = self.xp.transpose( self.xp.asarray(interp_coeffs), [2, 0, 1] - ).flatten() + ).flatten() # generator the waveform self.waveform_generator( @@ -238,6 +238,7 @@ def sum( return + class KerrAAKSummation(SummationBase, Pn5AAK, ParallelModuleBase): """Calculate an AAK waveform from an input trajectory. @@ -382,7 +383,9 @@ def sum( xI = Y_to_xI(a, p.copy(), e.copy(), Y.copy()) # these are dimensionless and in radians - OmegaPhi, OmegaTheta, OmegaR = get_fundamental_frequencies(a, p.copy(), e.copy(), xI.copy()) + OmegaPhi, OmegaTheta, OmegaR = get_fundamental_frequencies( + a, p.copy(), e.copy(), xI.copy() + ) # Set theta trajectories equal to eachother. 
OmegaTheta = OmegaPhi @@ -450,4 +453,3 @@ def sum( ) return - diff --git a/few/summation/fdinterp.py b/few/summation/fdinterp.py index 6487dc93..8e43e9a9 100644 --- a/few/summation/fdinterp.py +++ b/few/summation/fdinterp.py @@ -27,7 +27,7 @@ import numpy as np # Cython imports -from pyinterp_cpu import ( +from ..cutils.pyinterp_cpu import ( get_waveform_generic_fd_wrap as get_waveform_generic_fd_wrap_cpu, ) @@ -44,7 +44,9 @@ # Attempt Cython imports of GPU functions try: - from pyinterp import get_waveform_generic_fd_wrap as get_waveform_generic_fd_wrap_gpu + from ..cutils.pyinterp import ( + get_waveform_generic_fd_wrap as get_waveform_generic_fd_wrap_gpu, + ) except (ImportError, ModuleNotFoundError) as e: pass @@ -114,7 +116,8 @@ def searchsorted2d_vec(a, b, batch_size=-1, xp=None, **kwargs): m, n = a_temp.shape max_num = ( - self.xp.maximum(a_temp.max() - a_temp.min(), b_temp.max() - b_temp.min()) + 1 + self.xp.maximum(a_temp.max() - a_temp.min(), b_temp.max() - b_temp.min()) + + 1 ) r = max_num * self.xp.arange(a_temp.shape[0])[:, None] p = self.xp.searchsorted( @@ -146,7 +149,11 @@ def __init__(self, *args, **kwargs): @property def get_waveform_fd(self) -> callable: """GPU or CPU waveform generation.""" - return get_waveform_generic_fd_wrap_cpu if not self.use_gpu else get_waveform_generic_fd_wrap_gpu + return ( + get_waveform_generic_fd_wrap_cpu + if not self.use_gpu + else get_waveform_generic_fd_wrap_gpu + ) def attributes_FDInterpolatedModeSum(self): """ @@ -170,7 +177,7 @@ def sum( teuk_modes, ylms, phase_interp_t, - phase_interp_coeffs, + phase_interp_coeffs, m_arr, n_arr, M, @@ -251,12 +258,17 @@ def sum( # get fundamental frequencies across trajectory Omega_phi, Omega_theta, Omega_r = get_fundamental_frequencies( - a, p, e, xI, + a, + p, + e, + xI, ) # convert from dimensionless frequencies f_phi, f_r = ( - abs(self.xp.asarray(Omega_phi / (2 * np.pi * M * MTSUN_SI))), # positive frequency to be consistent with amplitude generator for retrograde inspirals # TODO get to the bottom of this! + abs( + self.xp.asarray(Omega_phi / (2 * np.pi * M * MTSUN_SI)) + ), # positive frequency to be consistent with amplitude generator for retrograde inspirals # TODO get to the bottom of this! 
self.xp.asarray(Omega_r / (2 * np.pi * M * MTSUN_SI)), ) @@ -406,9 +418,9 @@ def sum( axis=-1, ) - tmp_freqs_base_sorted_segs[ - check_turnover, fix_turnover_seg_ind - ] = tmp_segs_sorted_turnover[:, np.array([0, 2])] + tmp_freqs_base_sorted_segs[check_turnover, fix_turnover_seg_ind] = ( + tmp_segs_sorted_turnover[:, np.array([0, 2])] + ) except ValueError: pass @@ -440,9 +452,9 @@ def sum( df = self.frequency[1] - self.frequency[0] # figures out where in self.frequency each segment frequency falls - inds_check = self.xp.abs((tmp_freqs_base_sorted_segs - first_frequency) / df).astype( - int - ) + inds_check = self.xp.abs( + (tmp_freqs_base_sorted_segs - first_frequency) / df + ).astype(int) # start frequency index of each segment start_inds = (inds_check[:, :, 0].copy() + 1).astype(int) @@ -490,8 +502,8 @@ def sum( phase_interp_coeffs_in = self.xp.transpose( self.xp.asarray(phase_interp_coeffs), [2, 1, 0] - ).flatten() - + ).flatten() + # run GPU kernel self.get_waveform_fd( self.waveform, diff --git a/few/summation/interpolatedmodesum.py b/few/summation/interpolatedmodesum.py index dc205672..a392c9b5 100644 --- a/few/summation/interpolatedmodesum.py +++ b/few/summation/interpolatedmodesum.py @@ -27,8 +27,8 @@ import numpy as np # Cython imports -from pyinterp_cpu import interpolate_arrays_wrap as interpolate_arrays_wrap_cpu -from pyinterp_cpu import get_waveform_wrap as get_waveform_wrap_cpu +from ..cutils.pyinterp_cpu import interpolate_arrays_wrap as interpolate_arrays_wrap_cpu +from ..cutils.pyinterp_cpu import get_waveform_wrap as get_waveform_wrap_cpu # Python imports from ..utils.baseclasses import ( @@ -42,8 +42,8 @@ # Attempt Cython imports of GPU functions try: - from pyinterp import interpolate_arrays_wrap as interpolate_arrays_wrap_gpu - from pyinterp import get_waveform_wrap as get_waveform_wrap_gpu + from ..cutils.pyinterp import interpolate_arrays_wrap as interpolate_arrays_wrap_gpu + from ..cutils.pyinterp import get_waveform_wrap as get_waveform_wrap_gpu except (ImportError, ModuleNotFoundError) as e: pass @@ -151,11 +151,15 @@ def attributes_CubicSplineInterpolate(self): (4, length, ninterps). The 4 is the 4 spline coefficients. 
""" + @property def interpolate_arrays(self) -> callable: """GPU or CPU waveform generation.""" - return interpolate_arrays_wrap_cpu if not self.use_gpu else interpolate_arrays_wrap_gpu - + return ( + interpolate_arrays_wrap_cpu + if not self.use_gpu + else interpolate_arrays_wrap_gpu + ) @property def gpu_capability(self): @@ -419,7 +423,7 @@ def sum( phase_interp_coeffs_in = self.xp.transpose( self.xp.asarray(phase_interp_coeffs), [2, 0, 1] - ).flatten() + ).flatten() self.get_waveform( self.waveform, diff --git a/setup.py b/setup.py index 36cb7e41..e62b552b 100644 --- a/setup.py +++ b/setup.py @@ -258,21 +258,25 @@ def build_extensions(self): ) matmul_ext = Extension( - "pymatmul", sources=["src/matmul.cu", "src/pymatmul.pyx"], **gpu_extension + "few.cutils.pymatmul", + sources=["src/matmul.cu", "src/pymatmul.pyx"], + **gpu_extension, ) interp_ext = Extension( - "pyinterp", sources=["src/interpolate.cu", "src/pyinterp.pyx"], **gpu_extension + "few.cutils.pyinterp", + sources=["src/interpolate.cu", "src/pyinterp.pyx"], + **gpu_extension, ) gpuAAK_ext = Extension( - "pygpuAAK", + "few.cutils.pygpuAAK", sources=["src/gpuAAK.cu", "src/gpuAAKWrap.pyx"], **gpu_extension, ) gpu_amp_interp_2d_ext = Extension( - "pyAmpInterp2D", + "few.cutils.pyAmpInterp2D", sources=["src/AmpInterp2D.cu", "src/pyampinterp2D.pyx"], **gpu_extension, ) @@ -314,23 +318,25 @@ def build_extensions(self): cpu_extension["include_dirs"] += gsl_include matmul_cpu_ext = Extension( - "pymatmul_cpu", sources=["src/matmul.cpp", "src/pymatmul_cpu.pyx"], **cpu_extension + "few.cutils.pymatmul_cpu", + sources=["src/matmul.cpp", "src/pymatmul_cpu.pyx"], + **cpu_extension, ) interp_cpu_ext = Extension( - "pyinterp_cpu", + "few.cutils.pyinterp_cpu", sources=["src/interpolate.cpp", "src/pyinterp_cpu.pyx"], **cpu_extension, ) AAK_cpu_ext = Extension( - "pycpuAAK", + "few.cutils.pycpuAAK", sources=["src/gpuAAK.cpp", "src/gpuAAKWrap_cpu.pyx"], **cpu_extension, ) amp_interp_2d_ext = Extension( - "pyAmpInterp2D_cpu", + "few.cutils.pyAmpInterp2D_cpu", sources=["src/AmpInterp2D.cpp", "src/pyampinterp2D_cpu.pyx"], **cpu_extension, ) @@ -361,7 +367,16 @@ def build_extensions(self): version="1.5.5", url="https://github.com/mikekatz04/FastEMRIWaveforms", ext_modules=extensions, - packages=["few", "few.utils", "few.trajectory", "few.trajectory.ode", "few.amplitude", "few.summation", "few.waveform"], + packages=[ + "few", + "few.utils", + "few.cutils", + "few.trajectory", + "few.trajectory.ode", + "few.amplitude", + "few.summation", + "few.waveform", + ], classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: GNU General Public License (GPL)", @@ -377,5 +392,3 @@ def build_extensions(self): zip_safe=False, python_requires=">=3.6", ) - - diff --git a/src/matmul.cu b/src/matmul.cu index cd0de58e..3e60831e 100644 --- a/src/matmul.cu +++ b/src/matmul.cu @@ -28,17 +28,40 @@ using namespace std::chrono; // adjust imports for CUDA #ifdef __CUDACC__ #include "cublas_v2.h" -#else -#include #endif #define NUM_THREADS 256 +// matmul sub for cblas for backward compatibility +// FORTRAN STYLE COLUMN MAJOR (UGH) +template +CUDA_CALLABLE_MEMBER void our_cblas_gemm( + int m, + int n, + int k, + T *a, + T *b, + T *c) +{ + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + c[j * m + i] = 0.0; + for (int l = 0; l < k; l++) + { + c[j * m + i] += a[l * m + i] * b[j * k + l]; + } + } + } +} + // activation function // fixed 0.2 in leaky end -CUDA_CALLABLE_MEMBER double LeakyReLU(double x){ - double out = (x >= 0.0) ? 
x : 0.2*x; - return out; +CUDA_CALLABLE_MEMBER double LeakyReLU(double x) +{ + double out = (x >= 0.0) ? x : 0.2 * x; + return out; } // funciton for adding bias and then passing through activation @@ -46,8 +69,8 @@ CUDA_KERNEL void add_bias_relu(double *C, double *bias, int input_len, int dim2) { - // adjust loop boundaries in CUDA - #ifdef __CUDACC__ +// adjust loop boundaries in CUDA +#ifdef __CUDACC__ int start1 = blockIdx.x * blockDim.x + threadIdx.x; int end1 = input_len; int diff1 = blockDim.x * gridDim.x; @@ -56,7 +79,7 @@ void add_bias_relu(double *C, double *bias, int input_len, int dim2) int end2 = dim2; int diff2 = blockDim.y * gridDim.y; - #else +#else int start1 = 0; int end1 = input_len; @@ -66,30 +89,28 @@ void add_bias_relu(double *C, double *bias, int input_len, int dim2) int end2 = dim2; int diff2 = 1; - - #endif +#endif for (int i = start1; i < end1; i += diff1) { for (int j = start2; - j < end2; - j += diff2) + j < end2; + j += diff2) { - C[input_len*j + i] = LeakyReLU(C[input_len*j + i] + bias[j]); - + C[input_len * j + i] = LeakyReLU(C[input_len * j + i] + bias[j]); } } } // funciton for adding bias and WITHOUT passing through activation CUDA_KERNEL -void add_bias(double *C, double *bias, int input_len, int dim2){ - +void add_bias(double *C, double *bias, int input_len, int dim2) +{ - #ifdef __CUDACC__ +#ifdef __CUDACC__ int start1 = blockIdx.x * blockDim.x + threadIdx.x; int end1 = input_len; int diff1 = blockDim.x * gridDim.x; @@ -98,7 +119,7 @@ void add_bias(double *C, double *bias, int input_len, int dim2){ int end2 = dim2; int diff2 = blockDim.y * gridDim.y; - #else +#else int start1 = 0; int end1 = input_len; @@ -108,19 +129,18 @@ void add_bias(double *C, double *bias, int input_len, int dim2){ int end2 = dim2; int diff2 = 1; - - #endif +#endif for (int i = start1; i < end1; i += diff1) { for (int j = start2; - j < end2; - j += diff2) + j < end2; + j += diff2) { - C[input_len*j + i] = C[input_len*j + i] + bias[j]; + C[input_len * j + i] = C[input_len * j + i] + bias[j]; } } } @@ -128,169 +148,183 @@ void add_bias(double *C, double *bias, int input_len, int dim2){ // perform matrix calculations in blas for a neural network layer void neural_layer(double *mat_out, double *mat_in, double *weight, double *bias, int m, int k, int n, int run_relu) { - #ifdef __CUDACC__ - cublasHandle_t handle; - - char * status; - cublasStatus_t stat; - double alpha = 1.0; - double beta = 0.0; - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - exit(0); - } - - // matrix multiplication - stat = cublasDgemm(handle, - CUBLAS_OP_N, CUBLAS_OP_N, - m, n, k, - &alpha, - mat_in, m, - weight, k, - &beta, - mat_out, m); - - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - exit(0); - } - - stat = cublasDestroy(handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - exit(0); - } +#ifdef __CUDACC__ + cublasHandle_t handle; + + char *status; + cublasStatus_t stat; + double alpha = 1.0; + double beta = 0.0; + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + { + printf("CUBLAS initialization failed\n"); + exit(0); + } + + // matrix multiplication + stat = cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + m, n, k, + &alpha, + mat_in, m, + weight, k, + &beta, + mat_out, m); + + if (stat != CUBLAS_STATUS_SUCCESS) + { + printf("CUBLAS initialization failed\n"); + exit(0); + } + + stat = cublasDestroy(handle); + if (stat != CUBLAS_STATUS_SUCCESS) 
+ { + printf("CUBLAS initialization failed\n"); + exit(0); + } // Add the bias and activate, except in last layer do not activate - int num_threads = 256; - int num_blocks = std::ceil((m + num_threads -1)/num_threads); - dim3 gridDim(num_blocks, n); + int num_threads = 256; + int num_blocks = std::ceil((m + num_threads - 1) / num_threads); + dim3 gridDim(num_blocks, n); - if (run_relu){ - add_bias_relu<<>>(mat_out, bias, m, n); - } else { - add_bias<<>>(mat_out, bias, m, n); - } - cudaDeviceSynchronize(); - gpuErrchk(cudaGetLastError()); + if (run_relu) + { + add_bias_relu<<>>(mat_out, bias, m, n); + } + else + { + add_bias<<>>(mat_out, bias, m, n); + } + cudaDeviceSynchronize(); + gpuErrchk(cudaGetLastError()); - #else +#else // perform calculations in cblas - cblas_dgemm (CblasColMajor, - CblasNoTrans, CblasNoTrans, m, n, k, - 1.0, mat_in, m, weight, k, 0.0, mat_out, m); + our_cblas_gemm(m, n, k, + mat_in, weight, mat_out); - if (run_relu){ + if (run_relu) + { add_bias_relu(mat_out, bias, m, n); - } else { + } + else + { add_bias(mat_out, bias, m, n); } - #endif +#endif } // take the output of the neural net and conver it from (re_1,..,re_n, im_1, ..., im_n) // to imaginary CUDA_KERNEL void form_complex_output(cmplx *complex_output, double *nn_output, int input_len, int break_index, - double transform_factor_inv){ - - cmplx temp(0.0, 0.0); + double transform_factor_inv) +{ - #ifdef __CUDACC__ - int start1 = blockIdx.x * blockDim.x + threadIdx.x; - int end1 = input_len; - int diff1 = blockDim.x * gridDim.x; + cmplx temp(0.0, 0.0); - int start2 = blockIdx.y * blockDim.y + threadIdx.y; - int end2 = break_index; - int diff2 = blockDim.y * gridDim.y; +#ifdef __CUDACC__ + int start1 = blockIdx.x * blockDim.x + threadIdx.x; + int end1 = input_len; + int diff1 = blockDim.x * gridDim.x; - #else + int start2 = blockIdx.y * blockDim.y + threadIdx.y; + int end2 = break_index; + int diff2 = blockDim.y * gridDim.y; - int start1 = 0; - int end1 = input_len; - int diff1 = 1; +#else - int start2 = 0; - int end2 = break_index; - int diff2 = 1; + int start1 = 0; + int end1 = input_len; + int diff1 = 1; + int start2 = 0; + int end2 = break_index; + int diff2 = 1; - #endif - for (int i = start1; - i < end1; - i += diff1){ +#endif + for (int i = start1; + i < end1; + i += diff1) + { - for (int ind = start2; - ind < end2; - ind += diff2){ + for (int ind = start2; + ind < end2; + ind += diff2) + { // break index tells how many real entries or imaginary entries - temp = cmplx(nn_output[ind*input_len + i], nn_output[(break_index+ind)*input_len + i]); - complex_output[ind*input_len + i] = temp*transform_factor_inv; - } - } + temp = cmplx(nn_output[ind * input_len + i], nn_output[(break_index + ind) * input_len + i]); + complex_output[ind * input_len + i] = temp * transform_factor_inv; + } + } } // post neural net transform from reduced basis back to full amplitude basis void transform_output(cmplx *teuk_modes, cmplx *transform_matrix, cmplx *nn_output_mat, double *C, int input_len, int break_index, double transform_factor_inv, - int num_teuk_modes){ - - int m=input_len, k=break_index, n=num_teuk_modes; - #ifdef __CUDACC__ - int num_blocks = std::ceil((input_len + NUM_THREADS -1)/NUM_THREADS); - dim3 gridDim(num_blocks, break_index); - - // form the complex array of neural net outputs - form_complex_output<<>>(nn_output_mat, C, input_len, break_index, transform_factor_inv); - cudaDeviceSynchronize(); - gpuErrchk(cudaGetLastError()); - - - char * status; - cublasHandle_t handle; - cublasStatus_t stat; - 
cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); - cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0); - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - exit(0); - } - - // project back onto amplitude basis - stat = cublasZgemm(handle, - CUBLAS_OP_N, CUBLAS_OP_N, - m, n, k, - &alpha, - (cuDoubleComplex*)nn_output_mat, m, - (cuDoubleComplex*)transform_matrix, k, - &beta, - (cuDoubleComplex*)teuk_modes, m); - - status = _cudaGetErrorEnum(stat); + int num_teuk_modes) +{ + + int m = input_len, k = break_index, n = num_teuk_modes; +#ifdef __CUDACC__ + int num_blocks = std::ceil((input_len + NUM_THREADS - 1) / NUM_THREADS); + dim3 gridDim(num_blocks, break_index); + + // form the complex array of neural net outputs + form_complex_output<<>>(nn_output_mat, C, input_len, break_index, transform_factor_inv); + cudaDeviceSynchronize(); + gpuErrchk(cudaGetLastError()); + + char *status; + cublasHandle_t handle; + cublasStatus_t stat; + cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); + cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0); + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + { + printf("CUBLAS initialization failed\n"); + exit(0); + } + + // project back onto amplitude basis + stat = cublasZgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + m, n, k, + &alpha, + (cuDoubleComplex *)nn_output_mat, m, + (cuDoubleComplex *)transform_matrix, k, + &beta, + (cuDoubleComplex *)teuk_modes, m); + + status = _cudaGetErrorEnum(stat); cudaDeviceSynchronize(); stat = cublasDestroy(handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - exit(0); - } + if (stat != CUBLAS_STATUS_SUCCESS) + { + exit(0); + } - #else +#else - const cmplx alpha(1.0, 0.0); - const cmplx beta(0.0, 0.0); + const cmplx alpha(1.0, 0.0); + const cmplx beta(0.0, 0.0); // form the complex array of neural net outputs - form_complex_output(nn_output_mat, C, input_len, break_index, transform_factor_inv); + form_complex_output(nn_output_mat, C, input_len, break_index, transform_factor_inv); - // transform to amplitude basis - cblas_zgemm (CblasColMajor, - CblasNoTrans, CblasNoTrans, m, n, k, - (void*)&alpha, (void*)nn_output_mat, m, (void*)transform_matrix, k, (void*)&beta, (void*)teuk_modes, m); - #endif + // transform to amplitude basis + our_cblas_gemm(m, n, k, nn_output_mat, transform_matrix, teuk_modes); + // cblas_zgemm(CblasColMajor, + // CblasNoTrans, CblasNoTrans, m, n, k, + // (void *)&alpha, (void *)nn_output_mat, m, (void *)transform_matrix, k, (void *)&beta, (void *)teuk_modes, m); + +#endif }
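
Note on the import changes above: every compiled Cython/CUDA extension is now built into the `few.cutils` subpackage (see the new `Extension` names in `setup.py`) and imported relatively, e.g. `from ..cutils.pymatmul_cpu import ...`, with the GPU variant still guarded by try/except so CPU-only installs keep working. Below is a minimal sketch of that pattern from outside the package; the `pick_backend` helper is illustrative only (not part of the diff) and it assumes `few` is installed with its compiled extensions.

# Minimal sketch of the GPU/CPU backend-selection pattern used throughout this diff.
# Module paths follow the new "few.cutils.*" Extension names in setup.py; the helper
# function is an illustration only, not part of the package.
from few.cutils.pymatmul_cpu import neural_layer_wrap as neural_layer_wrap_cpu

try:
    # GPU backend is only importable when the CUDA extensions and cupy are present.
    from few.cutils.pymatmul import neural_layer_wrap as neural_layer_wrap_gpu
    import cupy as cp
except (ImportError, ModuleNotFoundError):
    neural_layer_wrap_gpu = None

def pick_backend(use_gpu: bool = False):
    """Return the GPU kernel when requested and available, otherwise the CPU one."""
    if use_gpu and neural_layer_wrap_gpu is not None:
        return neural_layer_wrap_gpu
    return neural_layer_wrap_cpu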
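
The `our_cblas_gemm` template added to `src/matmul.cu` replaces the `cblas_dgemm`/`cblas_zgemm` calls with a plain column-major (FORTRAN-ordered) product C(m x n) = A(m x k) * B(k x n), using the indexing `c[j*m + i] = sum_l a[l*m + i] * b[j*k + l]`. The following NumPy reference reproduces the same indexing; it is only an external sanity check for the kernel, not part of the package.

import numpy as np

def our_cblas_gemm_reference(m, n, k, a_flat, b_flat):
    """Column-major reference for our_cblas_gemm in src/matmul.cu:
    c[j*m + i] = sum over l of a[l*m + i] * b[j*k + l]."""
    a = a_flat.reshape(k, m).T   # flat column-major storage -> (m x k) matrix
    b = b_flat.reshape(n, k).T   # flat column-major storage -> (k x n) matrix
    return (a @ b).T.ravel()     # (m x n) product back to flat column-major

# self-check against the explicit index arithmetic used in the C++ kernel
m, n, k = 3, 4, 2
rng = np.random.default_rng(0)
a_flat, b_flat = rng.standard_normal(m * k), rng.standard_normal(k * n)
c_ref = our_cblas_gemm_reference(m, n, k, a_flat, b_flat)
c_loop = np.zeros(m * n)
for i in range(m):
    for j in range(n):
        c_loop[j * m + i] = sum(a_flat[l * m + i] * b_flat[j * k + l] for l in range(k))
assert np.allclose(c_ref, c_loop)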