From 94891c10c27f38d9b73be532a7e09a4d38cf4003 Mon Sep 17 00:00:00 2001 From: sflorescu <43118429+sflorescu@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:09:36 +0100 Subject: [PATCH 1/7] Added support for mex wrapper for prepare A matrix --- matlab/mex_compile_prepare_a_matrix.m | 10 +++ matlab/prepare_a_matrix_mex.cu | 31 ++++++++ src/prepare_a_matrix.cu | 109 ++++++++++++++++++++++++++ src/prepare_a_matrix.h | 13 +++ 4 files changed, 163 insertions(+) create mode 100644 matlab/mex_compile_prepare_a_matrix.m create mode 100644 matlab/prepare_a_matrix_mex.cu create mode 100644 src/prepare_a_matrix.h diff --git a/matlab/mex_compile_prepare_a_matrix.m b/matlab/mex_compile_prepare_a_matrix.m new file mode 100644 index 0000000..52de9bb --- /dev/null +++ b/matlab/mex_compile_prepare_a_matrix.m @@ -0,0 +1,10 @@ +% mex_compile_prepare_a_matrix.m +% Adjust include and library paths as needed +mexcuda ... + -I'../include' ... % for tcbf.h if needed + -I'../src' ... % for prepare_a_matrix.h + -I'/path/to/ccglib/include' ... + -I'/path/to/cxxopts/include' ... + -L'/path/to/ccglib/lib' -lccglib ... + -L'/path/to/cuda/lib64' -lcudart -lcuda ... + prepare_a_matrix_mex.cu ../src/prepare_a_matrix.cu diff --git a/matlab/prepare_a_matrix_mex.cu b/matlab/prepare_a_matrix_mex.cu new file mode 100644 index 0000000..adee42c --- /dev/null +++ b/matlab/prepare_a_matrix_mex.cu @@ -0,0 +1,31 @@ +#include "../src/prepare_a_matrix.h" // include the header +#include "mex.h" + +void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + if (nrhs != 5) { + mexErrMsgIdAndTxt("beamformer_mex:InvalidInput", + "Five inputs required: a_matrix_in, a_matrix_out, pixels, samples, device_id"); + } + + // Extract arguments from MATLAB inputs + char *a_matrix_in_c = mxArrayToString(prhs[0]); + char *a_matrix_out_c = mxArrayToString(prhs[1]); + double pixels_d = mxGetScalar(prhs[2]); + double samples_d = mxGetScalar(prhs[3]); + double device_id_d = mxGetScalar(prhs[4]); + + std::string a_matrix_in(a_matrix_in_c); + std::string a_matrix_out(a_matrix_out_c); + size_t pixels = static_cast(pixels_d); + size_t samples = static_cast(samples_d); + unsigned device_id = static_cast(device_id_d); + + mxFree(a_matrix_in_c); + mxFree(a_matrix_out_c); + + int status = prepareAMatrix(a_matrix_in, a_matrix_out, pixels, samples, device_id); + + if (nlhs > 0) { + plhs[0] = mxCreateDoubleScalar(static_cast(status)); + } +} diff --git a/src/prepare_a_matrix.cu b/src/prepare_a_matrix.cu index ca05c6e..6fbc0db 100644 --- a/src/prepare_a_matrix.cu +++ b/src/prepare_a_matrix.cu @@ -7,6 +7,8 @@ #include #include +#include "prepare_a_matrix.h" + inline size_t align(size_t a, size_t b) { return b * ccglib::helper::ceildiv(a, b); } cxxopts::Options create_commandline_parser(const char *argv[]) { @@ -47,6 +49,113 @@ cxxopts::ParseResult parse_commandline(int argc, const char *argv[]) { } } +// This function will run the beamformer given the parameters +int prepareAMatrix(const std::string &path_a_matrix_in, const std::string &path_a_matrix_out, size_t pixels, + size_t samples, unsigned device_id) { + const size_t complex = 2; + + cu::init(); + cu::Device device(device_id); + cu::Context context(CU_CTX_BLOCKING_SYNC, device); + cu::Stream stream; + + // tile size in beams, frames, samples axes + dim3 tile_sizes = ccglib::mma::GEMM::GetDimensions(ccglib::mma::int1, ccglib::mma::opt); + + const size_t pixels_padded = align(pixels, tile_sizes.x); + const size_t samples_padded = align(samples, tile_sizes.z); + + // factor 2 for complex + const size_t bytes_a_matrix = complex * pixels_padded * samples_padded; + const size_t bytes_a_matrix_packed = bytes_a_matrix / CHAR_BIT; + + // Read data from disk + // row-by-row to handle padding + cu::HostMemory a_matrix_host(bytes_a_matrix); + std::ifstream in(path_a_matrix_in, std::ios::binary | std::ios::in); + if (!in) { + std::cerr << "Failed to open input file: " + path_a_matrix_in << std::endl; + return -1; + } + for (size_t c = 0; c < complex; c++) { + for (size_t pixel = 0; pixel < pixels; pixel++) { + in.read(static_cast(a_matrix_host) + c * pixels_padded * samples_padded + pixel * samples_padded, + samples); + } + } + in.close(); + + // conjugate + std::cout << "Conjugate" << std::endl; +#pragma omp parallel for collapse(2) + for (size_t pixel = 0; pixel < pixels; pixel++) { + for (size_t sample = 0; sample < samples; sample++) { + const size_t idx = pixels_padded * samples_padded + pixel * samples_padded + sample; + static_cast(a_matrix_host)[idx] = 1 - static_cast(a_matrix_host)[idx]; + } + } + + // Device memory for output packed data + cu::DeviceMemory d_a_matrix_packed(bytes_a_matrix_packed); + d_a_matrix_packed.zero(bytes_a_matrix_packed); + // Device memory for transposed data + cu::DeviceMemory d_a_transposed(bytes_a_matrix_packed); + + // chunk of input data on device in case it doesn't fit in GPU memory + // get available GPU memory (after allocating other device memory) + // use at most 80% of available memory + size_t bytes_per_chunk = static_cast(0.8 * context.getFreeMemory()); + // packing kernel uses at most 1024 threads per block + bytes_per_chunk = 1024 * (bytes_per_chunk / 1024); + if (bytes_per_chunk > bytes_a_matrix) { + bytes_per_chunk = bytes_a_matrix; + } + cu::DeviceMemory d_a_chunk(bytes_per_chunk); + d_a_chunk.zero(bytes_per_chunk); + + // process, complex-first for now + std::cout << "Packing" << std::endl; + for (size_t byte_start = 0; byte_start < bytes_a_matrix; byte_start += bytes_per_chunk) { + size_t local_nbytes = bytes_per_chunk; + // correct nbytes in last chunk + if (byte_start + local_nbytes > bytes_a_matrix) { + local_nbytes = bytes_a_matrix - byte_start; + // ensure any padded region is set to zero + d_a_chunk.zero(bytes_per_chunk); + } + // copy chunk to device + stream.memcpyHtoDAsync(d_a_chunk, static_cast(a_matrix_host) + byte_start, local_nbytes); + // get device memory slice for this chunk in a_packed + cu::DeviceMemory d_a_packed_chunk(d_a_matrix_packed, byte_start / CHAR_BIT, local_nbytes / CHAR_BIT); + // run packing kernel + ccglib::packing::Packing packing(local_nbytes, device, stream); + packing.Run(d_a_chunk, d_a_packed_chunk, ccglib::packing::pack, ccglib::packing::complex_first); + } + + // transpose + std::cout << "Transpose" << std::endl; + ccglib::transpose::Transpose transpose(1, pixels_padded, samples_padded, tile_sizes.x, tile_sizes.z, 1, device, + stream); + transpose.Run(d_a_matrix_packed, d_a_transposed); + + // copy output to host + std::cout << "Copy to host" << std::endl; + cu::HostMemory a_matrix_output(bytes_a_matrix_packed); + stream.memcpyDtoHAsync(a_matrix_output, d_a_transposed, bytes_a_matrix_packed); + stream.synchronize(); + + // write to disk + std::cout << "Write to disk" << std::endl; + std::ofstream out(path_a_matrix_out, std::ios::binary | std::ios::out); + if (!out) { + std::cerr << "Failed to open output file: " + path_a_matrix_out << std::endl; + return -1; + } + out.write(static_cast(a_matrix_output), bytes_a_matrix_packed); + + return 0; // success +} + int main(int argc, const char *argv[]) { cxxopts::ParseResult cmdline = parse_commandline(argc, argv); const std::string path_a_matrix_in = cmdline["a_matrix_in"].as(); diff --git a/src/prepare_a_matrix.h b/src/prepare_a_matrix.h new file mode 100644 index 0000000..8d30f7b --- /dev/null +++ b/src/prepare_a_matrix.h @@ -0,0 +1,13 @@ +#ifndef PREPARE_A_MATRIX_H +#define PREPARE_A_MATRIX_H + +#include +#include // for size_t + +int prepareAMatrix(const std::string &path_a_matrix_in, + const std::string &path_a_matrix_out, + size_t pixels, + size_t samples, + unsigned device_id); + +#endif // PREPARE_A_MATRIX_H From 999de820a81a555991670880ef526dc4ad19b809 Mon Sep 17 00:00:00 2001 From: sflorescu <43118429+sflorescu@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:23:26 +0100 Subject: [PATCH 2/7] Added check if prepare runs --- matlab/prepare_a_matrix_mex.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/matlab/prepare_a_matrix_mex.cu b/matlab/prepare_a_matrix_mex.cu index adee42c..5d8c658 100644 --- a/matlab/prepare_a_matrix_mex.cu +++ b/matlab/prepare_a_matrix_mex.cu @@ -3,11 +3,10 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { if (nrhs != 5) { - mexErrMsgIdAndTxt("beamformer_mex:InvalidInput", + mexErrMsgIdAndTxt("prepare_a_matrix_mex:InvalidInput", "Five inputs required: a_matrix_in, a_matrix_out, pixels, samples, device_id"); } - // Extract arguments from MATLAB inputs char *a_matrix_in_c = mxArrayToString(prhs[0]); char *a_matrix_out_c = mxArrayToString(prhs[1]); double pixels_d = mxGetScalar(prhs[2]); @@ -25,7 +24,13 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { int status = prepareAMatrix(a_matrix_in, a_matrix_out, pixels, samples, device_id); + if (status == 0) { + mexPrintf("prepareAMatrix completed successfully.\n"); + } else { + mexPrintf("prepareAMatrix failed with status code: %d\n", status); + } + if (nlhs > 0) { - plhs[0] = mxCreateDoubleScalar(static_cast(status)); + plhs[0] = mxCreateDoubleScalar((double)status); } } From ff5e666c96a2a548cfba93061cdcc580df775aa7 Mon Sep 17 00:00:00 2001 From: Leon Oostrum Date: Fri, 13 Dec 2024 15:24:04 +0100 Subject: [PATCH 3/7] Fix compilation, add mex for beamforming --- matlab/beamform_mex.cu | 38 +++++++++++++++++++++++++++ matlab/mex_compile_beamform.m | 8 ++++++ matlab/mex_compile_prepare_a_matrix.m | 12 ++++----- 3 files changed, 51 insertions(+), 7 deletions(-) create mode 100644 matlab/beamform_mex.cu create mode 100644 matlab/mex_compile_beamform.m diff --git a/matlab/beamform_mex.cu b/matlab/beamform_mex.cu new file mode 100644 index 0000000..d5528ac --- /dev/null +++ b/matlab/beamform_mex.cu @@ -0,0 +1,38 @@ +#include "tcbf.h" // include the header +#include "mex.h" + +void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + if (nrhs != 7) { + mexErrMsgIdAndTxt("beamform:InvalidInput", + "Seven inputs required: a_matrix, rf, bf, pixels, frames, samples, device_id"); + } + + std::string path_a_matrix = mxArrayToString(prhs[0]); + std::string path_rf = mxArrayToString(prhs[1]); + std::string path_bf = mxArrayToString(prhs[2]); + size_t pixels = mxGetScalar(prhs[3]); + size_t frames = mxGetScalar(prhs[4]); + size_t samples = mxGetScalar(prhs[5]); + unsigned device_id = mxGetScalar(prhs[6]); + + cu::init(); + cu::Device device(device_id); + cu::Context context(CU_CTX_BLOCKING_SYNC, device); + cu::Stream stream; + + tcbf::Beamformer beamformer(pixels, frames, samples, device, stream); + cu::HostMemory RF(2 * frames * samples); + cu::HostMemory BF(2 * pixels * frames * sizeof(unsigned)); + + beamformer.read_A_matrix(path_a_matrix); + beamformer.read_RF(RF, path_rf); + beamformer.process(RF, BF); + beamformer.write_BF(BF, path_bf); + + + int status = 0; + + if (nlhs > 0) { + plhs[0] = mxCreateDoubleScalar((double)status); + } +} diff --git a/matlab/mex_compile_beamform.m b/matlab/mex_compile_beamform.m new file mode 100644 index 0000000..bd36d9e --- /dev/null +++ b/matlab/mex_compile_beamform.m @@ -0,0 +1,8 @@ +% mex_compile_prepare_a_matrix.m +% Adjust include and library paths as needed +mexcuda -v ... + -L"/usr/local/cuda/lib64" -lcudart -lcuda ... + 'NVCCFLAGS=-gencode=arch=compute_89,code=sm_89' ... + -ltcbf -lccglib ... + beamform_mex.cu + diff --git a/matlab/mex_compile_prepare_a_matrix.m b/matlab/mex_compile_prepare_a_matrix.m index 52de9bb..8b0da29 100644 --- a/matlab/mex_compile_prepare_a_matrix.m +++ b/matlab/mex_compile_prepare_a_matrix.m @@ -1,10 +1,8 @@ % mex_compile_prepare_a_matrix.m % Adjust include and library paths as needed -mexcuda ... - -I'../include' ... % for tcbf.h if needed - -I'../src' ... % for prepare_a_matrix.h - -I'/path/to/ccglib/include' ... - -I'/path/to/cxxopts/include' ... - -L'/path/to/ccglib/lib' -lccglib ... - -L'/path/to/cuda/lib64' -lcudart -lcuda ... +mexcuda -v ... + -L"/usr/local/cuda/lib64" -lcudart -lcuda ... + 'NVCCFLAGS=-gencode=arch=compute_89,code=sm_89' ... + -lccglib ... prepare_a_matrix_mex.cu ../src/prepare_a_matrix.cu + From fa216072f3755646d4a4e0118536d0073327e22d Mon Sep 17 00:00:00 2001 From: sflorescu <43118429+sflorescu@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:31:24 +0100 Subject: [PATCH 4/7] Added return of BF back to matlab --- matlab/beamform_mex.cu | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/matlab/beamform_mex.cu b/matlab/beamform_mex.cu index d5528ac..bbd7778 100644 --- a/matlab/beamform_mex.cu +++ b/matlab/beamform_mex.cu @@ -1,5 +1,5 @@ -#include "tcbf.h" // include the header #include "mex.h" +#include "tcbf.h" // include the header void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { if (nrhs != 7) { @@ -29,6 +29,15 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { beamformer.process(RF, BF); beamformer.write_BF(BF, path_bf); + mwSize dims[2] = {2 * pixels, frames}; + mxArray *outArray = mxCreateNumericArray(2, dims, mxINT32_CLASS, mxREAL); + int32_t *outData = static_cast(mxGetData(outArray)); + + // Copy the data + std::memcpy(outData, BF, totalElements * sizeof(int32_t)); + + // Assign output + plhs[0] = outArray; int status = 0; From 9fdee0e20d33009626ea62248f73506c13d00895 Mon Sep 17 00:00:00 2001 From: sflorescu <43118429+sflorescu@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:33:30 +0100 Subject: [PATCH 5/7] Editted status to work --- matlab/beamform_mex.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matlab/beamform_mex.cu b/matlab/beamform_mex.cu index bbd7778..cdb7147 100644 --- a/matlab/beamform_mex.cu +++ b/matlab/beamform_mex.cu @@ -41,7 +41,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { int status = 0; - if (nlhs > 0) { - plhs[0] = mxCreateDoubleScalar((double)status); + if (nlhs > 1) { + plhs[1] = mxCreateDoubleScalar((double)status); } } From 573f35e05937914cb7f93dcf68db6f281415d67e Mon Sep 17 00:00:00 2001 From: sflorescu <43118429+sflorescu@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:36:14 +0100 Subject: [PATCH 6/7] Defined totalElements --- matlab/beamform_mex.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matlab/beamform_mex.cu b/matlab/beamform_mex.cu index cdb7147..264f07a 100644 --- a/matlab/beamform_mex.cu +++ b/matlab/beamform_mex.cu @@ -34,7 +34,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { int32_t *outData = static_cast(mxGetData(outArray)); // Copy the data - std::memcpy(outData, BF, totalElements * sizeof(int32_t)); + std::memcpy(outData, BF, 2 * frames * samples * sizeof(int32_t)); // Assign output plhs[0] = outArray; From ce9fa14aae00f7767cc596b413c975991280565e Mon Sep 17 00:00:00 2001 From: Leon Oostrum Date: Fri, 13 Dec 2024 16:10:16 +0100 Subject: [PATCH 7/7] make BF 3D, fix sizes --- matlab/beamform_mex.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/matlab/beamform_mex.cu b/matlab/beamform_mex.cu index 264f07a..d1ae88e 100644 --- a/matlab/beamform_mex.cu +++ b/matlab/beamform_mex.cu @@ -1,3 +1,5 @@ +#include +#include #include "mex.h" #include "tcbf.h" // include the header @@ -22,19 +24,19 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { tcbf::Beamformer beamformer(pixels, frames, samples, device, stream); cu::HostMemory RF(2 * frames * samples); - cu::HostMemory BF(2 * pixels * frames * sizeof(unsigned)); + cu::HostMemory BF(2 * pixels * frames * sizeof(int32_t)); beamformer.read_A_matrix(path_a_matrix); beamformer.read_RF(RF, path_rf); beamformer.process(RF, BF); beamformer.write_BF(BF, path_bf); - mwSize dims[2] = {2 * pixels, frames}; - mxArray *outArray = mxCreateNumericArray(2, dims, mxINT32_CLASS, mxREAL); - int32_t *outData = static_cast(mxGetData(outArray)); + mwSize dims[3] = {pixels, frames, 2}; + mxArray *outArray = mxCreateNumericArray(3, dims, mxINT32_CLASS, mxREAL); + int32_t *outData = reinterpret_cast(mxGetData(outArray)); // Copy the data - std::memcpy(outData, BF, 2 * frames * samples * sizeof(int32_t)); + std::memcpy(outData, BF, 2 * frames * pixels * sizeof(int32_t)); // Assign output plhs[0] = outArray;