From 94891c10c27f38d9b73be532a7e09a4d38cf4003 Mon Sep 17 00:00:00 2001
From: sflorescu <43118429+sflorescu@users.noreply.github.com>
Date: Fri, 13 Dec 2024 14:09:36 +0100
Subject: [PATCH 1/7] Added support for mex wrapper for prepare A matrix

---
 matlab/mex_compile_prepare_a_matrix.m |  10 +++
 matlab/prepare_a_matrix_mex.cu        |  31 ++++++++
 src/prepare_a_matrix.cu               | 109 ++++++++++++++++++++++++++
 src/prepare_a_matrix.h                |  13 +++
 4 files changed, 163 insertions(+)
 create mode 100644 matlab/mex_compile_prepare_a_matrix.m
 create mode 100644 matlab/prepare_a_matrix_mex.cu
 create mode 100644 src/prepare_a_matrix.h
diff --git a/matlab/mex_compile_prepare_a_matrix.m b/matlab/mex_compile_prepare_a_matrix.m
new file mode 100644
index 0000000..52de9bb
--- /dev/null
+++ b/matlab/mex_compile_prepare_a_matrix.m
@@ -0,0 +1,10 @@
+% mex_compile_prepare_a_matrix.m
+% Adjust include and library paths as needed
+mexcuda ...
+    -I'../include' ...            % for tcbf.h if needed
+    -I'../src' ...                % for prepare_a_matrix.h
+    -I'/path/to/ccglib/include' ...
+    -I'/path/to/cxxopts/include' ...
+    -L'/path/to/ccglib/lib' -lccglib ...
+    -L'/path/to/cuda/lib64' -lcudart -lcuda ...
+    prepare_a_matrix_mex.cu ../src/prepare_a_matrix.cu
diff --git a/matlab/prepare_a_matrix_mex.cu b/matlab/prepare_a_matrix_mex.cu
new file mode 100644
index 0000000..adee42c
--- /dev/null
+++ b/matlab/prepare_a_matrix_mex.cu
@@ -0,0 +1,31 @@
+#include "../src/prepare_a_matrix.h"  // include the header
+#include "mex.h"
+
+void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  if (nrhs != 5) {
+    mexErrMsgIdAndTxt("beamformer_mex:InvalidInput",
+                      "Five inputs required: a_matrix_in, a_matrix_out, pixels, samples, device_id");
+  }
+
+  // Extract arguments from MATLAB inputs
+  char *a_matrix_in_c = mxArrayToString(prhs[0]);
+  char *a_matrix_out_c = mxArrayToString(prhs[1]);
+  double pixels_d = mxGetScalar(prhs[2]);
+  double samples_d = mxGetScalar(prhs[3]);
+  double device_id_d = mxGetScalar(prhs[4]);
+
+  std::string a_matrix_in(a_matrix_in_c);
+  std::string a_matrix_out(a_matrix_out_c);
+  size_t pixels = static_cast<size_t>(pixels_d);
+  size_t samples = static_cast<size_t>(samples_d);
+  unsigned device_id = static_cast<unsigned>(device_id_d);
+
+  mxFree(a_matrix_in_c);
+  mxFree(a_matrix_out_c);
+
+  int status = prepareAMatrix(a_matrix_in, a_matrix_out, pixels, samples, device_id);
+
+  if (nlhs > 0) {
+    plhs[0] = mxCreateDoubleScalar(static_cast<double>(status));
+  }
+}
diff --git a/src/prepare_a_matrix.cu b/src/prepare_a_matrix.cu
index ca05c6e..6fbc0db 100644
--- a/src/prepare_a_matrix.cu
+++ b/src/prepare_a_matrix.cu
@@ -7,6 +7,8 @@
 #include <fstream>
 #include <iostream>
 
+#include "prepare_a_matrix.h"
+
 inline size_t align(size_t a, size_t b) { return b * ccglib::helper::ceildiv(a, b); }
 
 cxxopts::Options create_commandline_parser(const char *argv[]) {
@@ -47,6 +49,113 @@ cxxopts::ParseResult parse_commandline(int argc, const char *argv[]) {
   }
 }
 
+// This function will run the beamformer given the parameters
+int prepareAMatrix(const std::string &path_a_matrix_in, const std::string &path_a_matrix_out, size_t pixels,
+                   size_t samples, unsigned device_id) {
+  const size_t complex = 2;
+
+  cu::init();
+  cu::Device device(device_id);
+  cu::Context context(CU_CTX_BLOCKING_SYNC, device);
+  cu::Stream stream;
+
+  // tile size in beams, frames, samples axes
+  dim3 tile_sizes = ccglib::mma::GEMM::GetDimensions(ccglib::mma::int1, ccglib::mma::opt);
+
+  const size_t pixels_padded = align(pixels, tile_sizes.x);
+  const size_t samples_padded = align(samples, tile_sizes.z);
+
+  // factor 2 for complex
+  const size_t bytes_a_matrix = complex * pixels_padded * samples_padded;
+  const size_t bytes_a_matrix_packed = bytes_a_matrix / CHAR_BIT;
+
+  // Read data from disk
+  // row-by-row to handle padding
+  cu::HostMemory a_matrix_host(bytes_a_matrix);
+  std::ifstream in(path_a_matrix_in, std::ios::binary | std::ios::in);
+  if (!in) {
+    std::cerr << "Failed to open input file: " + path_a_matrix_in << std::endl;
+    return -1;
+  }
+  for (size_t c = 0; c < complex; c++) {
+    for (size_t pixel = 0; pixel < pixels; pixel++) {
+      in.read(static_cast<char *>(a_matrix_host) + c * pixels_padded * samples_padded + pixel * samples_padded,
+              samples);
+    }
+  }
+  in.close();
+
+  // conjugate
+  std::cout << "Conjugate" << std::endl;
+#pragma omp parallel for collapse(2)
+  for (size_t pixel = 0; pixel < pixels; pixel++) {
+    for (size_t sample = 0; sample < samples; sample++) {
+      const size_t idx = pixels_padded * samples_padded + pixel * samples_padded + sample;
+      static_cast<char *>(a_matrix_host)[idx] = 1 - static_cast<char *>(a_matrix_host)[idx];
+    }
+  }
+
+  // Device memory for output packed data
+  cu::DeviceMemory d_a_matrix_packed(bytes_a_matrix_packed);
+  d_a_matrix_packed.zero(bytes_a_matrix_packed);
+  // Device memory for transposed data
+  cu::DeviceMemory d_a_transposed(bytes_a_matrix_packed);
+
+  // chunk of input data on device in case it doesn't fit in GPU memory
+  // get available GPU memory (after allocating other device memory)
+  // use at most 80% of available memory
+  size_t bytes_per_chunk = static_cast<size_t>(0.8 * context.getFreeMemory());
+  // packing kernel uses at most 1024 threads per block
+  bytes_per_chunk = 1024 * (bytes_per_chunk / 1024);
+  if (bytes_per_chunk > bytes_a_matrix) {
+    bytes_per_chunk = bytes_a_matrix;
+  }
+  cu::DeviceMemory d_a_chunk(bytes_per_chunk);
+  d_a_chunk.zero(bytes_per_chunk);
+
+  // process, complex-first for now
+  std::cout << "Packing" << std::endl;
+  for (size_t byte_start = 0; byte_start < bytes_a_matrix; byte_start += bytes_per_chunk) {
+    size_t local_nbytes = bytes_per_chunk;
+    // correct nbytes in last chunk
+    if (byte_start + local_nbytes > bytes_a_matrix) {
+      local_nbytes = bytes_a_matrix - byte_start;
+      // ensure any padded region is set to zero
+      d_a_chunk.zero(bytes_per_chunk);
+    }
+    // copy chunk to device
+    stream.memcpyHtoDAsync(d_a_chunk, static_cast<char *>(a_matrix_host) + byte_start, local_nbytes);
+    // get device memory slice for this chunk in a_packed
+    cu::DeviceMemory d_a_packed_chunk(d_a_matrix_packed, byte_start / CHAR_BIT, local_nbytes / CHAR_BIT);
+    // run packing kernel
+    ccglib::packing::Packing packing(local_nbytes, device, stream);
+    packing.Run(d_a_chunk, d_a_packed_chunk, ccglib::packing::pack, ccglib::packing::complex_first);
+  }
+
+  // transpose
+  std::cout << "Transpose" << std::endl;
+  ccglib::transpose::Transpose transpose(1, pixels_padded, samples_padded, tile_sizes.x, tile_sizes.z, 1, device,
+                                         stream);
+  transpose.Run(d_a_matrix_packed, d_a_transposed);
+
+  // copy output to host
+  std::cout << "Copy to host" << std::endl;
+  cu::HostMemory a_matrix_output(bytes_a_matrix_packed);
+  stream.memcpyDtoHAsync(a_matrix_output, d_a_transposed, bytes_a_matrix_packed);
+  stream.synchronize();
+
+  // write to disk
+  std::cout << "Write to disk" << std::endl;
+  std::ofstream out(path_a_matrix_out, std::ios::binary | std::ios::out);
+  if (!out) {
+    std::cerr << "Failed to open output file: " + path_a_matrix_out << std::endl;
+    return -1;
+  }
+  out.write(static_cast<char *>(a_matrix_output), bytes_a_matrix_packed);
+
+  return 0;  // success
+}
+
 int main(int argc, const char *argv[]) {
   cxxopts::ParseResult cmdline = parse_commandline(argc, argv);
   const std::string path_a_matrix_in = cmdline["a_matrix_in"].as<std::string>();
diff --git a/src/prepare_a_matrix.h b/src/prepare_a_matrix.h
new file mode 100644
index 0000000..8d30f7b
--- /dev/null
+++ b/src/prepare_a_matrix.h
@@ -0,0 +1,13 @@
+#ifndef PREPARE_A_MATRIX_H
+#define PREPARE_A_MATRIX_H
+
+#include <string>
+#include <cstddef> // for size_t
+
+int prepareAMatrix(const std::string &path_a_matrix_in,
+                   const std::string &path_a_matrix_out,
+                   size_t pixels,
+                   size_t samples,
+                   unsigned device_id);
+
+#endif // PREPARE_A_MATRIX_H

From 999de820a81a555991670880ef526dc4ad19b809 Mon Sep 17 00:00:00 2001
From: sflorescu <43118429+sflorescu@users.noreply.github.com>
Date: Fri, 13 Dec 2024 14:23:26 +0100
Subject: [PATCH 2/7] Added check if prepare runs

---
 matlab/prepare_a_matrix_mex.cu | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/matlab/prepare_a_matrix_mex.cu b/matlab/prepare_a_matrix_mex.cu
index adee42c..5d8c658 100644
--- a/matlab/prepare_a_matrix_mex.cu
+++ b/matlab/prepare_a_matrix_mex.cu
@@ -3,11 +3,10 @@
 
 void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
   if (nrhs != 5) {
-    mexErrMsgIdAndTxt("beamformer_mex:InvalidInput",
+    mexErrMsgIdAndTxt("prepare_a_matrix_mex:InvalidInput",
                       "Five inputs required: a_matrix_in, a_matrix_out, pixels, samples, device_id");
   }
 
-  // Extract arguments from MATLAB inputs
   char *a_matrix_in_c = mxArrayToString(prhs[0]);
   char *a_matrix_out_c = mxArrayToString(prhs[1]);
   double pixels_d = mxGetScalar(prhs[2]);
@@ -25,7 +24,13 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
 
   int status = prepareAMatrix(a_matrix_in, a_matrix_out, pixels, samples, device_id);
 
+  if (status == 0) {
+    mexPrintf("prepareAMatrix completed successfully.\n");
+  } else {
+    mexPrintf("prepareAMatrix failed with status code: %d\n", status);
+  }
+
   if (nlhs > 0) {
-    plhs[0] = mxCreateDoubleScalar(static_cast<double>(status));
+    plhs[0] = mxCreateDoubleScalar((double)status);
   }
 }

From ff5e666c96a2a548cfba93061cdcc580df775aa7 Mon Sep 17 00:00:00 2001
From: Leon Oostrum <l.oostrum@esciencecenter.nl>
Date: Fri, 13 Dec 2024 15:24:04 +0100
Subject: [PATCH 3/7] Fix compilation, add mex for beamforming

---
 matlab/beamform_mex.cu                | 38 +++++++++++++++++++++++++++
 matlab/mex_compile_beamform.m         |  8 ++++++
 matlab/mex_compile_prepare_a_matrix.m | 12 ++++-----
 3 files changed, 51 insertions(+), 7 deletions(-)
 create mode 100644 matlab/beamform_mex.cu
 create mode 100644 matlab/mex_compile_beamform.m

diff --git a/matlab/beamform_mex.cu b/matlab/beamform_mex.cu
new file mode 100644
index 0000000..d5528ac
--- /dev/null
+++ b/matlab/beamform_mex.cu
@@ -0,0 +1,38 @@
+#include "tcbf.h"  // include the header
+#include "mex.h"
+
+void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  if (nrhs != 7) {
+    mexErrMsgIdAndTxt("beamform:InvalidInput",
+                      "Seven inputs required: a_matrix, rf, bf, pixels, frames, samples, device_id");
+  }
+
+  std::string path_a_matrix = mxArrayToString(prhs[0]);
+  std::string path_rf = mxArrayToString(prhs[1]);
+  std::string path_bf = mxArrayToString(prhs[2]);
+  size_t pixels = mxGetScalar(prhs[3]);
+  size_t frames = mxGetScalar(prhs[4]);
+  size_t samples = mxGetScalar(prhs[5]);
+  unsigned device_id = mxGetScalar(prhs[6]);
+
+  cu::init();
+  cu::Device device(device_id);
+  cu::Context context(CU_CTX_BLOCKING_SYNC, device);
+  cu::Stream stream;
+
+  tcbf::Beamformer beamformer(pixels, frames, samples, device, stream);
+  cu::HostMemory RF(2 * frames * samples);
+  cu::HostMemory BF(2 * pixels * frames * sizeof(unsigned));
+
+  beamformer.read_A_matrix(path_a_matrix);
+  beamformer.read_RF(RF, path_rf);
+  beamformer.process(RF, BF);
+  beamformer.write_BF(BF, path_bf);
+
+
+  int status = 0;
+
+  if (nlhs > 0) {
+    plhs[0] = mxCreateDoubleScalar((double)status);
+  }
+}
diff --git a/matlab/mex_compile_beamform.m b/matlab/mex_compile_beamform.m
new file mode 100644
index 0000000..bd36d9e
--- /dev/null
+++ b/matlab/mex_compile_beamform.m
@@ -0,0 +1,8 @@
+% mex_compile_prepare_a_matrix.m
+% Adjust include and library paths as needed
+mexcuda -v ...
+    -L"/usr/local/cuda/lib64" -lcudart -lcuda ...
+    'NVCCFLAGS=-gencode=arch=compute_89,code=sm_89' ...
+    -ltcbf -lccglib ...
+    beamform_mex.cu
+
diff --git a/matlab/mex_compile_prepare_a_matrix.m b/matlab/mex_compile_prepare_a_matrix.m
index 52de9bb..8b0da29 100644
--- a/matlab/mex_compile_prepare_a_matrix.m
+++ b/matlab/mex_compile_prepare_a_matrix.m
@@ -1,10 +1,8 @@
 % mex_compile_prepare_a_matrix.m
 % Adjust include and library paths as needed
-mexcuda ...
-    -I'../include' ...            % for tcbf.h if needed
-    -I'../src' ...                % for prepare_a_matrix.h
-    -I'/path/to/ccglib/include' ...
-    -I'/path/to/cxxopts/include' ...
-    -L'/path/to/ccglib/lib' -lccglib ...
-    -L'/path/to/cuda/lib64' -lcudart -lcuda ...
+mexcuda -v ...
+    -L"/usr/local/cuda/lib64" -lcudart -lcuda ...
+    'NVCCFLAGS=-gencode=arch=compute_89,code=sm_89' ...
+    -lccglib ...
     prepare_a_matrix_mex.cu ../src/prepare_a_matrix.cu
+

From fa216072f3755646d4a4e0118536d0073327e22d Mon Sep 17 00:00:00 2001
From: sflorescu <43118429+sflorescu@users.noreply.github.com>
Date: Fri, 13 Dec 2024 15:31:24 +0100
Subject: [PATCH 4/7] Added return of  BF back to matlab

---
 matlab/beamform_mex.cu | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/matlab/beamform_mex.cu b/matlab/beamform_mex.cu
index d5528ac..bbd7778 100644
--- a/matlab/beamform_mex.cu
+++ b/matlab/beamform_mex.cu
@@ -1,5 +1,5 @@
-#include "tcbf.h"  // include the header
 #include "mex.h"
+#include "tcbf.h"  // include the header
 
 void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
   if (nrhs != 7) {
@@ -29,6 +29,15 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
   beamformer.process(RF, BF);
   beamformer.write_BF(BF, path_bf);
 
+  mwSize dims[2] = {2 * pixels, frames};
+  mxArray *outArray = mxCreateNumericArray(2, dims, mxINT32_CLASS, mxREAL);
+  int32_t *outData = static_cast<int32_t *>(mxGetData(outArray));
+
+  // Copy the data
+  std::memcpy(outData, BF, totalElements * sizeof(int32_t));
+
+  // Assign output
+  plhs[0] = outArray;
 
   int status = 0;
 

From 9fdee0e20d33009626ea62248f73506c13d00895 Mon Sep 17 00:00:00 2001
From: sflorescu <43118429+sflorescu@users.noreply.github.com>
Date: Fri, 13 Dec 2024 15:33:30 +0100
Subject: [PATCH 5/7] Editted status to work

---
 matlab/beamform_mex.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/matlab/beamform_mex.cu b/matlab/beamform_mex.cu
index bbd7778..cdb7147 100644
--- a/matlab/beamform_mex.cu
+++ b/matlab/beamform_mex.cu
@@ -41,7 +41,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
 
   int status = 0;
 
-  if (nlhs > 0) {
-    plhs[0] = mxCreateDoubleScalar((double)status);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleScalar((double)status);
   }
 }

From 573f35e05937914cb7f93dcf68db6f281415d67e Mon Sep 17 00:00:00 2001
From: sflorescu <43118429+sflorescu@users.noreply.github.com>
Date: Fri, 13 Dec 2024 15:36:14 +0100
Subject: [PATCH 6/7] Defined totalElements

---
 matlab/beamform_mex.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/matlab/beamform_mex.cu b/matlab/beamform_mex.cu
index cdb7147..264f07a 100644
--- a/matlab/beamform_mex.cu
+++ b/matlab/beamform_mex.cu
@@ -34,7 +34,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
   int32_t *outData = static_cast<int32_t *>(mxGetData(outArray));
 
   // Copy the data
-  std::memcpy(outData, BF, totalElements * sizeof(int32_t));
+  std::memcpy(outData, BF, 2 * frames * samples * sizeof(int32_t));
 
   // Assign output
   plhs[0] = outArray;

From ce9fa14aae00f7767cc596b413c975991280565e Mon Sep 17 00:00:00 2001
From: Leon Oostrum <l.oostrum@esciencecenter.nl>
Date: Fri, 13 Dec 2024 16:10:16 +0100
Subject: [PATCH 7/7] make BF 3D, fix sizes

---
 matlab/beamform_mex.cu | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/matlab/beamform_mex.cu b/matlab/beamform_mex.cu
index 264f07a..d1ae88e 100644
--- a/matlab/beamform_mex.cu
+++ b/matlab/beamform_mex.cu
@@ -1,3 +1,5 @@
+#include <cstring>
+#include <cstdint>
 #include "mex.h"
 #include "tcbf.h"  // include the header
 
@@ -22,19 +24,19 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
 
   tcbf::Beamformer beamformer(pixels, frames, samples, device, stream);
   cu::HostMemory RF(2 * frames * samples);
-  cu::HostMemory BF(2 * pixels * frames * sizeof(unsigned));
+  cu::HostMemory BF(2 * pixels * frames * sizeof(int32_t));
 
   beamformer.read_A_matrix(path_a_matrix);
   beamformer.read_RF(RF, path_rf);
   beamformer.process(RF, BF);
   beamformer.write_BF(BF, path_bf);
 
-  mwSize dims[2] = {2 * pixels, frames};
-  mxArray *outArray = mxCreateNumericArray(2, dims, mxINT32_CLASS, mxREAL);
-  int32_t *outData = static_cast<int32_t *>(mxGetData(outArray));
+  mwSize dims[3] = {pixels, frames, 2};
+  mxArray *outArray = mxCreateNumericArray(3, dims, mxINT32_CLASS, mxREAL);
+  int32_t *outData = reinterpret_cast<int32_t *>(mxGetData(outArray));
 
   // Copy the data
-  std::memcpy(outData, BF, 2 * frames * samples * sizeof(int32_t));
+  std::memcpy(outData, BF, 2 * frames * pixels * sizeof(int32_t));
 
   // Assign output
   plhs[0] = outArray;