Skip to content

Commit

Permalink
Merge pull request #4 from nlesc-recruit/mex-integration
Browse files Browse the repository at this point in the history
Mex integration
  • Loading branch information
loostrum authored Dec 13, 2024
2 parents 635ae5d + ce9fa14 commit b9bf9ca
Show file tree
Hide file tree
Showing 6 changed files with 223 additions and 0 deletions.
49 changes: 49 additions & 0 deletions matlab/beamform_mex.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include <cstring>
#include <cstdint>
#include "mex.h"
#include "tcbf.h" // include the header

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
if (nrhs != 7) {
mexErrMsgIdAndTxt("beamform:InvalidInput",
"Seven inputs required: a_matrix, rf, bf, pixels, frames, samples, device_id");
}

std::string path_a_matrix = mxArrayToString(prhs[0]);
std::string path_rf = mxArrayToString(prhs[1]);
std::string path_bf = mxArrayToString(prhs[2]);
size_t pixels = mxGetScalar(prhs[3]);
size_t frames = mxGetScalar(prhs[4]);
size_t samples = mxGetScalar(prhs[5]);
unsigned device_id = mxGetScalar(prhs[6]);

cu::init();
cu::Device device(device_id);
cu::Context context(CU_CTX_BLOCKING_SYNC, device);
cu::Stream stream;

tcbf::Beamformer beamformer(pixels, frames, samples, device, stream);
cu::HostMemory RF(2 * frames * samples);
cu::HostMemory BF(2 * pixels * frames * sizeof(int32_t));

beamformer.read_A_matrix(path_a_matrix);
beamformer.read_RF(RF, path_rf);
beamformer.process(RF, BF);
beamformer.write_BF(BF, path_bf);

mwSize dims[3] = {pixels, frames, 2};
mxArray *outArray = mxCreateNumericArray(3, dims, mxINT32_CLASS, mxREAL);
int32_t *outData = reinterpret_cast<int32_t *>(mxGetData(outArray));

// Copy the data
std::memcpy(outData, BF, 2 * frames * pixels * sizeof(int32_t));

// Assign output
plhs[0] = outArray;

int status = 0;

if (nlhs > 1) {
plhs[1] = mxCreateDoubleScalar((double)status);
}
}
8 changes: 8 additions & 0 deletions matlab/mex_compile_beamform.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
% mex_compile_prepare_a_matrix.m
% Adjust include and library paths as needed
mexcuda -v ...
-L"/usr/local/cuda/lib64" -lcudart -lcuda ...
'NVCCFLAGS=-gencode=arch=compute_89,code=sm_89' ...
-ltcbf -lccglib ...
beamform_mex.cu

8 changes: 8 additions & 0 deletions matlab/mex_compile_prepare_a_matrix.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
% mex_compile_prepare_a_matrix.m
% Adjust include and library paths as needed
mexcuda -v ...
-L"/usr/local/cuda/lib64" -lcudart -lcuda ...
'NVCCFLAGS=-gencode=arch=compute_89,code=sm_89' ...
-lccglib ...
prepare_a_matrix_mex.cu ../src/prepare_a_matrix.cu

36 changes: 36 additions & 0 deletions matlab/prepare_a_matrix_mex.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include "../src/prepare_a_matrix.h" // include the header
#include "mex.h"

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
if (nrhs != 5) {
mexErrMsgIdAndTxt("prepare_a_matrix_mex:InvalidInput",
"Five inputs required: a_matrix_in, a_matrix_out, pixels, samples, device_id");
}

char *a_matrix_in_c = mxArrayToString(prhs[0]);
char *a_matrix_out_c = mxArrayToString(prhs[1]);
double pixels_d = mxGetScalar(prhs[2]);
double samples_d = mxGetScalar(prhs[3]);
double device_id_d = mxGetScalar(prhs[4]);

std::string a_matrix_in(a_matrix_in_c);
std::string a_matrix_out(a_matrix_out_c);
size_t pixels = static_cast<size_t>(pixels_d);
size_t samples = static_cast<size_t>(samples_d);
unsigned device_id = static_cast<unsigned>(device_id_d);

mxFree(a_matrix_in_c);
mxFree(a_matrix_out_c);

int status = prepareAMatrix(a_matrix_in, a_matrix_out, pixels, samples, device_id);

if (status == 0) {
mexPrintf("prepareAMatrix completed successfully.\n");
} else {
mexPrintf("prepareAMatrix failed with status code: %d\n", status);
}

if (nlhs > 0) {
plhs[0] = mxCreateDoubleScalar((double)status);
}
}
109 changes: 109 additions & 0 deletions src/prepare_a_matrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <fstream>
#include <iostream>

#include "prepare_a_matrix.h"

inline size_t align(size_t a, size_t b) { return b * ccglib::helper::ceildiv(a, b); }

cxxopts::Options create_commandline_parser(const char *argv[]) {
Expand Down Expand Up @@ -47,6 +49,113 @@ cxxopts::ParseResult parse_commandline(int argc, const char *argv[]) {
}
}

// This function will run the beamformer given the parameters
int prepareAMatrix(const std::string &path_a_matrix_in, const std::string &path_a_matrix_out, size_t pixels,
size_t samples, unsigned device_id) {
const size_t complex = 2;

cu::init();
cu::Device device(device_id);
cu::Context context(CU_CTX_BLOCKING_SYNC, device);
cu::Stream stream;

// tile size in beams, frames, samples axes
dim3 tile_sizes = ccglib::mma::GEMM::GetDimensions(ccglib::mma::int1, ccglib::mma::opt);

const size_t pixels_padded = align(pixels, tile_sizes.x);
const size_t samples_padded = align(samples, tile_sizes.z);

// factor 2 for complex
const size_t bytes_a_matrix = complex * pixels_padded * samples_padded;
const size_t bytes_a_matrix_packed = bytes_a_matrix / CHAR_BIT;

// Read data from disk
// row-by-row to handle padding
cu::HostMemory a_matrix_host(bytes_a_matrix);
std::ifstream in(path_a_matrix_in, std::ios::binary | std::ios::in);
if (!in) {
std::cerr << "Failed to open input file: " + path_a_matrix_in << std::endl;
return -1;
}
for (size_t c = 0; c < complex; c++) {
for (size_t pixel = 0; pixel < pixels; pixel++) {
in.read(static_cast<char *>(a_matrix_host) + c * pixels_padded * samples_padded + pixel * samples_padded,
samples);
}
}
in.close();

// conjugate
std::cout << "Conjugate" << std::endl;
#pragma omp parallel for collapse(2)
for (size_t pixel = 0; pixel < pixels; pixel++) {
for (size_t sample = 0; sample < samples; sample++) {
const size_t idx = pixels_padded * samples_padded + pixel * samples_padded + sample;
static_cast<char *>(a_matrix_host)[idx] = 1 - static_cast<char *>(a_matrix_host)[idx];
}
}

// Device memory for output packed data
cu::DeviceMemory d_a_matrix_packed(bytes_a_matrix_packed);
d_a_matrix_packed.zero(bytes_a_matrix_packed);
// Device memory for transposed data
cu::DeviceMemory d_a_transposed(bytes_a_matrix_packed);

// chunk of input data on device in case it doesn't fit in GPU memory
// get available GPU memory (after allocating other device memory)
// use at most 80% of available memory
size_t bytes_per_chunk = static_cast<size_t>(0.8 * context.getFreeMemory());
// packing kernel uses at most 1024 threads per block
bytes_per_chunk = 1024 * (bytes_per_chunk / 1024);
if (bytes_per_chunk > bytes_a_matrix) {
bytes_per_chunk = bytes_a_matrix;
}
cu::DeviceMemory d_a_chunk(bytes_per_chunk);
d_a_chunk.zero(bytes_per_chunk);

// process, complex-first for now
std::cout << "Packing" << std::endl;
for (size_t byte_start = 0; byte_start < bytes_a_matrix; byte_start += bytes_per_chunk) {
size_t local_nbytes = bytes_per_chunk;
// correct nbytes in last chunk
if (byte_start + local_nbytes > bytes_a_matrix) {
local_nbytes = bytes_a_matrix - byte_start;
// ensure any padded region is set to zero
d_a_chunk.zero(bytes_per_chunk);
}
// copy chunk to device
stream.memcpyHtoDAsync(d_a_chunk, static_cast<char *>(a_matrix_host) + byte_start, local_nbytes);
// get device memory slice for this chunk in a_packed
cu::DeviceMemory d_a_packed_chunk(d_a_matrix_packed, byte_start / CHAR_BIT, local_nbytes / CHAR_BIT);
// run packing kernel
ccglib::packing::Packing packing(local_nbytes, device, stream);
packing.Run(d_a_chunk, d_a_packed_chunk, ccglib::packing::pack, ccglib::packing::complex_first);
}

// transpose
std::cout << "Transpose" << std::endl;
ccglib::transpose::Transpose transpose(1, pixels_padded, samples_padded, tile_sizes.x, tile_sizes.z, 1, device,
stream);
transpose.Run(d_a_matrix_packed, d_a_transposed);

// copy output to host
std::cout << "Copy to host" << std::endl;
cu::HostMemory a_matrix_output(bytes_a_matrix_packed);
stream.memcpyDtoHAsync(a_matrix_output, d_a_transposed, bytes_a_matrix_packed);
stream.synchronize();

// write to disk
std::cout << "Write to disk" << std::endl;
std::ofstream out(path_a_matrix_out, std::ios::binary | std::ios::out);
if (!out) {
std::cerr << "Failed to open output file: " + path_a_matrix_out << std::endl;
return -1;
}
out.write(static_cast<char *>(a_matrix_output), bytes_a_matrix_packed);

return 0; // success
}

int main(int argc, const char *argv[]) {
cxxopts::ParseResult cmdline = parse_commandline(argc, argv);
const std::string path_a_matrix_in = cmdline["a_matrix_in"].as<std::string>();
Expand Down
13 changes: 13 additions & 0 deletions src/prepare_a_matrix.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#ifndef PREPARE_A_MATRIX_H
#define PREPARE_A_MATRIX_H

#include <string>
#include <cstddef> // for size_t

int prepareAMatrix(const std::string &path_a_matrix_in,
const std::string &path_a_matrix_out,
size_t pixels,
size_t samples,
unsigned device_id);

#endif // PREPARE_A_MATRIX_H

0 comments on commit b9bf9ca

Please sign in to comment.