Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mex integration #4

Merged
merged 7 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions matlab/beamform_mex.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include <cstring>
#include <cstdint>
#include "mex.h"
#include "tcbf.h" // include the header

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
if (nrhs != 7) {
mexErrMsgIdAndTxt("beamform:InvalidInput",
"Seven inputs required: a_matrix, rf, bf, pixels, frames, samples, device_id");
}

std::string path_a_matrix = mxArrayToString(prhs[0]);
std::string path_rf = mxArrayToString(prhs[1]);
std::string path_bf = mxArrayToString(prhs[2]);
size_t pixels = mxGetScalar(prhs[3]);
size_t frames = mxGetScalar(prhs[4]);
size_t samples = mxGetScalar(prhs[5]);
unsigned device_id = mxGetScalar(prhs[6]);

cu::init();
cu::Device device(device_id);
cu::Context context(CU_CTX_BLOCKING_SYNC, device);
cu::Stream stream;

tcbf::Beamformer beamformer(pixels, frames, samples, device, stream);
cu::HostMemory RF(2 * frames * samples);
cu::HostMemory BF(2 * pixels * frames * sizeof(int32_t));

beamformer.read_A_matrix(path_a_matrix);
beamformer.read_RF(RF, path_rf);
beamformer.process(RF, BF);
beamformer.write_BF(BF, path_bf);

mwSize dims[3] = {pixels, frames, 2};
mxArray *outArray = mxCreateNumericArray(3, dims, mxINT32_CLASS, mxREAL);
int32_t *outData = reinterpret_cast<int32_t *>(mxGetData(outArray));

// Copy the data
std::memcpy(outData, BF, 2 * frames * pixels * sizeof(int32_t));

// Assign output
plhs[0] = outArray;

int status = 0;

if (nlhs > 1) {
plhs[1] = mxCreateDoubleScalar((double)status);
}
}
8 changes: 8 additions & 0 deletions matlab/mex_compile_beamform.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
% mex_compile_prepare_a_matrix.m
% Adjust include and library paths as needed
mexcuda -v ...
-L"/usr/local/cuda/lib64" -lcudart -lcuda ...
'NVCCFLAGS=-gencode=arch=compute_89,code=sm_89' ...
-ltcbf -lccglib ...
beamform_mex.cu

8 changes: 8 additions & 0 deletions matlab/mex_compile_prepare_a_matrix.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
% mex_compile_prepare_a_matrix.m
% Adjust include and library paths as needed
mexcuda -v ...
-L"/usr/local/cuda/lib64" -lcudart -lcuda ...
'NVCCFLAGS=-gencode=arch=compute_89,code=sm_89' ...
-lccglib ...
prepare_a_matrix_mex.cu ../src/prepare_a_matrix.cu

36 changes: 36 additions & 0 deletions matlab/prepare_a_matrix_mex.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include "../src/prepare_a_matrix.h" // include the header
#include "mex.h"

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
if (nrhs != 5) {
mexErrMsgIdAndTxt("prepare_a_matrix_mex:InvalidInput",
"Five inputs required: a_matrix_in, a_matrix_out, pixels, samples, device_id");
}

char *a_matrix_in_c = mxArrayToString(prhs[0]);
char *a_matrix_out_c = mxArrayToString(prhs[1]);
double pixels_d = mxGetScalar(prhs[2]);
double samples_d = mxGetScalar(prhs[3]);
double device_id_d = mxGetScalar(prhs[4]);

std::string a_matrix_in(a_matrix_in_c);
std::string a_matrix_out(a_matrix_out_c);
size_t pixels = static_cast<size_t>(pixels_d);
size_t samples = static_cast<size_t>(samples_d);
unsigned device_id = static_cast<unsigned>(device_id_d);

mxFree(a_matrix_in_c);
mxFree(a_matrix_out_c);

int status = prepareAMatrix(a_matrix_in, a_matrix_out, pixels, samples, device_id);

if (status == 0) {
mexPrintf("prepareAMatrix completed successfully.\n");
} else {
mexPrintf("prepareAMatrix failed with status code: %d\n", status);
}

if (nlhs > 0) {
plhs[0] = mxCreateDoubleScalar((double)status);
}
}
109 changes: 109 additions & 0 deletions src/prepare_a_matrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <fstream>
#include <iostream>

#include "prepare_a_matrix.h"

inline size_t align(size_t a, size_t b) { return b * ccglib::helper::ceildiv(a, b); }

cxxopts::Options create_commandline_parser(const char *argv[]) {
Expand Down Expand Up @@ -47,6 +49,113 @@ cxxopts::ParseResult parse_commandline(int argc, const char *argv[]) {
}
}

// This function will run the beamformer given the parameters
int prepareAMatrix(const std::string &path_a_matrix_in, const std::string &path_a_matrix_out, size_t pixels,
size_t samples, unsigned device_id) {
const size_t complex = 2;

cu::init();
cu::Device device(device_id);
cu::Context context(CU_CTX_BLOCKING_SYNC, device);
cu::Stream stream;

// tile size in beams, frames, samples axes
dim3 tile_sizes = ccglib::mma::GEMM::GetDimensions(ccglib::mma::int1, ccglib::mma::opt);

const size_t pixels_padded = align(pixels, tile_sizes.x);
const size_t samples_padded = align(samples, tile_sizes.z);

// factor 2 for complex
const size_t bytes_a_matrix = complex * pixels_padded * samples_padded;
const size_t bytes_a_matrix_packed = bytes_a_matrix / CHAR_BIT;

// Read data from disk
// row-by-row to handle padding
cu::HostMemory a_matrix_host(bytes_a_matrix);
std::ifstream in(path_a_matrix_in, std::ios::binary | std::ios::in);
if (!in) {
std::cerr << "Failed to open input file: " + path_a_matrix_in << std::endl;
return -1;
}
for (size_t c = 0; c < complex; c++) {
for (size_t pixel = 0; pixel < pixels; pixel++) {
in.read(static_cast<char *>(a_matrix_host) + c * pixels_padded * samples_padded + pixel * samples_padded,
samples);
}
}
in.close();

// conjugate
std::cout << "Conjugate" << std::endl;
#pragma omp parallel for collapse(2)
for (size_t pixel = 0; pixel < pixels; pixel++) {
for (size_t sample = 0; sample < samples; sample++) {
const size_t idx = pixels_padded * samples_padded + pixel * samples_padded + sample;
static_cast<char *>(a_matrix_host)[idx] = 1 - static_cast<char *>(a_matrix_host)[idx];
}
}

// Device memory for output packed data
cu::DeviceMemory d_a_matrix_packed(bytes_a_matrix_packed);
d_a_matrix_packed.zero(bytes_a_matrix_packed);
// Device memory for transposed data
cu::DeviceMemory d_a_transposed(bytes_a_matrix_packed);

// chunk of input data on device in case it doesn't fit in GPU memory
// get available GPU memory (after allocating other device memory)
// use at most 80% of available memory
size_t bytes_per_chunk = static_cast<size_t>(0.8 * context.getFreeMemory());
// packing kernel uses at most 1024 threads per block
bytes_per_chunk = 1024 * (bytes_per_chunk / 1024);
if (bytes_per_chunk > bytes_a_matrix) {
bytes_per_chunk = bytes_a_matrix;
}
cu::DeviceMemory d_a_chunk(bytes_per_chunk);
d_a_chunk.zero(bytes_per_chunk);

// process, complex-first for now
std::cout << "Packing" << std::endl;
for (size_t byte_start = 0; byte_start < bytes_a_matrix; byte_start += bytes_per_chunk) {
size_t local_nbytes = bytes_per_chunk;
// correct nbytes in last chunk
if (byte_start + local_nbytes > bytes_a_matrix) {
local_nbytes = bytes_a_matrix - byte_start;
// ensure any padded region is set to zero
d_a_chunk.zero(bytes_per_chunk);
}
// copy chunk to device
stream.memcpyHtoDAsync(d_a_chunk, static_cast<char *>(a_matrix_host) + byte_start, local_nbytes);
// get device memory slice for this chunk in a_packed
cu::DeviceMemory d_a_packed_chunk(d_a_matrix_packed, byte_start / CHAR_BIT, local_nbytes / CHAR_BIT);
// run packing kernel
ccglib::packing::Packing packing(local_nbytes, device, stream);
packing.Run(d_a_chunk, d_a_packed_chunk, ccglib::packing::pack, ccglib::packing::complex_first);
}

// transpose
std::cout << "Transpose" << std::endl;
ccglib::transpose::Transpose transpose(1, pixels_padded, samples_padded, tile_sizes.x, tile_sizes.z, 1, device,
stream);
transpose.Run(d_a_matrix_packed, d_a_transposed);

// copy output to host
std::cout << "Copy to host" << std::endl;
cu::HostMemory a_matrix_output(bytes_a_matrix_packed);
stream.memcpyDtoHAsync(a_matrix_output, d_a_transposed, bytes_a_matrix_packed);
stream.synchronize();

// write to disk
std::cout << "Write to disk" << std::endl;
std::ofstream out(path_a_matrix_out, std::ios::binary | std::ios::out);
if (!out) {
std::cerr << "Failed to open output file: " + path_a_matrix_out << std::endl;
return -1;
}
out.write(static_cast<char *>(a_matrix_output), bytes_a_matrix_packed);

return 0; // success
}

int main(int argc, const char *argv[]) {
cxxopts::ParseResult cmdline = parse_commandline(argc, argv);
const std::string path_a_matrix_in = cmdline["a_matrix_in"].as<std::string>();
Expand Down
13 changes: 13 additions & 0 deletions src/prepare_a_matrix.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#ifndef PREPARE_A_MATRIX_H
#define PREPARE_A_MATRIX_H

#include <string>
#include <cstddef> // for size_t

int prepareAMatrix(const std::string &path_a_matrix_in,
const std::string &path_a_matrix_out,
size_t pixels,
size_t samples,
unsigned device_id);

#endif // PREPARE_A_MATRIX_H
Loading