Skip to content

Commit

Permalink
Merge pull request #24 from thorstone25/dev-opencl
Browse files Browse the repository at this point in the history
Merging OpenCL support and other minor changes
  • Loading branch information
thorstone25 authored Jan 28, 2024
2 parents 1c6cad7 + 458d596 commit b5785de
Show file tree
Hide file tree
Showing 16 changed files with 1,309 additions and 372 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ QUPS (pronounced "CUPS") is an abstract, lightweight, readable tool for prototyp
- Arbitrary pixel locations and beamforming apodization

- Performant:
- Hardware acceleartion via GPU or multi-threaded CPU
- Hardware acceleartion via CUDA, OpenCL, and MATLAB parallel processing
- Memory efficient data types
- Beamform a 1024 x 1024 image for 256 x 256 transmits/receives in < 2 seconds (RTX 3070)
- Batch simulations locally via [`parcluster`](https://www.mathworks.com/help/parallel-computing/parcluster.html) and scale to a cluster via the [parallel server](https://www.mathworks.com/help/matlab-parallel-server/) toolbox
- Batch simulations locally via [`parcluster`](https://www.mathworks.com/help/parallel-computing/parcluster.html) or scale to a cluster via the [parallel server](https://www.mathworks.com/help/matlab-parallel-server/) toolbox

- Modular:
- Transducer, pulse sequence, pulse waveform, scan region etc. each defined separately
Expand Down Expand Up @@ -99,7 +99,8 @@ All extensions to QUPS are optional, but must be installed separately from their
| [FieldII](https://www.field-ii.dk/) | point scatterer simulator | `addpath path/to/fieldII`|
| k-Wave([base](http://www.k-wave.org/index.php), [extension](http://www.k-wave.org/forum/topic/alpha-version-of-kwavearray-off-grid-sources)) | distributed medium simulator | `addpath path/to/kWave, addpath path/to/kWaveArray` |
| [MUST](https://www.biomecardio.com/MUST/documentation.html) | point scatterer simulator | `addpath path/to/MUST` (see issues[#2](https://github.com/thorstone25/qups/issues/2))|
| CUDA([Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html),[Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)) | hardware acceleration| see [CUDA Extension](####CUDA-Extension) |
| CUDA([Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html),[Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)) | hardware acceleration | see [CUDA Extension](####CUDA-Extension) |
| [Matlab-OpenCL](https://github.com/thorstone25/Matlab-OpenCL) | hardware acceleration | (see [README](https://github.com/thorstone25/Matlab-OpenCL/blob/main/README.md))|


### CUDA Extension
Expand Down
144 changes: 89 additions & 55 deletions kern/beamform.m
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,6 @@
else
idataType = 'double'; % default
end
if any(cellfun(@(c) isType(c, 'single'), {Pi, Pr, Pv, Nv}))
posType = 'single';
else
posType = 'double';
end
if gpuDeviceCount && any(cellfun(@(v)isa(v, 'gpuArray'), {Pi, Pr, Pv, Nv, x}))
device = -1; % GPU
else
Expand All @@ -127,9 +122,6 @@
DV = true;
case 'focused-waves'
DV = false;
case 'position-precision'
n = n + 1;
posType = varargin{n};
case 'input-precision'
n = n + 1;
idataType = varargin{n};
Expand Down Expand Up @@ -180,14 +172,26 @@
% TODO: support doing this with non-scalar t0 definition
x = permute(x, [1:3,(max(3,ndims(x))+[1,2]),4:ndims(x)]); % (T x N x M x 1 x 1 x F x ...)
fsz = size(x, 6:max(6,ndims(x))); % frame size: starts at dim 6

if device && logical(exist('bf.ptx', 'file')) % PTX track must be available

% get devices: must have package, device, and kernel
gdev = (exist('gpuDeviceCount','file') && gpuDeviceCount() && device <= gpuDeviceCount() && logical(exist('bf.ptx', 'file'))); ... % PTX track available
odev = (exist('oclDeviceCount','file') && oclDeviceCount() && device <= oclDeviceCount() && logical(exist('bf.cl' , 'file'))); ... OpenCL kernel available
if odev % check for data type support
d = oclDevice();
odev = ~isempty(d) ... % must have a device selected
&& (((idataType == "double") && d.SupportsDouble) ... double support
|| (idataType == "single") ... single always supported)
|| ((idataType == "halfT" ) && d.SupportsHalf )); % half support
end

% dispatch
if device && (gdev || odev)

% warn if non-linear interp was requested
switch interp_type
case "nearest",flagnum = 0;
case "linear", flagnum = 1;
case "cubic", flagnum = 2;
case "nearest", flagnum = 0;
case "linear", flagnum = 1;
case "cubic", flagnum = 2;
case "lanczos3",flagnum = 3;
otherwise
error('QUPS:beamform:UnrecognizedInput', "Unrecognized interpolation of type " ...
Expand All @@ -204,40 +208,28 @@
src_folder = fullfile(fileparts(mfilename('fullpath')), '..', 'src');

switch idataType
case 'halfT', postfix = 'h';
case 'halfT', postfix = 'h';
case 'single', postfix = 'f';
case 'double', postfix = '';
end

% currently selected gpu device handle
g = gpuDevice();

% reselect gpu device and copy over inputs if necessary
if device > 0 && g.Index ~= device
if device > 0 && gdev && getfield(gpuDevice(), 'Index') ~= device
inps = {Pi, Pr, Pv, Nv, x, t0, fs, cinv};
oclass = cellfun(@class, inps, per_cell{:});
tmp = cellfun(@gather, inps, per_cell{:});
g = gpuDevice(device);
tmp = cellfun(@(inp, cls)cast(inp, cls), tmp, oclass, per_cell{:});
[Pi, Pr, Pv, Nv, x, t0, fs, cinv] = deal(tmp{:});
end

% If bf.ptx isn't there, ask the user to generate it
if ~exist('bf.ptx', 'file')
error('The source code is not compiled for MATLAB on this system. Try using UltrasoundSystem.recompileCUDA() to compile it.')
end

k = parallel.gpu.CUDAKernel(...
'bf.ptx',...
fullfile(src_folder, 'bf.cu'),...
['DAS', postfix]); % use the same kernel, just modify the flag here.


% send constant data to GPU
typefun = str2func(idataType); % function to cast types
ptypefun = @(x) gpuArray(typefun(x));
ptypefun = str2func(idataType); % function to cast types
if gdev, ptypefun = @(x) gpuArray(ptypefun(x)); end
dtypefun = @(x) complex(ptypefun(x));
if idataType == "halfT"
ptypefun = @(x) gpuArray(single(x));
if gdev, ptypefun = @(x) gpuArray(single(x)); end
if odev, ptypefun = @(x) single(x) ; end
dtypefun = @(x) complex(halfT(x));
end
[x, apod] = dealfun(dtypefun, x, apod);
Expand Down Expand Up @@ -269,36 +261,77 @@
% get output data type
switch fun
case {'DAS', 'SYN', 'BF', 'MUL'}
obuftypefun = dtypefun;
obufproto = x;
case {'delays'}
obuftypefun = @(x)real(dtypefun(x));
obufproto = real(x([]));
end

% kernel sizes
nThreads = k.MaxThreadsPerBlock; % threads per block
nBlocks = min([...
g.MaxThreadBlockSize(1), ... max cause device reqs
ceil(I / nThreads)... max cause number of pixels
ceil(g.AvailableMemory / (2^8) / (prod(osize{:}) * nThreads)),... max cause GPU memory reqs (empirical)
]); % blocks per frame

% constant arg type casting
tmp = cellfun(@uint64, {T,M,N,I,Isz}, per_cell{:});
[T, M, N, I, Isz] = deal(tmp{:});

% set constant args
k.setConstantMemory('QUPS_I', I); % gauranteed
try k.setConstantMemory('QUPS_T', T); end % if not const compiled with ChannelData
try k.setConstantMemory('QUPS_M', M, 'QUPS_N', N, 'QUPS_VS', VS, 'QUPS_DV', DV, 'QUPS_I1', Isz(1), 'QUPS_I2', Isz(2), 'QUPS_I3', Isz(3)); end % if not const compiled
if gdev
% reference selected device
g = gpuDevice();

% load kernel
k = parallel.gpu.CUDAKernel(...
'bf.ptx',...
fullfile(src_folder, 'bf.cu'),...
['DAS', postfix]); % use the same kernel, just modify the flag here.

% constant arg type casting
tmp = cellfun(@uint64, {T,M,N,I,Isz}, per_cell{:});
[T, M, N, I, Isz] = deal(tmp{:});

% set constant args
k.setConstantMemory('QUPS_I', I); % gauranteed
try k.setConstantMemory('QUPS_T', T); end % if not const compiled with ChannelData
try k.setConstantMemory('QUPS_M', M, 'QUPS_N', N, 'QUPS_VS', VS, 'QUPS_DV', DV, 'QUPS_I1', Isz(1), 'QUPS_I2', Isz(2), 'QUPS_I3', Isz(3)); end % if not const compiled

% kernel sizes
k.ThreadBlockSize(1) = k.MaxThreadsPerBlock; % threads per block
k.GridSize(1) = min([...
g.MaxThreadBlockSize(1), ... max cause device reqs
ceil(I / k.ThreadBlockSize(1))... max cause number of pixels
ceil(g.AvailableMemory / (2^8) / (prod(osize{:}) * k.ThreadBlockSize(1))),... max cause GPU memory reqs (empirical)
]); % blocks per frame

elseif odev

% load kernel refernece
k = oclKernel(which('bf.cl'), 'DAS');

% select device - no risk of resetting anything
if device > 0, oclDevice(device); end
k.Device = oclDevice();

% set precision
switch idataType
case "single", prc = 32;
case "double", prc = 64;
otherwise, error("Not implemented :(");
end
k.defineTypes(repmat(idataType,[1,3]), ["V","T","U"]); % time / data / position

% set constant args
k.macros = [k.macros, ("QUPS_" + ["I","T","M","N","VS","DV","I1","I2","I3"] + "=" + [I,T,M,N,VS,DV,Isz])];
k.macros = [k.macros, ("QUPS_" + ["F","S"] + "=" + [1,M])]; % unused ...
k.macros = [k.macros, ("QUPS_BF_" + ["FLAG"] + "=" + [flagnum])];
k.macros(end+1) = "QUPS_PRECISION="+prc;
k.opts = ["-cl-mad-enable"];%, "-cl-fp32-correctly-rounded-divide-sqrt", "-cl-opt-disable"];

% compile kernel
k.build();

% set kernel size
k.ThreadBlockSize = nThreads;
k.GridSize = nBlocks;
% set kernel size
k.ThreadBlockSize(1) = min(I, k.MaxThreadsPerBlock);
k.GridSize(1) = ceil(I / k.ThreadBlockSize(1));

% expand inputs to 4D - in OpenCL, all 3D vectors interpreted are 4D underlying
[Pi(4,:), Pr(4,:), Pv(4,:), Nv(4,:)] = deal(0);
end

% allocate output data buffer
osize = cat(2, {I}, osize);
osize = cellfun(@uint64, osize, per_cell{:});
yg = repmat(obuftypefun(zeros(1)), [osize{:}]);
yg = zeros([osize{:}], 'like', obufproto);

% for half types, alias the weights/data, recast the positions as
% single
Expand All @@ -307,7 +340,7 @@
[yg, apod, x] = dealfun(@(x)getfield(alias(x),'val'), yg, apod, x);
end

% combine timing infor with the transmit positions
% combine timing info with the transmit positions
Pv(4,:) = t0;

% partition data per frame
Expand All @@ -316,6 +349,7 @@
% for each data frame, run the kernel
switch fun
case {'DAS','SYN','BF','MUL'}
if ~isreal(obufproto), yg = complex(yg); end % force complex if x is
% I [x N [x M]] x 1 x 1 x {F x ...}
for f = F:-1:1 % beamform each data frame
y{f} = k.feval(yg, Pi, Pr, Pv, Nv, apod, cinv, [astride, cstride], x(:,:,:,f), flagnum, [fs, fmod]);
Expand Down
Loading

0 comments on commit b5785de

Please sign in to comment.