diff --git a/README.md b/README.md
index 1cfdc46a..4d22e512 100644
--- a/README.md
+++ b/README.md
@@ -11,10 +11,10 @@ QUPS (pronounced "CUPS") is an abstract, lightweight, readable tool for prototyp
 - Arbitrary pixel locations and beamforming apodization
 - Performant:
-    - Hardware acceleartion via GPU or multi-threaded CPU
+    - Hardware acceleration via CUDA, OpenCL, and MATLAB parallel processing
     - Memory efficient data types
     - Beamform a 1024 x 1024 image for 256 x 256 transmits/receives in < 2 seconds (RTX 3070)
-    - Batch simulations locally via [`parcluster`](https://www.mathworks.com/help/parallel-computing/parcluster.html) and scale to a cluster via the [parallel server](https://www.mathworks.com/help/matlab-parallel-server/) toolbox
+    - Batch simulations locally via [`parcluster`](https://www.mathworks.com/help/parallel-computing/parcluster.html) or scale to a cluster via the [parallel server](https://www.mathworks.com/help/matlab-parallel-server/) toolbox
 - Modular:
     - Transducer, pulse sequence, pulse waveform, scan region etc. each defined separately
@@ -99,7 +99,8 @@ All extensions to QUPS are optional, but must be installed separately from their
 | [FieldII](https://www.field-ii.dk/) | point scatterer simulator | `addpath path/to/fieldII`|
 | k-Wave([base](http://www.k-wave.org/index.php), [extension](http://www.k-wave.org/forum/topic/alpha-version-of-kwavearray-off-grid-sources)) | distributed medium simulator | `addpath path/to/kWave, addpath path/to/kWaveArray` |
 | [MUST](https://www.biomecardio.com/MUST/documentation.html) | point scatterer simulator | `addpath path/to/MUST` (see issues[#2](https://github.com/thorstone25/qups/issues/2))|
-| CUDA([Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html),[Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)) | hardware acceleration| see [CUDA Extension](####CUDA-Extension) |
+| CUDA([Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html),[Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)) | hardware acceleration | see [CUDA Extension](####CUDA-Extension) |
+| [Matlab-OpenCL](https://github.com/thorstone25/Matlab-OpenCL) | hardware acceleration | (see [README](https://github.com/thorstone25/Matlab-OpenCL/blob/main/README.md))|

### CUDA Extension
diff --git a/kern/beamform.m b/kern/beamform.m
index 1a45f98d..09998c54 100644
--- a/kern/beamform.m
+++ b/kern/beamform.m
@@ -102,11 +102,6 @@
 else
     idataType = 'double'; % default
 end
-if any(cellfun(@(c) isType(c, 'single'), {Pi, Pr, Pv, Nv}))
-    posType = 'single';
-else
-    posType = 'double';
-end
 if gpuDeviceCount && any(cellfun(@(v)isa(v, 'gpuArray'), {Pi, Pr, Pv, Nv, x}))
     device = -1; % GPU
 else
@@ -127,9 +122,6 @@
         DV = true;
     case 'focused-waves'
         DV = false;
-    case 'position-precision'
-        n = n + 1;
-        posType = varargin{n};
     case 'input-precision'
         n = n + 1;
         idataType = varargin{n};
@@ -180,14 +172,26 @@
 % TODO: support doing this with non-scalar t0 definition
 x = permute(x, [1:3,(max(3,ndims(x))+[1,2]),4:ndims(x)]); % (T x N x M x 1 x 1 x F x ...)
 fsz = size(x, 6:max(6,ndims(x))); % frame size: starts at dim 6
-
-if device && logical(exist('bf.ptx', 'file')) % PTX track must be available
+
+% get devices: must have package, device, and kernel
+gdev = (exist('gpuDeviceCount','file') && gpuDeviceCount() && device <= gpuDeviceCount() && logical(exist('bf.ptx', 'file'))); ...
% PTX track available +odev = (exist('oclDeviceCount','file') && oclDeviceCount() && device <= oclDeviceCount() && logical(exist('bf.cl' , 'file'))); ... OpenCL kernel available +if odev % check for data type support + d = oclDevice(); + odev = ~isempty(d) ... % must have a device selected + && (((idataType == "double") && d.SupportsDouble) ... double support + || (idataType == "single") ... single always supported) + || ((idataType == "halfT" ) && d.SupportsHalf )); % half support +end + +% dispatch +if device && (gdev || odev) % warn if non-linear interp was requested switch interp_type - case "nearest",flagnum = 0; - case "linear", flagnum = 1; - case "cubic", flagnum = 2; + case "nearest", flagnum = 0; + case "linear", flagnum = 1; + case "cubic", flagnum = 2; case "lanczos3",flagnum = 3; otherwise error('QUPS:beamform:UnrecognizedInput', "Unrecognized interpolation of type " ... @@ -204,16 +208,13 @@ src_folder = fullfile(fileparts(mfilename('fullpath')), '..', 'src'); switch idataType - case 'halfT', postfix = 'h'; + case 'halfT', postfix = 'h'; case 'single', postfix = 'f'; case 'double', postfix = ''; end - % currently selected gpu device handle - g = gpuDevice(); - % reselect gpu device and copy over inputs if necessary - if device > 0 && g.Index ~= device + if device > 0 && gdev && getfield(gpuDevice(), 'Index') ~= device inps = {Pi, Pr, Pv, Nv, x, t0, fs, cinv}; oclass = cellfun(@class, inps, per_cell{:}); tmp = cellfun(@gather, inps, per_cell{:}); @@ -221,23 +222,14 @@ tmp = cellfun(@(inp, cls)cast(inp, cls), tmp, oclass, per_cell{:}); [Pi, Pr, Pv, Nv, x, t0, fs, cinv] = deal(tmp{:}); end - - % If bf.ptx isn't there, ask the user to generate it - if ~exist('bf.ptx', 'file') - error('The source code is not compiled for MATLAB on this system. Try using UltrasoundSystem.recompileCUDA() to compile it.') - end - - k = parallel.gpu.CUDAKernel(... - 'bf.ptx',... - fullfile(src_folder, 'bf.cu'),... - ['DAS', postfix]); % use the same kernel, just modify the flag here. - + % send constant data to GPU - typefun = str2func(idataType); % function to cast types - ptypefun = @(x) gpuArray(typefun(x)); + ptypefun = str2func(idataType); % function to cast types + if gdev, ptypefun = @(x) gpuArray(ptypefun(x)); end dtypefun = @(x) complex(ptypefun(x)); if idataType == "halfT" - ptypefun = @(x) gpuArray(single(x)); + if gdev, ptypefun = @(x) gpuArray(single(x)); end + if odev, ptypefun = @(x) single(x) ; end dtypefun = @(x) complex(halfT(x)); end [x, apod] = dealfun(dtypefun, x, apod); @@ -269,36 +261,77 @@ % get output data type switch fun case {'DAS', 'SYN', 'BF', 'MUL'} - obuftypefun = dtypefun; + obufproto = x; case {'delays'} - obuftypefun = @(x)real(dtypefun(x)); + obufproto = real(x([])); end - - % kernel sizes - nThreads = k.MaxThreadsPerBlock; % threads per block - nBlocks = min([... - g.MaxThreadBlockSize(1), ... max cause device reqs - ceil(I / nThreads)... max cause number of pixels - ceil(g.AvailableMemory / (2^8) / (prod(osize{:}) * nThreads)),... 
max cause GPU memory reqs (empirical)
-        ]); % blocks per frame
-
-    % constant arg type casting
-    tmp = cellfun(@uint64, {T,M,N,I,Isz}, per_cell{:});
-    [T, M, N, I, Isz] = deal(tmp{:});
-    % set constant args
-    k.setConstantMemory('QUPS_I', I); % gauranteed
-    try k.setConstantMemory('QUPS_T', T); end % if not const compiled with ChannelData
-    try k.setConstantMemory('QUPS_M', M, 'QUPS_N', N, 'QUPS_VS', VS, 'QUPS_DV', DV, 'QUPS_I1', Isz(1), 'QUPS_I2', Isz(2), 'QUPS_I3', Isz(3)); end % if not const compiled
+    if gdev
+        % reference selected device
+        g = gpuDevice();
+
+        % load kernel
+        k = parallel.gpu.CUDAKernel(...
+            'bf.ptx',...
+            fullfile(src_folder, 'bf.cu'),...
+            ['DAS', postfix]); % use the same kernel, just modify the flag here.
+
+        % constant arg type casting
+        tmp = cellfun(@uint64, {T,M,N,I,Isz}, per_cell{:});
+        [T, M, N, I, Isz] = deal(tmp{:});
+
+        % set constant args
+        k.setConstantMemory('QUPS_I', I); % guaranteed
+        try k.setConstantMemory('QUPS_T', T); end % if not const compiled with ChannelData
+        try k.setConstantMemory('QUPS_M', M, 'QUPS_N', N, 'QUPS_VS', VS, 'QUPS_DV', DV, 'QUPS_I1', Isz(1), 'QUPS_I2', Isz(2), 'QUPS_I3', Isz(3)); end % if not const compiled
+
+        % kernel sizes
+        k.ThreadBlockSize(1) = k.MaxThreadsPerBlock; % threads per block
+        k.GridSize(1) = min([...
+            g.MaxThreadBlockSize(1), ... max cause device reqs
+            ceil(I / k.ThreadBlockSize(1))... max cause number of pixels
+            ceil(g.AvailableMemory / (2^8) / (prod(osize{:}) * k.ThreadBlockSize(1))),... max cause GPU memory reqs (empirical)
+            ]); % blocks per frame
+
+    elseif odev
+
+        % load kernel reference
+        k = oclKernel(which('bf.cl'), 'DAS');
+
+        % select device - no risk of resetting anything
+        if device > 0, oclDevice(device); end
+        k.Device = oclDevice();
+
+        % set precision
+        switch idataType
+            case "single", prc = 32;
+            case "double", prc = 64;
+            otherwise, error("Not implemented :(");
+        end
+        k.defineTypes(repmat(idataType,[1,3]), ["V","T","U"]); % time / data / position
+
+        % set constant args
+        k.macros = [k.macros, ("QUPS_" + ["I","T","M","N","VS","DV","I1","I2","I3"] + "=" + [I,T,M,N,VS,DV,Isz])];
+        k.macros = [k.macros, ("QUPS_" + ["F","S"] + "=" + [1,M])]; % unused ...
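+        % (e.g. for a 512 x 512 image with 128 x 128 tx/rx, the two lines
+        % above expand to entries like "QUPS_I=262144" and "QUPS_M=128";
+        % values here are illustrative only)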
+ k.macros = [k.macros, ("QUPS_BF_" + ["FLAG"] + "=" + [flagnum])]; + k.macros(end+1) = "QUPS_PRECISION="+prc; + k.opts = ["-cl-mad-enable"];%, "-cl-fp32-correctly-rounded-divide-sqrt", "-cl-opt-disable"]; + + % compile kernel + k.build(); - % set kernel size - k.ThreadBlockSize = nThreads; - k.GridSize = nBlocks; + % set kernel size + k.ThreadBlockSize(1) = min(I, k.MaxThreadsPerBlock); + k.GridSize(1) = ceil(I / k.ThreadBlockSize(1)); + + % expand inputs to 4D - in OpenCL, all 3D vectors interpreted are 4D underlying + [Pi(4,:), Pr(4,:), Pv(4,:), Nv(4,:)] = deal(0); + end % allocate output data buffer osize = cat(2, {I}, osize); osize = cellfun(@uint64, osize, per_cell{:}); - yg = repmat(obuftypefun(zeros(1)), [osize{:}]); + yg = zeros([osize{:}], 'like', obufproto); % for half types, alias the weights/data, recast the positions as % single @@ -307,7 +340,7 @@ [yg, apod, x] = dealfun(@(x)getfield(alias(x),'val'), yg, apod, x); end - % combine timing infor with the transmit positions + % combine timing info with the transmit positions Pv(4,:) = t0; % partition data per frame @@ -316,6 +349,7 @@ % for each data frame, run the kernel switch fun case {'DAS','SYN','BF','MUL'} + if ~isreal(obufproto), yg = complex(yg); end % force complex if x is % I [x N [x M]] x 1 x 1 x {F x ...} for f = F:-1:1 % beamform each data frame y{f} = k.feval(yg, Pi, Pr, Pv, Nv, apod, cinv, [astride, cstride], x(:,:,:,f), flagnum, [fs, fmod]); diff --git a/kern/convd.m b/kern/convd.m index b6023079..146b37bf 100644 --- a/kern/convd.m +++ b/kern/convd.m @@ -10,10 +10,13 @@ % C = CONVD(A, B, dim, shape) selects the shape of the output. Must be one % of {'full'*|'same'|'valid'}. The default is 'full'. % -% C = CONVD(..., 'gpu', false) selects whether to use a gpu. If true, -% a ptx-file will be used if compiled. If false or if no ptx-file is -% available, native MATLAB code is used. The default is true if x or y is a -% gpuArray. +% C = CONVD(..., 'gpu', true) selects whether to use a gpu. A ptx-file will +% be used if compiled. The default is true if x or y is a gpuArray. +% +% C = CONVD(..., 'ocl', true) selects whether to use an OpenCL device if +% OpenCL support is available. If the currently selected device does not +% support the precision of the data, this argument is ignored. The default +% is true if OpenCL support is available via Matlab-OpenCL. % % C = CONVD(..., 'parenv', clu) or C = CONVD(..., 'parenv', pool) or % performs the convolution in parallel on the parcluster clu or the parpool @@ -24,35 +27,32 @@ % [C, lags] = CONVD(...) returns the lags of y in the same dimension as the % computation. % -% % Example: -% % Compute and plot the cross-correlation of two 16-sample -% % exponential sequences +% Example: +% % Create two 16 x 4 exponential sequences +% n = (1:16)-1; +% m = (1:4)'-1; +% xa = single(0.84).^(n+m); +% xb = single(0.92).^(n+m); % -% N = 16; -% M = 4; -% n = (0:N-1); -% m = (0:M-1)'; -% a = 0.84; -% b = 0.92; -% xa = a.^(n+m); -% xb = b.^(n+m); +% % Compute iteratively % z0 = 0*xa; % initialize -% for i = 1:M +% for i = 1:4 % z0(i,:) = conv(xa(i,:), xb(i,:), 'same'); % end -% z0 -% z = convd(xa,xb,2,'same') -% isequal(z, z0) +% +% % Compute with convd +% z = convd(xa,xb,2,'same'); +% isalmostn(z, z0) % % See also CONV CONV2 CONVN -% TODO: update this for tall variables? 
arguments
-    x {mustBeNumeric}
-    y {mustBeNumeric}
+    x {mustBeFloat}
+    y {mustBeFloat}
     dim (1,1) {mustBePositive, mustBeInteger} = findSingletonDim(x, y)
     shape (1,1) string {mustBeMember(shape, ["full", "same", "valid"])} = 'full'
     kwargs.gpu (1,1) logical = isa(x, 'gpuArray') || isa(y, 'gpuArray')
+    kwargs.ocl (1,1) logical = exist('oclDevice', 'file') && ~isempty(oclDevice())
     kwargs.parenv {mustBeScalarOrEmpty, mustBeA(kwargs.parenv, ["parallel.Cluster", "parallel.Pool", "double"])} = gcp('nocreate') % parallel environment
     kwargs.lowmem (1,1) logical = 16 * max(numel(x), numel(y)) > 16 * 2^30; % default true if complex double storage of either argument exceeds 16GB
 end
@@ -87,11 +87,11 @@
 % get the proper lags for this computation
 switch shape
     case 'full'
-        lags = colon(-(N - 1), M - 1).';
+        lags = colon(-(N - 1), M - 1);
     case 'same'
-        lags = colon(0, M - 1).' - floor((N-1)/2);
+        lags = colon(0, M - 1) - floor((N-1)/2);
     case 'valid'
-        lags = colon(0, M - N).';
+        lags = colon(0, M - N);
 end

 % get kernel sizing info
@@ -105,37 +105,20 @@
 [xstr, ystr, zstr] = deal(C);
 sizes = [xstr, M, ystr, N, zstr, L, C];

-% whether/how to operate on GPU
-if kwargs.gpu && exist('convd.ptx', 'file') && exist('convd.cu', 'file')
-    % get the kernel suffix
-    suffix = '';
-    if complex_type, suffix = [suffix 'c']; end
+% whether/how to operate with CUDA/OpenCL
+use_gdev = kwargs.gpu && exist('convd.ptx', 'file') && exist('convd.cu', 'file');
+use_odev = kwargs.ocl && exist('oclDevice', 'file') && ~isempty(oclDevice()) && exist('convd.cl', 'file');
+if use_odev % validate the device supports the type
+    dev = oclDevice(); % get device
     switch dtype
-        case 'single', suffix = [suffix 'f'];
-        case 'halfT', suffix = [suffix 'h'];
-    end
-
-    % specify the kernel
-    kern = parallel.gpu.CUDAKernel( ...
-        'convd.ptx', 'convd.cu', ['conv' suffix] ...
-        );
-
-    % setup the execution size
-    if C == 1
-        blk = [1 min(L, kern.MaxThreadsPerBlock) 1];
-        grd = ceil([1 L S] ./ blk);
-    else
-        blk = [min(C, kern.MaxThreadsPerBlock), 1, 1];
-        grd = ceil([C L S] ./ blk);
+        case 'double', use_odev = dev.SupportsDouble;
+        case 'halfT' , use_odev = dev.SupportsHalf;
     end
-    kern.ThreadBlockSize = blk;
-    kern.GridSize = grd;
-
-    % define the constant data parameters: care must be taken to match the
-    % datatype here: for NVIDIA gpus, size_t <-> uint64
-    kern.setConstantMemory('L0', int32(l0));
-
-    % if complex, ensure both all arguments complex
+end
+
+% dispatch based on device (native vs. kernel)
+if use_gdev || use_odev
+    % if complex, ensure all arguments are complex
     if complex_type && isreal(x), x = complex(x); end
     if complex_type && isreal(y), y = complex(y); end
@@ -154,8 +137,53 @@
         z = x; x = sub(z, 1:M, dim); % (shared-copy ?)
     end
+
+    % get the kernel
+    if use_gdev
+        % get the kernel suffix
+        suffix = '';
+        if complex_type, suffix = [suffix 'c']; end
+        switch dtype
+            case 'single', suffix = [suffix 'f'];
+            case 'halfT', suffix = [suffix 'h'];
+        end
+
+        % specify the kernel
+        kern = parallel.gpu.CUDAKernel( ...
+            'convd.ptx', 'convd.cu', ['conv' suffix] ...
+ ); + + % define the constant data parameters: care must be taken to match the + % datatype here: for NVIDIA gpus, size_t <-> uint64 + kern.setConstantMemory('L0', int32(l0)); + + else + % reference kernel + kern = oclKernel('convd.cl'); + + % define typing + switch dtype + case 'single', prc = 32; typ = 'single'; + case 'double', prc = 64; typ = 'double'; + case 'halfT' , prc = 16; typ = 'half' ; + end + kern.defineTypes({typ}); % {T} + kern.macros = [kern.macros, "QUPS_PRECISION="+prc]; + if complex_type, kern.macros = [kern.macros, "QUPS_COMPLEX"]; end + + % define the constant data parameters + kern.macros = [kern.macros, "QUPS_L0="+int32(l0)]; + end + + % setup the execution size + if C == 1, blk = [1 min(L, kern.MaxThreadsPerBlock) 1]; + else, blk = [min(C, kern.MaxThreadsPerBlock), 1, 1]; + end + kern.ThreadBlockSize = blk; + kern.GridSize = ceil([C L S] ./ blk); + % run the kernel - z = kern.feval(x,y,z,sizes); + z = kern.feval(x,y,z,sizes); else % vectorized MATLAB on CPU - perform convolution manually for vectors @@ -201,13 +229,12 @@ % return to original sizing sz(dim) = L; z = reshape(z, sz); - end % cast to output type / dimensions - try to use implicit in-place assignment % z = cast(z, 'like', To); C = z; % shared-copy -lags = shiftdim(lags, 1-dim); % put lags in same dimensions as operation +lags = swapdim(lags, 2, dim); % put lags in same dimensions as operation function d = findSingletonDim(A, B) dA = find(size(A) ~= 1,1,'first'); diff --git a/kern/interpd.m b/kern/interpd.m index b5eee836..99b15c2f 100644 --- a/kern/interpd.m +++ b/kern/interpd.m @@ -55,9 +55,9 @@ % get the data sizing T = size(x, dim); I = size(t, dim); -if isempty(mdms), N = 1; else, N = prod(size(t, mdms)); end -if isempty(rdms), M = 1; else, M = prod(size(t, rdms)); end -if isempty(rdms), F = 1; else, F = prod(size(x, rdms)); end +N = prod(esize(t, mdms)); +M = prod(esize(t, rdms)); +F = prod(esize(x, rdms)); % move the input data to the proper dimensions for the GPU kernel if ~isequal(ord, 1:length(ord)) % avoid data copy if already ordered @@ -68,29 +68,34 @@ % function to determine type isftype = @(x,T) strcmp(class(x), T) || any(arrayfun(@(c)isa(x,c),["tall", "gpuArray"])) && strcmp(classUnderlying(x), T); +if exist('oclDeviceCount','file') && oclDeviceCount(), ocl_dev = oclDevice(); else, ocl_dev = []; end % get oclDevice if supported + +use_gdev = exist('interpd.ptx', 'file') ... + && ( isa(x, 'gpuArray') || isa(t, 'gpuArray') || isa(x, 'halfT') && x.gtype) ... + && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])); ... + +use_odev = exist('interpd.cl', 'file') ... + && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) ... + && ~isempty(ocl_dev) && ... + ( (isftype(x,'double') && ocl_dev.SupportsDouble) ... + || (isftype(x,'single') ) ... always supported + || (isftype(x,'halfT' ) && ocl_dev.SupportsHalf )); + % use ptx on gpu if available or use native MATLAB -if exist('interpd.ptx', 'file') ... - && ( isa(x, 'gpuArray') || isa(t, 'gpuArray') ) ... 
- && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) +if use_gdev || use_odev % determine the data type if isftype(x, 'double') - suffix = "" ; [x,t] = dealfun(@double, x, t); + suffix = "" ; prc = 64; [x,t] = dealfun(@double, x, t); elseif isftype(x, 'single') - suffix = "f"; [x,t] = dealfun(@single, x, t); - elseif isftype(x, 'halfT' ) - suffix = "h"; [x,t] = dealfun(@(x)gpuArray(halfT(x)), x, t); % custom type + suffix = "f"; prc = 32; [x,t] = dealfun(@single, x, t); + elseif isftype(x, 'halfT' ) + suffix = "h"; prc = 16; [x,t] = dealfun(@(x)gpuArray(halfT(x)), x, t); % custom type else + suffix = "f"; prc = 32; warning("Datatype " + class(x) + " not recognized as a GPU compatible type."); - suffix = "f" ; end - % grab the kernel reference - k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'interpd' + suffix); - k.setConstantMemory('QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_N', uint64(N), 'QUPS_M', uint64(M), 'QUPS_F', uint64(F)); - k.ThreadBlockSize = min(k.MaxThreadsPerBlock,M*I); % I and M are indexed together - k.GridSize = [ceil(I*M ./ k.ThreadBlockSize(1)), N, 1]; - % translate the interp flag switch interp case "nearest", flagnum = 0; @@ -100,6 +105,9 @@ otherwise, error('Interp option not recognized: ' + string(interp)); end + % cache data prototype + x0 = zeros(0, 'like', x); + % condition inputs/outputs osz = [I, max(size(t,2:maxdims), size(x,2:maxdims))]; x = complex(x); % enforce complex type @@ -108,15 +116,43 @@ y = complex(gpuArray(halfT(repelem(extrapval,osz)))); [y_,x_,t_] = dealfun(@(x)x.val, y,x,t); otherwise % others - y = repmat(cast(extrapval, 'like', x), osz); + y = complex(repmat(cast(extrapval, 'like', x), osz)); [y_,x_,t_] = deal(y,x,t); end - + + if use_gdev + % grab the kernel reference + k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'interpd' + suffix); + + % set constants + k.setConstantMemory('QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_N', uint64(N), 'QUPS_M', uint64(M), 'QUPS_F', uint64(F)); + + elseif use_odev + % get the kernel reference + k = oclKernel(which('interpd.cl'), 'interpd'); + + % set the data types + switch prc, case 16, tp = 'uint16'; case 32, tp = 'single'; case 64, tp = 'double'; end + k.defineTypes({tp,tp}); % all aliases are this type + + % configure the kernel sizing and options + k.macros = "QUPS_" + ["I", "T", "S", "N", "F"] + "=" + uint64([I T M N F]); % size constants + k.macros = [k.macros, "QUPS_INTERPD_" + ["NO_V","FLAG"] ... 
+ + "=" + ["0."+suffix, flagnum]]; % input constants + k.macros(end+1) = "QUPS_PRECISION="+prc; + % k.opts = ["-cl-fp32-correctly-rounded-divide-sqrt", "-cl-mad-enable"]; + end + + % kernel sizing + k.ThreadBlockSize(1) = min(k.MaxThreadsPerBlock,M*I); % I and M are indexed together + k.GridSize = [ceil(I*M ./ k.ThreadBlockSize(1)), N, 1]; + % sample y_ = k.feval(y_, x_, t_, flagnum); % compute % restore type - switch suffix, case "h", y.val = y; otherwise y = y_; end + if isreal(x0), y_ = real(y_); end + switch suffix, case "h", y.val = y; otherwise, y = y_; end else % get new dimension mapping [~, tmp] = cellfun(@(x) ismember(x, ord), {dim, mdms, rdmst, rdmsx}, 'UniformOutput',false); @@ -154,4 +190,4 @@ y = ipermute(y, ord); end - +function sz = esize(x, sz), if ~isempty(sz), sz = size(x, sz); end diff --git a/kern/wbilerpg.m b/kern/wbilerpg.m index b795c330..ddb7e2d2 100644 --- a/kern/wbilerpg.m +++ b/kern/wbilerpg.m @@ -1,42 +1,45 @@ function [cxy, ixo, iyo] = wbilerpg(x, y, xa, ya, xb, yb) -% WBILERPG - Weighted bilinear interpolation weights (GPU-enabled) +% WBILERPG - Weighted bilinear interpolation weights (GPU/OpenCL-enabled) % -% [cxy, ixo, iyo] = WBILERPG(x, y, xa, ya, xb, yb) takes in a pair of grid -% vectors x, y and an array of pairs of endpoints (xa,ya) to (xb,yb) and +% [cxy, ixo, iyo] = WBILERPG(x, y, xa, ya, xb, yb) takes in a pair of grid +% vectors x, y and an array of pairs of endpoints (xa,ya) to (xb,yb) and % returns the interpolation weights cxy, the x-indices ixo, and the % y-indices iyo that describe the bi-linear inteprolation weights for each % segment of the line from (xa,xb) to (xb,yb) as it intersects the x-grid % and y-grid and their indices on the x-grid and y-grid. The arrays xa, ya, % xb, yb must all be the same size. -% +% % Each output is size [4 x (X + Y + 1)] x size(xa, 1:ndims(xa)) where X and % Y are the length of the x and y vectors. The indices ixo and iyo are % 1-based. If the index is invalid, ixo and iyo are 0. Multiple idenitcal % indices represent contributions from different line segments intersecting % the same grid index. These can be added afterwards to form the full % weights. +% +% Note: This function requires that either a CUDA-enabled gpu or a working +% implementation of OpenCL via Matlab-OpenCL be available. % % The following example shows how a sparse matrix can be generated by % implicitly adding overlapping pixel weights. 
% % Example:
% % Create a grid
-% [x, y] = deal(-5:5, -5:5); % 11 x 11 grid
+% [x, y ] = deal(single(-5:5), single(-5:5)); % 11 x 11 grid
% [xa, ya] = deal(-4, +1);
% [xb, yb] = deal(+3, -2);
%
% % Get the interpolation weights
% [cxy, ixo, iyo] = wbilerpg(x, y, xa, ya, xb, yb); % get the line segment weights
-%
+%
% % Create a sparse matrix, implicitly summing weights from neighboring
% % line segments
% val = (ixo ~= 0) & (iyo ~= 0); % filter out invalid indices
-% s = sparse(ixo(val), iyo(val), cxy(val), numel(x), numel(y));
-%
+% s = sparse(ixo(val), iyo(val), double(cxy(val)), numel(x), numel(y));
+%
% figure;
% pcolor(x, y, full(s)')
% title("Weights from ("+xa+","+ya+"), to ("+xb+","+yb+").");
-%
+%
% See also XIAOLINWU_K_SCALED SPARSE WBILERP

 arguments
@@ -68,16 +71,45 @@
 [cxy] = deal(zeros([4*N,Ia], 'like', xa));

 % determine the suffix based on the type
-uclass = class(xa);
+uclass = class(xa); if uclass == "gpuArray", uclass = classUnderlying(xa); end
 switch uclass
-    case "double",suffix = "";
-    case "single",suffix = "f";
-    case "halfT", suffix = "h"; cxy = alias(cxy); % ensure aliased - we can index directly
+    case "double",prc = 64; suffix = "";
+    case "single",prc = 32; suffix = "f";
+    case "halfT", prc = 16; suffix = "h"; cxy = alias(cxy); % ensure aliased - we can index directly
 end

+% parse device
+use_gdev = logical(gpuDeviceCount());
+use_odev = exist('oclDevice','file') && ~isempty(oclDevice());
+
 % specify the kernel (must be on the path)
-kern = parallel.gpu.CUDAKernel('wbilerp.ptx', 'wbilerp.cu', "wbilerp" + suffix);
+if use_gdev
+    % reference kernel
+    kern = parallel.gpu.CUDAKernel('wbilerp.ptx', 'wbilerp.cu', "wbilerp" + suffix);
+elseif use_odev
+    % reference kernel
+    kern = oclKernel('wbilerp.cl');
+
+    % specify aliased types
+    kern.defineTypes({uclass});
+    kern.macros(end+1) = "QUPS_PRECISION="+prc;
+
+    % validate precision support
+    if (uclass == "double") && (~kern.Device.SupportsDouble) ...
+    || (uclass == "halfT") && (~kern.Device.SupportsHalf)
+        error("QUPS:wbilerpg:precisionNotSupported", ...
+            "Device " + kern.Device.Index + " (" + kern.Device.Name + ") " + ...
+            "does not support " + uclass + " precision." ...
+            );
+    end
+else
+    % no kernel device found
+    error("QUPS:wbilerpg:supportedDeviceNotFound", ...
+        "wbilerpg() requires either a supported CUDA enabled GPU " + ...
+        "or a working OpenCL implementation via Matlab-OpenCL. Use wbilerp instead." ...
+        );
+end

 % proceed by which transform should be applied
 for ineg = [false, true], for isteep = [false, true], for irev = [false, true]
@@ -85,7 +117,7 @@
     % indices matching this transform
     ind = yneg == ineg & isteep == steep & irev == reverse;
     I = nnz(ind);
-    
+
     % if no data follows this transform, skip this batch
     if I == 0, continue; end
@@ -106,7 +138,7 @@
     % sort in x, then y, so that we have a non-decreasing line
     [ux, uy, vx, vy, m, mi] = dealfun(@(x) shiftdim(x(:), -2), ux, uy, vx, vy, m, mi); % move to 1 x 1 x I
-    
+
     % all points ([N+1] x 2 x I)
     pall = [...
         ux, uy; ... endpoint
@@ -132,7 +164,7 @@
     ixyo = complex(ixyo, ixyo); % alias (x,y) -> (real, imag)

     % setup the execution size
-    kern.ThreadBlockSize = min(I,kern.MaxThreadsPerBlock);
+    kern.ThreadBlockSize(1) = min(I,kern.MaxThreadsPerBlock);
     kern.GridSize = [ceil(I / kern.ThreadBlockSize(1)), 1, 1];

     % run the kernel
@@ -144,7 +176,7 @@
         ixyo, cxyo.val, pall.val, ...
         xg.val, yg.val, X, Y, I, ...
         pml.val, pmu.val ...
- ); + ); end % set correct output for x and y in original coordinates diff --git a/kern/wsinterpd.m b/kern/wsinterpd.m index 5ca99d96..4d4b1654 100644 --- a/kern/wsinterpd.m +++ b/kern/wsinterpd.m @@ -88,11 +88,23 @@ % function to determine type isftype = @(x,T) strcmp(class(x), T) || any(arrayfun(@(c)isa(x,c),["tall", "gpuArray"])) && strcmp(classUnderlying(x), T); -% use ptx on gpu if available or use native MATLAB -if exist('interpd.ptx', 'file') ... +if exist('oclDeviceCount','file') && oclDeviceCount(), ocl_dev = oclDevice(); else, ocl_dev = []; end % get oclDevice if supported + +use_gdev = exist('interpd.ptx', 'file') ... && ( isa(x, 'gpuArray') || isa(t, 'gpuArray') || isa(x, 'halfT') && x.gtype) ... && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) ... - && real(omega) == 0 + && real(omega) == 0; + +use_odev = exist('interpd.cl', 'file') ... + && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) ... + && real(omega) == 0 ... + && ~isempty(ocl_dev) && ... + ( (isftype(x,'double') && ocl_dev.SupportsDouble) ... + || (isftype(x,'single') ) ... always supported + || (isftype(x,'halfT' ) && ocl_dev.SupportsHalf )); + +% use ptx on gpu if available or use native MATLAB +if use_gdev || use_odev % get stride for weighting wstride = size(w,1:S); @@ -112,14 +124,14 @@ % determine the data type if isftype(x, 'double') - suffix = "" ; [x,t,w] = dealfun(@double, x, t, w); + suffix = "" ; prc = 64; [x,t,w] = dealfun(@double, x, t, w); elseif isftype(x, 'single') - suffix = "f"; [x,t,w] = dealfun(@single, x, t, w); - elseif isftype(x, 'halfT' ) - suffix = "h"; [x,t,w] = dealfun(@(x)gpuArray(halfT(x)), x, t, w); % custom type + suffix = "f"; prc = 32; [x,t,w] = dealfun(@single, x, t, w); + elseif isftype(x, 'halfT' ) + suffix = "h"; prc = 16; [x,t,w] = dealfun(@(x)gpuArray(halfT(x)), x, t, w); % custom type else + suffix = "f"; prc = 32; warning("Datatype " + class(x) + " not recognized as a GPU compatible type."); - suffix = "f" ; end % get the data sizing @@ -128,15 +140,6 @@ N = prod(esize(t, mdms)); F = prod(esize(x, rdmsx)); - % grab the kernel reference - k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'wsinterpd' + suffix); - k.setConstantMemory( ... - 'QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_S', uint64(S), ... - 'QUPS_N', uint64(N), 'QUPS_F', uint64(F) ... - ); - k.ThreadBlockSize = min(k.MaxThreadsPerBlock,I); % I and M are indexed together - k.GridSize = [ceil(I ./ k.ThreadBlockSize(1)), min(N, 2^15), ceil(N/2^15)]; - % translate the interp flag switch interp case "nearest", flagnum = 0; @@ -146,9 +149,9 @@ otherwise, error('Interp option not recognized: ' + string(interp)); end - % enforce complex type for gpu data - if isa(x, 'gpuArray') || isa(x,'halfT') && isa(x.val, 'gpuArray'), x = complex(x); end - if isa(w, 'gpuArray') || isa(w,'halfT') && isa(w.val, 'gpuArray'), w = complex(w); end + % enforce complex type for ocl or gpu data + if use_odev || isa(x, 'gpuArray') || (isa(x,'halfT') && isa(x.val, 'gpuArray')), x = complex(x); end + if use_odev || isa(w, 'gpuArray') || (isa(w,'halfT') && isa(w.val, 'gpuArray')), w = complex(w); end switch suffix case "h", y = complex(gpuArray(halfT(zeros(osz)))); @@ -158,7 +161,38 @@ [w_,x_,t_] = deal(w,x,t); % copy data y_ = zeros(osz, 'like', x_); % pre-allocate output end + % zeros: uint16(0) == storedInteger(half(0)), so this is okay + if use_gdev + % grab the kernel reference + k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'wsinterpd' + suffix); + k.setConstantMemory( ... 
+ 'QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_S', uint64(S), ... + 'QUPS_N', uint64(N), 'QUPS_F', uint64(F) ... + ); + cargs = {flagnum, imag(omega)}; % extra const arguments + elseif use_odev + + % get the kernel reference + k = oclKernel(which('interpd.cl'), 'wsinterpd'); + + % set the data types + switch prc, case 16, t = 'uint16'; case 32, t = 'single'; case 64, t = 'double'; end + k.defineTypes({t,t}); % all aliases are this type + + % configure the kernel sizing and options + k.macros = "QUPS_" + ["I", "T", "S", "N", "F"] + "=" + uint64([I T S N F]); % size constants + k.macros = [k.macros, "QUPS_INTERPD_" + ["NO_V","FLAG","OMEGA"] ... + + "=" + ["0."+suffix, flagnum, imag(omega)]]; % input constants + k.macros(end+1) = "QUPS_PRECISION="+prc; + % k.opts = ["-cl-fp32-correctly-rounded-divide-sqrt", "-cl-mad-enable"]; + cargs = {}; + end + + % kernel sizing + k.ThreadBlockSize(1) = min(k.MaxThreadsPerBlock,I); % local group size + k.GridSize = [ceil(I ./ k.ThreadBlockSize(1)), min(N, 2^15), ceil(N/2^15)]; + % index label flags iflags = zeros([1 maxdims], 'uint8'); iflags(mdms) = 1; @@ -168,10 +202,11 @@ strides = cat(1,wstride,ystride,tstride,xstride); % compute - y_ = k.feval(y_, w_, x_, t_, dsizes, iflags, strides, flagnum, imag(omega)); + y_ = k.feval(y_, w_, x_, t_, dsizes, iflags, strides, cargs{:}); % for halfT, store the data back in the output switch suffix, case "h", y.val = y_; otherwise, y = y_; end + else % promote half types diff --git a/kern/wsinterpd2.m b/kern/wsinterpd2.m index 1519a146..5ea4b47b 100644 --- a/kern/wsinterpd2.m +++ b/kern/wsinterpd2.m @@ -95,11 +95,23 @@ % function to determine type isftype = @(x,T) strcmp(class(x), T) || any(arrayfun(@(c)isa(x,c),["tall", "gpuArray"])) && strcmp(classUnderlying(x), T); -% use ptx on gpu if available or use native MATLAB -if exist('interpd.ptx', 'file') ... +if exist('oclDeviceCount','file') && oclDeviceCount(), ocl_dev = oclDevice(); else, ocl_dev = []; end % get oclDevice if supported + +use_gdev = exist('interpd.ptx', 'file') ... && ( isa(x, 'gpuArray') || isa(t1, 'gpuArray') || isa(t2, 'gpuArray') || isa(x, 'halfT') && x.gtype) ... && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) ... - && real(omega) == 0 + && real(omega) == 0; + +use_odev = exist('interpd.cl', 'file') ... + && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) ... + && real(omega) == 0 ... + && ~isempty(ocl_dev) && ... + ( (isftype(x,'double') && ocl_dev.SupportsDouble) ... + || (isftype(x,'single') ) ... 
always supported + || (isftype(x,'halfT' ) && ocl_dev.SupportsHalf )); + +% use ptx on gpu if available or use native MATLAB +if use_gdev || use_odev % get stride for weighting wstride = sz2stride(size(w,1:S)); @@ -117,14 +129,14 @@ % determine the data type if isftype(x, 'double') - suffix = "" ; [x,t1,t2,w] = dealfun(@double, x, t1, t2, w); + suffix = "" ; prc = 64; [x,t1,t2,w] = dealfun(@double, x, t1, t2, w); elseif isftype(x, 'single') - suffix = "f"; [x,t1,t2,w] = dealfun(@single, x, t1, t2, w); + suffix = "f"; prc = 32; [x,t1,t2,w] = dealfun(@single, x, t1, t2, w); elseif isftype(x, 'halfT' ) - suffix = "h"; [x,t1,t2,w] = dealfun(@(x)gpuArray(halfT(x)), x, t1, t2, w); % custom type + suffix = "h"; prc = 16; [x,t1,t2,w] = dealfun(@(x)gpuArray(halfT(x)), x, t1, t2, w); % custom type else + suffix = "f" ; prc = 32; warning("Datatype " + class(x) + " not recognized as a GPU compatible type."); - suffix = "f" ; end % get the data sizing @@ -133,15 +145,6 @@ N = prod(esize(x, mdms)); F = prod(esize(x, rdmsx)); - % grab the kernel reference - k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'wsinterpd2' + suffix); - k.setConstantMemory( ... - 'QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_S', uint64(S), ... - 'QUPS_N', uint64(N), 'QUPS_F', uint64(F) ... - ); - k.ThreadBlockSize = min(k.MaxThreadsPerBlock,I); % I and M are indexed together - k.GridSize = [ceil(I ./ k.ThreadBlockSize(1)), min(N, 2^15), ceil(N/2^15)]; - % translate the interp flag switch interp case "nearest", flagnum = 0; @@ -152,8 +155,8 @@ end % enforce complex type for gpu data - if isa(x, 'gpuArray') || isa(x, 'halfT'), x = complex(x); end - if isa(w, 'gpuArray') || isa(w, 'halfT'), w = complex(w); end + if use_odev || isa(x, 'gpuArray') || isa(x, 'halfT'), x = complex(x); end + if use_odev || isa(w, 'gpuArray') || isa(w, 'halfT'), w = complex(w); end switch suffix case "h", y = complex(gpuArray(halfT(zeros(osz)))); @@ -163,7 +166,37 @@ [w_,x_,t1_,t2_] = deal(w,x,t1,t2); % copy data y_ = zeros(osz, 'like', x_); % pre-allocate output end - % zeros: uint16(0) == storedInteger(half(0)), so this is okay + % zeros: uint16(0) == storedInteger(half(0)), so this is okay + + if use_gdev + % grab the kernel reference + k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'wsinterpd2' + suffix); + k.setConstantMemory( ... + 'QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_S', uint64(S), ... + 'QUPS_N', uint64(N), 'QUPS_F', uint64(F) ... + ); + cargs = {flagnum, imag(omega)}; % constant arguments + elseif use_odev + % get the kernel reference + k = oclKernel(which('interpd.cl'), 'wsinterpd2'); + + % set the data types + switch prc, case 16, t = 'uint16'; case 32, t = 'single'; case 64, t = 'double'; end + k.defineTypes({t,t}); % all aliases are this type + + % configure the kernel sizing and options + k.macros = "QUPS_" + ["I", "T", "S", "N", "F"] + "=" + uint64([I T S N F]); % size constants + k.macros = [k.macros, "QUPS_INTERPD_" + ["NO_V","FLAG","OMEGA"] ... 
+ + "=" + ["0."+suffix, flagnum, imag(omega)]]; % input constants + k.macros(end+1) = "QUPS_PRECISION="+prc; + % k.opts = ["-cl-fp32-correctly-rounded-divide-sqrt", "-cl-mad-enable"]; + cargs = {}; + end + + % kernel sizing + k.ThreadBlockSize(1) = min(k.MaxThreadsPerBlock,I); % I and M are indexed together + k.GridSize = [ceil(I ./ k.ThreadBlockSize(1)), min(N, 2^15), ceil(N/2^15)]; + % index label flags iflags = zeros([1 maxdims], 'uint8'); % increase index i iflags(mdms) = 1; % increase index n @@ -173,7 +206,7 @@ strides = cat(1,wstride,ystride,t1stride,t2stride,xstride); % compute - y_ = k.feval(y_, w_, x_, t1_, t2_, dsizes, iflags, strides, flagnum, imag(omega)); + y_ = k.feval(y_, w_, x_, t1_, t2_, dsizes, iflags, strides, cargs{:}); % for halfT, store the data back in the output switch suffix, case "h", y.val = y_; otherwise, y = y_; end diff --git a/src/UltrasoundSystem.m b/src/UltrasoundSystem.m index 795e3fd3..29626574 100644 --- a/src/UltrasoundSystem.m +++ b/src/UltrasoundSystem.m @@ -499,7 +499,7 @@ function delete(self) element_subdivisions (1,2) double {mustBeInteger, mustBePositive} = [1,1] end arguments - kwargs.device (1,1) {mustBeInteger} = -logical(gpuDeviceCount()); % gpu device + kwargs.device (1,1) {mustBeInteger} = -1 * (logical(gpuDeviceCount()) || (exist('oclDevice','file') && ~isempty(oclDevice()))) kwargs.interp (1,1) string {mustBeMember(kwargs.interp, ["linear", "nearest", "next", "previous", "spline", "pchip", "cubic", "makima", "freq", "lanczos3"])} = 'cubic' kwargs.tall (1,1) logical = false; % whether to use a tall type kwargs.bsize (1,1) {mustBeInteger, mustBePositive} = 1; % number of simulataneous scatterers @@ -545,6 +545,18 @@ function delete(self) wv.fs = kwargs.fsk; kern = wv.samples; + % choose device + use_dev = kwargs.device && ismember(kwargs.interp, ["nearest", "linear", "cubic", "lanczos3"]); + use_gdev = use_dev && exist('greens.ptx', 'file') && gpuDeviceCount(); % use the GPU kernel + use_odev = use_dev && exist('oclDeviceCount','file') && oclDeviceCount(); % use the OpenCL kernel + if use_odev + if kwargs.device > 0, oclDevice(kwargs.device); end % select device + dev = oclDevice(); % reference + use_odev = isUnderlyingType(kern, "single") ... + || (isUnderlyingType(kern, "double") && dev.SupportsDouble) ... + || (isUnderlyingType(kern, "half" ) && dev.SupportsHalf); + end + F = numel(scat); % if kwargs.verbose, hw = waitbar(0); end @@ -569,15 +581,15 @@ function delete(self) pos = scat(f).pos; % 3 x S amp = scat(f).amp; % 1 x S fso = self.fs; % output channel data sampling frequency - if kwargs.device && exist('greens.ptx', 'file') ... 
% use the GPU kernel - && (ismember(kwargs.interp, ["nearest", "linear", "cubic", "lanczos3"])) + + if use_gdev || use_odev % function to determine type isftype = @(x,T) strcmp(class(x), T) || any(arrayfun(@(c)isa(x,c),["tall", "gpuArray"])) && strcmp(classUnderlying(x), T); % determine the data type - if isftype(kern, 'double'), suffix = "" ; cfun = @double; - elseif isftype(kern, 'single'), suffix = "f"; cfun = @single; - elseif isftype(kern, 'halfT' ), suffix = "h"; cfun = @(x) alias(halfT(x)); + if isftype(kern, 'double'), typ = "double"; prc = 64; suffix = "" ; cfun = @double; + elseif isftype(kern, 'single'), typ = "single"; prc = 32; suffix = "f"; cfun = @single; + elseif isftype(kern, 'halfT' ), typ = "halfT" ; prc = 16; suffix = "h"; cfun = @(x) alias(halfT(x)); else, error("Datatype " + class(kern) + " not recognized as a GPU compatible type."); end @@ -599,7 +611,7 @@ function delete(self) [QI, QS, QT, QN, QM] = deal(scat(f).numScat, length(t), length(kern), N, M); % compute the minimum and maximum distance for each scatterer - ps = gpuArray(ps); + if use_gdev, ps = gpuArray(ps); end [rminrx, rmaxrx, rmintx, rmaxtx] = deal(+inf, -inf, +inf, -inf); for p = self.tx.positions, rminrx = min(rminrx, vecnorm(ps - p, 2, 1)); end % minimum scat time for p = self.tx.positions, rmaxrx = max(rmaxrx, vecnorm(ps - p, 2, 1)); end % maximum scat time @@ -615,13 +627,29 @@ function delete(self) sb = ([rmin; rmax] ./ c0 + t0x - t0k) * fso + [0; QT]; % grab the kernel reference - k = parallel.gpu.CUDAKernel('greens.ptx', 'greens.cu', 'greens' + suffix); - k.setConstantMemory( 'QUPS_S', uint64(QS) ); % always set S - try k.setConstantMemory('QUPS_T', uint64(QT), ... - 'QUPS_N', uint64(QN), 'QUPS_M', uint64(QM) ... - ); end % already set by const compiler - try k.setConstantMemory('QUPS_I', uint64(QI)); end % might be already set by const compiler - k.ThreadBlockSize = min([k.MaxThreadsPerBlock, 32]); % smaller is better + if use_gdev + k = parallel.gpu.CUDAKernel('greens.ptx', 'greens.cu', 'greens' + suffix); + k.setConstantMemory( 'QUPS_S', uint64(QS) ); % always set S + try k.setConstantMemory('QUPS_T', uint64(QT), ... + 'QUPS_N', uint64(QN), 'QUPS_M', uint64(QM) ... 
+ ); end % already set by const compiler + try k.setConstantMemory('QUPS_I', uint64(QI)); end % might be already set by const compiler + elseif use_odev + k = oclKernel('greens.cl'); + k.macros = [k.macros, "QUPS_"+["S","T","N","M","I"]+"="+uint64([QS,QT,QN,QM,QI])]; % set constants + k.macros(end+1) = "QUPS_PRECISION="+prc; + k.defineTypes(repmat(typ,[1,3]), ["V","T","U"]); % time / data / position + + % enforce complexity + [x, as, kn] = dealfun(@complex, x, as, kn); + + % expand positions 3D -> 4D + [ps(4,:), pn(1,4,:), pv(1,4,:)] = deal(0); + + end + + % set kernel sizing + k.ThreadBlockSize(1) = min([k.MaxThreadsPerBlock, 32]); % smaller is better k.GridSize = [ceil(QS ./ k.ThreadBlockSize(1)), N, M]; % get the computational bounds @@ -2485,7 +2513,7 @@ function delete(self) c0(:,:,:,1,1) {mustBeNumeric} = self.seq.c0 kwargs.fmod (1,1) {mustBeNumeric} = 0 kwargs.prec (1,1) string {mustBeMember(kwargs.prec, ["single", "double", "halfT"])} = "single" - kwargs.device (1,1) {mustBeInteger} = -1 * logical(gpuDeviceCount) + kwargs.device (1,1) {mustBeInteger} = -1 * (logical(gpuDeviceCount()) || (exist('oclDevice','file') && ~isempty(oclDevice()))) kwargs.apod {mustBeNumericOrLogical} = 1 kwargs.interp (1,1) string {mustBeMember(kwargs.interp, ["linear", "nearest", "next", "previous", "spline", "pchip", "cubic", "makima", "freq", "lanczos3"])} = 'cubic' kwargs.keep_tx (1,1) logical = false @@ -2530,7 +2558,7 @@ function delete(self) if ~isequal(ord, 1:3), chd = rectifyDims(chd); end % reorder if necessary % get the beamformer arguments - dat_args = {chd.data, gather(chd.t0), gather(chd.fs), c0, 'device', kwargs.device, 'position-precision', kwargs.prec}; % data args + dat_args = {chd.data, gather(chd.t0), gather(chd.fs), c0, 'device', kwargs.device, 'input-precision', kwargs.prec}; % data args if isfield(kwargs, 'interp'), interp_args = {'interp', kwargs.interp}; else, interp_args = {}; end ext_args = [interp_args, apod_args]; % extra args diff --git a/src/bf.cl b/src/bf.cl new file mode 100644 index 00000000..53be7ff0 --- /dev/null +++ b/src/bf.cl @@ -0,0 +1,138 @@ +// # include "helper_math.h" // vector math + +// # include "sizes.cu" // size defines + +# include "interpolators.cl" // samplers using constant sizing + +// # include "half2_math.h" // vector math for half types only + +/* +* Delay and sum the given data at the given pixels +* +* Given a set of pixels, (virtual or plane wave) transmitter locations, +* receiver locations, as well as a datacube equipped with a time, +* transmitter and receiver axis, perform simple delay-and-sum beamforming. +* The data is linearly interpolated at the sample time. An image is +* generated for each receiver element. Summation across the transmitters +* and receivers is implicit. +* +* All positions are in vector coordinates. +* +* If the virtual transmitter +* normal has a fourth component that is 0, this indicates that the +* transmission should be treated as a plane wave transmission instead of a +* virtual source transmission. +* +* The value of t = 0 must be the time when the peak of the wavefront +* reaches the virtual source location. 
Because this time must be the same +* for all transmits, the datacube must be stitched together in such a way +* that for all transmits, the same time axis is used +* +* Inputs: +* y: complex pixel values per channel (I) +* Pi: pixel positions (3 x I) +* Pr: receiver positions (3 x N) +* Pv: (virtual) transmitter positions (3 x M) +* Nv: (virtual) transmitter normal (3 x M) +* x: datacube of complex sample values (T x M x N) +* t0: initial time for the data +* fs: sampling frequency of the data +* cinv: inverse of the speed of sound used for beamforming +* +* I -> pixels, M -> transmitters, N -> receivers, T -> time samples +* +*/ + +# if QUPS_PRECISION == 32 +typedef float4 U4; +# elif QUPS_PRECISION == 64 +typedef double4 U4; +# elif QUPS_PRECISION == 16 +typedef float4 U4; +# endif + +kernel void DAS(volatile global T2 * y, + const global U4 * pi, const global U4 * pr, + const global U4 * pv, const global U4 * nv, + const global T2 * a, const global V * cinv, const global ulong * acstride, + const global T2 * x, const int flag, const global V t0fsfc[2]) { + + // unpack inputs + #ifndef QUPS_BF_FLAG + const int QUPS_BF_FLAG = flag; // set only if not fixed at compile time + #endif + // const V t0 = t0fsfc[0]; // start time + const V fs = t0fsfc[0]; // sampling frequency + const V fc = t0fsfc[1]; // modulation frequency + const global ulong * astride = acstride; + const global ulong * cstride = acstride + 5; + + // rename for readability + const size_t N = QUPS_N, M = QUPS_M, T = QUPS_T, I = QUPS_I; + const size_t I1 = QUPS_I1, I2 = QUPS_I2, I3 = QUPS_I3; + + // get starting index of this pixel + const size_t tid = get_global_id(0); // pixel index + const size_t kI = get_global_size(0); // number of pixels per call + + // temp vars + const T2 zero_v = (T2)(0, 0); + T2 w = (T2)(1, 0); + V dv, dr, tau; + T2 val, pix; + U4 rv = (U4)(0); + + // if valid pixel, for each tx/rx + for(size_t i = tid; i < I; i += kI){ + // get image coordinates + const size_t i1 = (i % QUPS_I1); // index in I1 + const size_t i2 = (i / QUPS_I1) % QUPS_I2 ; // index in I2 + const size_t i3 = (i / (I1 * I2) % QUPS_I3); // index in I3 + const size_t abase = i1 * astride[0] + i2 * astride[1] + i3 * astride[2]; // base index for this pixel + const size_t cbase = i1 * cstride[0] + i2 * cstride[1] + i3 * cstride[2]; // base index for this pixel + + // reset accumulator + pix = zero_v; + + for(size_t m = 0; m < M; ++m){ + for(size_t n = 0; n < N; ++n){ + // 2-way virtual path distance + // const U3 pvm = {pv[m].x,pv[m].y,pv[m].z}; // declared for MSVC (2019) + rv.xyz = pi[i].xyz - pv[m].xyz; // (virtual) transmit to pixel vector + + dv = QUPS_VS ? // tx path length + copysign(length(rv), (QUPS_DV ? 1.f : dot(rv.xyz, nv[m].xyz))) // virtual source + : dot(rv.xyz, nv[m].xyz); // plane wave + + dr = length(pi[i].xyz - pr[n].xyz); // rx path length + + // data/time index number + const V ci = cinv[cbase + n * cstride[3] + m * cstride[4]]; + tau = (ci * (dv + dr) - pv[m].w); + + // apply demodulation if non zero + if (fc) {w.x = cospi(2*fc*tau); w.y = sinpi(2*fc*tau);} + + // sample the trace + val = sample(&x[(n + m * N) * T], tau * fs, flag & 7, zero_v); // out of bounds: extrap 0 + // const int t = (int) (tau * fs); + // val = (0 <= t & t < T) ? 
x[(size_t) (t + (n + m * N) * T)] : zero_v; + + // apply apodization (requires complex multiplication) + val = cmul(val, cmul(w, a[abase + n * astride[3] + m * astride[4]])); + + // choose the accumulation + const int sflag = ((int)QUPS_BF_FLAG) & 24; // extract bits 5,4 + if(sflag == 8) + y[i + n*I] += val; // sum over tx, store over rx + else if (sflag == 16) + y[i + m*I] += val; // sum over rx, store over tx + else if (sflag == 24) + y[i + n*I + m*N*I] = val; // store over tx/rx + else + pix += val; // sum over all + } + } + if (!(((int)QUPS_BF_FLAG) & 24)) y[i] = pix; // output value here if accumulating over all + } +} diff --git a/src/bf.cu b/src/bf.cu index d44551f6..4eba16eb 100644 --- a/src/bf.cu +++ b/src/bf.cu @@ -47,8 +47,7 @@ void __device__ DAS_temp(U2 * __restrict__ y, const U * __restrict__ Pi, const U * __restrict__ Pr, const U * __restrict__ Pv, const U * __restrict__ Nv, const U2 * __restrict__ a, const U * __restrict__ cinv, const size_t * acstride, - const U2 * __restrict__ x, - const U t0fsfc[3], const int flag) { + const U2 * __restrict__ x, const U t0fsfc[2], const int flag) { // unpack // const U t0 = t0fsfc[0]; // start time @@ -138,7 +137,7 @@ __global__ void DAS(double2 * __restrict__ y, const double * __restrict__ Pv, const double * __restrict__ Nv, const double2 * __restrict__ a, const double * __restrict__ cinv, const size_t * acstride, const double2 * __restrict__ x, const int iflag, - const double tvars[3]) { + const double tvars[2]) { DAS_temp(y, Pi, Pr, Pv, Nv, a, cinv, acstride, x, tvars, iflag); } @@ -147,7 +146,7 @@ __global__ void DASf(float2 * __restrict__ y, const float * __restrict__ Pv, const float * __restrict__ Nv, const float2 * __restrict__ a, const float * __restrict__ cinv, const size_t * acstride, const float2 * __restrict__ x, const int iflag, - const float tvars[3]) { + const float tvars[2]) { DAS_temp(y, Pi, Pr, Pv, Nv, a, cinv, acstride, x, tvars, iflag); } @@ -158,7 +157,7 @@ __global__ void DASh(ushort2 * __restrict__ y, const float * __restrict__ Pv, const float * __restrict__ Nv, const ushort2 * __restrict__ a, const float * __restrict__ cinv, const size_t * acstride, const ushort2 * __restrict__ x, const int iflag, - const float tvars[3]) { + const float tvars[2]) { DAS_temp((half2 *)y, Pi, Pr, Pv, Nv, (const half2 *)a, cinv, acstride, (const half2 *)x, tvars, iflag); } #endif @@ -210,7 +209,7 @@ __global__ void delaysf(float * __restrict__ tau, // reinterpret inputs as vector pointers (makes loading faster and indexing easier) const float3 * pi = reinterpret_cast(Pi); // 3 x I const float3 * pr = reinterpret_cast(Pr); // 3 x N - const float3 * pv = reinterpret_cast(Pv); // 3 x M + const float4 * pv = reinterpret_cast(Pv); // 4 x M const float3 * nv = reinterpret_cast(Nv); // 3 x M // rename for readability @@ -227,7 +226,8 @@ __global__ void delaysf(float * __restrict__ tau, # pragma unroll for(size_t n = 0; n < N; ++n){ // 2-way virtual path distance - rv = pi[i] - pv[m]; // (virtual) transmit to pixel vector + const float3 pvm = {pv[m].x,pv[m].y,pv[m].z}; // declared for MSVC (2019) + rv = pi[i] - pvm; // (virtual) transmit to pixel vector dv = QUPS_VS ? // tx path length copysign(length(rv), (QUPS_DV ? 
1.f : dot(rv, nv[m]))) // virtual source @@ -255,7 +255,7 @@ __global__ void delays(double * __restrict__ tau, // reinterpret inputs as vector pointers (makes loading faster and indexing easier) const double3 * pi = reinterpret_cast(Pi); // 3 x I const double3 * pr = reinterpret_cast(Pr); // 3 x N - const double3 * pv = reinterpret_cast(Pv); // 3 x M + const double4 * pv = reinterpret_cast(Pv); // 4 x M const double3 * nv = reinterpret_cast(Nv); // 3 x M // rename for readability @@ -272,7 +272,8 @@ __global__ void delays(double * __restrict__ tau, # pragma unroll for(size_t n = 0; n < N; ++n){ // 2-way virtual path distance - rv = pi[i] - pv[m]; // (virtual) transmit to pixel vector + const double3 pvm = {pv[m].x,pv[m].y,pv[m].z}; // declared for MSVC (2019) + rv = pi[i] - pvm; // (virtual) transmit to pixel vector dv = QUPS_VS ? // tx path length copysign(length(rv), (QUPS_DV ? 1.f : dot(rv, nv[m]))) // virtual source diff --git a/src/conv_cuda.cu b/src/conv_cuda.cu deleted file mode 100644 index cbb25ec2..00000000 --- a/src/conv_cuda.cu +++ /dev/null @@ -1,152 +0,0 @@ -#if (__CUDA_ARCH__ >= 530) -#define __CUDA_NO_HALF2_OPERATORS__ // block half2 vector math operators -#include // define half/half2 types, without half2 operators -#endif - -// real/complex conjugation -inline __host__ __device__ float conj(const float a) { - return a; -} -inline __host__ __device__ double conj(const double a) { - return a; -} -#if (__CUDA_ARCH__ >= 530) -inline __host__ __device__ half conj(const half a) { - return a; -} -#endif - -inline __host__ __device__ float2 conj(const float2 a) { - return make_float2(a.x, -a.y); -} -inline __host__ __device__ double2 conj(const double2 a) { - return make_double2(a.x, -a.y); -} -#if (__CUDA_ARCH__ >= 530) -inline __host__ __device__ half2 conj(const half2 a) { - return make_half2(a.x, -a.y); -} -#endif - - -// complex multiplication -inline __host__ __device__ float2 operator*(const float2 a, const float2 b) { - return make_float2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); -} -inline __host__ __device__ double2 operator*(const double2 a, const double2 b) { - return make_double2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); -} -#if (__CUDA_ARCH__ >= 530) -inline __host__ __device__ half2 operator*(const half2 a, const half2 b) { - return make_half2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); -} -#endif - -inline __host__ __device__ float2 operator*(const float2 a, const float b){ - return make_float2(b*a.x, b*a.y); -} -inline __host__ __device__ double2 operator*(const double2 a, const double b){ - return make_double2(b*a.x, b*a.y); -} -#if (__CUDA_ARCH__ >= 530) -inline __host__ __device__ half2 operator*(const half2 a, const half b){ - return make_half2(b*a.x, b*a.y); -} -#endif - -// real/complex addition/assignment -inline __host__ __device__ void operator+=(float2 &a, const float2 b){ - a.x += b.x; - a.y += b.y; -} -inline __host__ __device__ void operator+=(double2 &a, const double2 b){ - a.x += b.x; - a.y += b.y; -} -#if (__CUDA_ARCH__ >= 530) -inline __host__ __device__ void operator+=(half2 &a, const half2 b){ - a.x += b.x; - a.y += b.y; -} -#endif - - -/* -* Compute the cross correlation of two sets of data. The data will be -* correlated in the first dimension. M >= N must be satisfied. 
-* -* Inputs: -* x: first signal (M x S) -* y: second signal (N x S) -* -* Outputs: -* z: resulting cross correlation -* -* -*/ -# ifndef M -__constant__ size_t M; -# endif -# ifndef N -__constant__ size_t N; -# endif -# ifndef L -__constant__ size_t L; // total number of lags -# endif -# ifndef L0 -__constant__ int L0; // starting lag -# endif - -// xcorr template -template -inline __device__ void conv_temp(const T* x, const T* y, T* z, T za){ - /* xcorr_temp(const T* x, const T* y, T* z, T za) - x, y: input array pointer(s) - za: 0 value for the data type - z: output array pointer - cross correlation - */ - - // get lag and stride indices - const int l = threadIdx.x + blockDim.x*blockIdx.x; // lag index - const size_t s = threadIdx.y + blockDim.y*blockIdx.y; // slice - - // if valid lag indices, multiply and accumulate in-place - if(l < L) - # pragma unroll - for(int i = 0, j = L0 - l; i < M || j < N; ++i, ++j) - if(0 <= i && i < M && 0 <= j && j < N) // signal in bounds - za += x[i+s*M] * conj(y[j+s*N]); // accum the cross product - - // output result: - if(l < L) - z[l+s*L] = za; -} - -// xcorr kernels -__global__ void convf(const float* x, const float* y, float* z){ - conv_temp(x, y, z, 0.0f); -} - -__global__ void conv(const double* x, const double* y, double* z){ - conv_temp(x, y, z, 0.0); -} -#if (__CUDA_ARCH__ >= 530) -__global__ void convh(const ushort* x, const ushort* y, ushort* z){ - conv_temp((half*)x, (half*)y, (half*)z, 0.0f); -} -#endif -__global__ void convcf(const float2* x, const float2* y, float2* z){ - conv_temp(x, y, z, make_float2(0.0f,0.0f)); -} - -__global__ void convc(const double2* x, const double2* y, double2* z){ - conv_temp(x, y, z, make_double2(0.0,0.0)); -} -#if (__CUDA_ARCH__ >= 530) -__global__ void convch(const ushort2* x, const ushort2* y, ushort2* z){ - conv_temp((half2*)x, (half2*)y, (half2*)z, make_half2(0.0f, 0.0f)); -} -#endif - - diff --git a/src/convd.cl b/src/convd.cl new file mode 100644 index 00000000..ccaacb3a --- /dev/null +++ b/src/convd.cl @@ -0,0 +1,91 @@ +// DEBUG: for linter purposes +/* +# ifndef QUPS_PRECISION +# define QUPS_PRECISION 32 +# endif + +# ifndef QUPS_COMPLEX +# define QUPS_COMPLEX +# endif +*/ + +# if QUPS_PRECISION == 32 + # ifdef QUPS_COMPLEX +typedef float2 T; + # else +typedef float T; + # endif +# elif QUPS_PRECISION == 16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable // must enable half precision + # ifdef QUPS_COMPLEX +typedef half2 T; + # else +typedef half T; + # endif +# elif QUPS_PRECISION == 64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable // must enable double precision + # ifdef QUPS_COMPLEX +typedef double2 T; + # else +typedef double T; + # endif +# endif + + + +// complex multiplication +# ifdef QUPS_COMPLEX +inline T cmul(const T a, const T b){ return (T)(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);} +# else +inline T cmul(const T a, const T b){ return a * b;} +# endif + +/* +* Compute the cross correlation of two sets of data. The data will be +* correlated in the first dimension. M >= N must be satisfied. 
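+*
+* The sizes argument mirrors the host-side vector assembled in convd.m,
+* sizes = [xstr, M, ystr, N, zstr, L, C]: a (stride, length) pair for each
+* of x, y, and z, plus the channel count C.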
+* +* Inputs: +* x: first signal (M x S) +* y: second signal (N x S) +* sizes: sizing info (undocumented) +* +* Outputs: +* z: resulting cross correlation +* +* +*/ + +kernel void conv(global const T* x, global const T* y, global T* z, const global int * sizes){ + // get stride and len + const int istr = sizes[0]; + const int ilen = sizes[1]; + const int jstr = sizes[2]; + const int jlen = sizes[3]; + const int lstr = sizes[4]; + const int llen = sizes[5]; + const int clen = sizes[6]; + + const int I = ilen * clen; + const int J = jlen * clen; + const int L = llen * clen; + + const int c = get_global_id(0); // column output index + const int l = get_global_id(1); // strided output index + const int s = get_global_id(2); // batch index + + // accumulator + T za = (T) 0.f; + + // if valid lag indices, multiply and accumulate in-place + // TODO: apply work-groups and striding for faster accumulation + if(l < L && c < clen) + # pragma unroll + for(int i = 0, j = (int)(QUPS_L0) - l, jr = jlen - 1 - j; i < ilen || jr >= 0; ++i, --jr) // forward and reverse iterate over both signals + if(0 <= i && i < ilen && 0 <= jr && jr < jlen) // if both signals in bounds + za += cmul(x[c + i*istr + s*I] , y[c + jr*jstr + s*J]); // accumulate the cross product + + // output result: + if(l < L && c < clen) + z[c + l*lstr + s*L] = za; +} + diff --git a/src/greens.cl b/src/greens.cl new file mode 100644 index 00000000..75dcbdf6 --- /dev/null +++ b/src/greens.cl @@ -0,0 +1,75 @@ +// # include "helper_math.h" // vector math + +// # include "sizes.cu" // size defines + +# include "interpolators.cl" // samplers using constant sizing + +# if QUPS_PRECISION == 32 +typedef float4 U4; +# elif QUPS_PRECISION == 64 +typedef double4 U4; +# elif QUPS_PRECISION == 16 +typedef float4 U4; +# endif + +kernel void greens(global T2 * __restrict__ y, + global const U4 * __restrict__ pi, global const T2 * __restrict__ a, + global const U4 * __restrict__ pr, global const U4 * __restrict__ pv, + global const T2 * __restrict__ x, global const V * __restrict__ sb, global const ulong * iblock, + global const V s0t0fscinv[5], + global const int * E, const int iflag + ) { + + // extract time parameters + const V s0 = s0t0fscinv[0]; + const V t0 = s0t0fscinv[1]; + const V fs = s0t0fscinv[2]; + const V fsr = s0t0fscinv[3]; + const V cinv = s0t0fscinv[4]; + + // get starting index of this scatterer + const ulong s = get_global_id(0); // time + const ulong n = get_global_id(1); // rx + const ulong m = get_global_id(2); // tx + + // rename for readability + const ulong N = QUPS_N, S = QUPS_S, M = QUPS_M, I = QUPS_I; //, T = QUPS_T; + // rxs, num scat, output time size, txs, kernel time size, + // S is size of output, T is size of input kernel, I the number of scats + + // temp vars + // const float ts = s + s0*fs; // compute the time index for this thread + const T2 zero_v = (T2)(0, 0), fpow = (T2)(fsr, fsr); // OOB value, power scaling + V r, tau; // length, time (tmp values) + T2 val = zero_v; // accumulator + + // if valid scat, for each tx/rx + if(s < S){ + for(ulong i = iblock[2*get_group_id(0)+0]; i <= iblock[2*get_group_id(0)+1] && i < I; ++i){ // for each scatterer + if(s >= sb[2*i+0]){ // if within sampling window + # pragma unroll + for(ulong me = 0; me < E[1]; ++me){ // for each tx sub-aperture + # pragma unroll + for(ulong ne = 0; ne < E[0]; ++ne){ // for each rx sub-aperture + + // 2-way path distance + r = (length(pi[i] - pr[n + ne*N]) + length(pi[i] - pv[m + me*M])); // (virtual) transmit to pixel vector + + // get kernel delay 
diff --git a/src/greens.cl b/src/greens.cl
new file mode 100644
index 00000000..75dcbdf6
--- /dev/null
+++ b/src/greens.cl
@@ -0,0 +1,75 @@
+// # include "helper_math.h" // vector math
+
+// # include "sizes.cu" // size defines
+
+# include "interpolators.cl" // samplers using constant sizing
+
+# if QUPS_PRECISION == 32
+typedef float4 U4;
+# elif QUPS_PRECISION == 64
+typedef double4 U4;
+# elif QUPS_PRECISION == 16
+typedef float4 U4; // (positions stay in single precision for half data)
+# endif
+
+kernel void greens(global T2 * __restrict__ y,
+    global const U4 * __restrict__ pi, global const T2 * __restrict__ a,
+    global const U4 * __restrict__ pr, global const U4 * __restrict__ pv,
+    global const T2 * __restrict__ x, global const V * __restrict__ sb, global const ulong * iblock,
+    global const V s0t0fscinv[5],
+    global const int * E, const int iflag
+    ) {
+
+    // extract time parameters
+    const V s0   = s0t0fscinv[0];
+    const V t0   = s0t0fscinv[1];
+    const V fs   = s0t0fscinv[2];
+    const V fsr  = s0t0fscinv[3];
+    const V cinv = s0t0fscinv[4];
+
+    // get the global indices for this thread
+    const ulong s = get_global_id(0); // time
+    const ulong n = get_global_id(1); // rx
+    const ulong m = get_global_id(2); // tx
+
+    // rename for readability:
+    // N: # of rxs, S: output time length, M: # of txs, I: # of scatterers
+    // (T, unused here, is the input kernel time length)
+    const ulong N = QUPS_N, S = QUPS_S, M = QUPS_M, I = QUPS_I; //, T = QUPS_T;
+
+    // temp vars
+    // const float ts = s + s0*fs; // compute the time index for this thread
+    const T2 zero_v = (T2)(0, 0), fpow = (T2)(fsr, fsr); // OOB value, power scaling
+    V r, tau; // length, time (tmp values)
+    T2 val = zero_v; // accumulator
+
+    // if a valid output time sample, for each scatterer and tx/rx sub-aperture
+    if(s < S){
+        for(ulong i = iblock[2*get_group_id(0)+0]; i <= iblock[2*get_group_id(0)+1] && i < I; ++i){ // for each scatterer
+            if(s >= sb[2*i+0]){ // if within the sampling window
+                # pragma unroll
+                for(ulong me = 0; me < E[1]; ++me){ // for each tx sub-aperture
+                    # pragma unroll
+                    for(ulong ne = 0; ne < E[0]; ++ne){ // for each rx sub-aperture
+
+                        // 2-way path distance: scatterer to rx + scatterer to (virtual) tx
+                        r = (length(pi[i] - pr[n + ne*N]) + length(pi[i] - pv[m + me*M]));
+
+                        // get the kernel delay for the scatterer
+                        tau = (V)s - (cinv * r + t0 - s0)*fs;
+
+                        // sample the kernel and add to the signal at this time
+                        // fsr applies a 'stretch' operation to the sample time, because the
+                        // input data x is sampled at sampling frequency fsr * fs
+                        val += cmul(a[i], sample(x, fsr*tau, iflag, zero_v)); // out of bounds: extrap 0
+                    }
+                }
+            }
+        }
+
+        // output the signal once all scatterers and sub-apertures are sampled
+        // normalize by the sampling ratio (element-wise division scales both components by fsr)
+        y[s + n*S + m*N*S] = val / fpow;
+    }
+}
+
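The `QUPS_*` sizes and the `T2`/`U4`/`V` types referenced above are resolved at JIT-compile time rather than in the source. A hedged sketch of the corresponding host-side build step follows; the macro names are taken from the kernels, while the values and the include directory are placeholders:

// hypothetical JIT build of greens.cl; values are placeholders
const char *opts = "-Ipath/to/src "
                   "-DQUPS_PRECISION=32 -DQUPS_T=512 -DQUPS_S=2048 "
                   "-DQUPS_N=64 -DQUPS_M=64 -DQUPS_I=10000";
cl_int err = clBuildProgram(program, 1, &device, opts, NULL, NULL);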
diff --git a/src/interpd.cl b/src/interpd.cl
new file mode 100644
index 00000000..4888c3cb
--- /dev/null
+++ b/src/interpd.cl
@@ -0,0 +1,291 @@
+// defaults
+# ifndef QUPS_INTERPD_FLAG
+# define QUPS_INTERPD_FLAG 2
+# endif
+# ifndef QUPS_INTERPD_NO_V
+# define QUPS_INTERPD_NO_V 0.f
+# endif
+# ifndef QUPS_INTERPD_OMEGA
+# define QUPS_INTERPD_OMEGA 0
+# endif
+
+# include "interpolators.cl"
+
+// (T2, U, V) = channel data type, time data type, time-sampling type (set at JIT compile time)
+kernel void interpd(global T2 * y, global const T2 * x, global const U * tau, const int flag) {
+
+    // get sampling index
+    const size_t tid = get_global_id(0);
+    const size_t n   = get_global_id(1);
+    // (the m index is folded into work dimension 0 and recovered below)
+
+    // rename for readability
+    const size_t I = QUPS_I, M = QUPS_S, N = QUPS_N, T = QUPS_T, F = QUPS_F;
+    const T2 no_v = QUPS_INTERPD_NO_V;
+
+    // remap indices
+    const size_t i = tid % I;
+    const size_t m = tid / I;
+
+    // if a valid sample, for each frame
+    if(i < I && n < N && m < M){
+        # pragma unroll
+        for(size_t f = 0; f < F; ++f){ // per frame
+            y[i + n*I + m*N*I + f*M*N*I] = sample(&x[n*T + f*N*T], (V)(tau[i + n*I + m*I*N]), flag, no_v);
+        }
+    }
+}
+
+// reference CAS-loop implementations, kept for reference (unused) ----------
+/*
+void __attribute__((always_inline)) atomic_add_f(volatile global float* addr, const float val) {
+    union {
+        uint  u32;
+        float f32;
+    } next, expected, current;
+    current.f32 = *addr;
+    do {
+        next.f32    = (expected.f32=current.f32)+val; // ...*val for atomic_mul_f()
+        current.u32 = atomic_cmpxchg((volatile global uint*)addr, expected.u32, next.u32);
+    } while(current.u32!=expected.u32);
+}
+
+#ifdef cl_khr_int64_base_atomics
+// #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+void __attribute__((always_inline)) atomic_add_d(volatile global double* addr, const double val) {
+    union {
+        ulong  u64;
+        double f64;
+    } next, expected, current;
+    current.f64 = *addr;
+    do {
+        next.f64    = (expected.f64=current.f64)+val; // ...*val for atomic_mul_d()
+        current.u64 = atom_cmpxchg((volatile global ulong*)addr, expected.u64, next.u64);
+    } while(current.u64!=expected.u64);
+}
+#endif
+*/
+// --------------------------------------------------------------------
+
+// atomicAdd is not natively supported for floating point types
+// for double
+#ifdef cl_khr_fp64
+double atomicAdd(volatile global double* address, double val)
+{
+    // view data as a double (.f) or 64-bit integer (.i)
+    union {double f; long i;} old, curr, nxt;
+    old.f = *address; // previous value
+
+    do {
+        curr.i = old.i; // current value
+        nxt.f  = (curr.f + val); // desired next value
+        // attempt to swap (atom_cmpxchg is the 64-bit form, from cl_khr_int64_base_atomics)
+        old.i  = atom_cmpxchg((volatile global long *) address, curr.i, nxt.i);
+
+        // Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN)
+    } while (curr.i != old.i);
+
+    return old.f;
+}
+#endif
+
+// for float
+float atomicAddf(volatile global float* address, float val)
+{
+    // view data as a float (.f) or int (.i)
+    union {float f; int i;} old, curr, nxt;
+    old.f = *address; // previous value
+
+    do {
+        curr.i = old.i; // current value
+        nxt.f  = (curr.f + val); // desired next value
+        old.i  = atomic_cmpxchg((volatile global int *)address, curr.i, nxt.i); // attempt to swap
+
+        // Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN)
+    } while (curr.i != old.i);
+
+    return old.f;
+}
+
+// for half2
+#ifdef cl_khr_fp16
+half2 atomicAddh(volatile global half2* address, half2 val)
+{
+    // view data as a half2 (.f) or int (.i): both halves are swapped in one 32-bit CAS
+    union {half2 f; int i;} old, curr, nxt;
+    old.f = *address; // previous value
+
+    do {
+        curr.i = old.i; // current value
+        nxt.f  = (curr.f + val); // desired next value
+        old.i  = atomic_cmpxchg((volatile global int *) address, curr.i, nxt.i); // attempt to swap
+
+        // Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN)
+    } while (curr.i != old.i);
+
+    return old.f;
+}
+#endif
+
+// add using the atomic functions for each data type
+#ifdef cl_khr_fp16
+inline void atomicAddStoreh(volatile global half2 * y, const half2 val){
+    atomicAddh(y, val);
+}
+#endif
+inline void atomicAddStoref(volatile global float2 * y, const float2 val){
+    atomicAddf(((volatile global float*) y) + 0, val.x);
+    atomicAddf(((volatile global float*) y) + 1, val.y);
+}
+#ifdef cl_khr_fp64
+inline void atomicAddStore(volatile global double2 * y, const double2 val){
+    atomicAdd(((volatile global double*) y) + 0, val.x);
+    atomicAdd(((volatile global double*) y) + 1, val.y);
+}
+#endif
+
+// compute the global (flat) offset for the logical index dind = {i, n, f},
+// where iflags labels each of the QUPS_S data dimensions with 0 (I), 1 (N), or 2 (F)
+size_t global_offset(const size_t * dind, const global size_t * sizes, const global uchar * iflags){
+    // init
+    size_t dsz[3] = {1,1,1}; // {I,N,F} index cumulative sizes
+    size_t str, j = 0; // stride, output index
+
+    // find offset
+    # pragma unroll
+    for(char i = 0; i < 3; ++i){ // each label
+        str = 1; // reset stride
+        # pragma unroll
+        for(size_t s = 0; s < QUPS_S; ++s){ // for each data dimension
+            if(i == iflags[s]){ // matching label
+                const size_t k = (dind[i] / dsz[i]) % sizes[s]; // get sub-index
+                dsz[i] *= sizes[s]; // increase size for this label
+                j += str * k; // add offset
+            }
+            str *= sizes[s]; // increase indexing stride
+        }
+    }
+
+    return j;
+}
+
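An editor's illustration (not part of the patch): a plain-C transcription of `global_offset()` showing how `iflags` maps labelled data dimensions onto a flat offset. The sizes, labels, and indices below are made up for the example:

#include <stdio.h>

static size_t global_offset_ref(const size_t dind[3], const size_t *sizes,
                                const unsigned char *iflags, size_t S) {
    size_t dsz[3] = {1, 1, 1}; // cumulative size seen so far per label
    size_t j = 0;              // flat output offset
    for (int i = 0; i < 3; ++i) {      // for each label {I=0, N=1, F=2}
        size_t str = 1;                // stride of data dimension s
        for (size_t s = 0; s < S; ++s) {
            if (iflags[s] == i) {      // dimension s carries label i
                size_t k = (dind[i] / dsz[i]) % sizes[s]; // sub-index
                dsz[i] *= sizes[s];
                j += str * k;
            }
            str *= sizes[s];
        }
    }
    return j;
}

int main(void) {
    // data laid out as (4 x 8 x 2), labelled (I, N, F)
    size_t sizes[3] = {4, 8, 2};
    unsigned char iflags[3] = {0, 1, 2};
    size_t dind[3] = {3, 5, 1}; // logical (i, n, f)
    // expect 3 + 5*4 + 1*4*8 = 55
    printf("%zu\n", global_offset_ref(dind, sizes, iflags, 3));
    return 0;
}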
+// (T2, U, V) = channel data type, time data type, time-sampling type (set at JIT compile time)
+kernel void wsinterpd(volatile global T2 * y,
+    const global T2 * w, const global T2 * x,
+    const global U * tau, const global ulong * sizes,
+    const global uchar * iflags, const global ulong * dstride){
+    // , const int flag, const T2 no_v, const U omega){
+
+    // alias constant inputs
+    const int flag  = QUPS_INTERPD_FLAG;
+    const T2  no_v  = QUPS_INTERPD_NO_V;
+    const U   omega = QUPS_INTERPD_OMEGA;
+
+    // get sampling index
+    const ulong i = get_global_id(0);
+    const ulong n = get_global_id(1);
+
+    // rename for readability
+    const ulong I = QUPS_I, N = QUPS_N, F = QUPS_F, S = QUPS_S, T = QUPS_T; // , M = QUPS_M
+    ulong u,v,k,l,sz; // weighting / output indexing
+
+    // if a valid sample, per each (i,n,f)
+    if(i < I && n < N){
+        # pragma unroll
+        for(ulong f = 0; f < F; ++f){ // for each frame f
+            // global index
+            const ulong dind[3] = {i,n,f};
+            const ulong j = global_offset(dind, sizes, iflags);
+
+            // get the weight / output / time / sample indices
+            k = 0, l = 0; u = 0; v = 0;
+            # pragma unroll
+            for(ulong s = 0; s < S; ++s){ // for each dimension s
+                // the indexing stride for dimension s, i.e. the
+                // product of the sizes of all prior dimensions
+                sz = 1;
+                # pragma unroll
+                for(ulong sp = 0; sp < s; ++sp)
+                    sz *= sizes[sp];
+
+                const ulong js = ((j / sz) % sizes[s]); // index for this dimension
+                k += js * dstride[0 + 4*s]; // add pitched index for this dim (weights)
+                l += js * dstride[1 + 4*s]; // add pitched index for this dim (outputs)
+                u += js * dstride[2 + 4*s]; // add pitched index for this dim (time)
+                v += js * dstride[3 + 4*s]; // add pitched index for this dim (samples)
+            }
+
+            const T2 a = (omega != 0) ? (T2)(cos(omega * tau[u]), sin(omega * tau[u])) : (T2)(1.f, 0.f); // modulation phasor
+            const T2 val = cmul(a, cmul(w[k], sample(&x[(v)*T], (V)tau[u], flag, no_v))); // weighted sample
+
+# if QUPS_PRECISION == 64
+            atomicAddStore(y + l, val); // store
+# elif QUPS_PRECISION == 32
+            atomicAddStoref(y + l, val); // store
+# elif QUPS_PRECISION == 16
+            atomicAddStoreh(y + l, val); // store
+# endif
+
+        }
+    }
+}
+
+// (T2, U, V) = channel data type, time data type, time-sampling type (set at JIT compile time)
+kernel void wsinterpd2(volatile global T2 * y,
+    const global T2 * w, const global T2 * x,
+    const global U * tau1, const global U * tau2, const global ulong * sizes,
+    const global uchar * iflags, const global ulong * dstride){
+    // , const int flag, const T2 no_v, const U omega) {
+
+    // alias constant inputs
+    const int flag  = QUPS_INTERPD_FLAG;
+    const T2  no_v  = QUPS_INTERPD_NO_V;
+    const U   omega = QUPS_INTERPD_OMEGA;
+
+    // get sampling index
+    const size_t i = get_global_id(0);
+    const size_t n = get_global_id(1);
+
+    // rename for readability
+    const size_t I = QUPS_I, N = QUPS_N, F = QUPS_F, S = QUPS_S, T = QUPS_T; // , M = QUPS_M
+    size_t r,u,v,k,l,sz; // weighting / output indexing
+
+    // if a valid sample, per each (i,n,f)
+    if(i < I && n < N){
+        # pragma unroll
+        for(size_t f = 0; f < F; ++f){ // for each frame f
+            // global index
+            size_t dind[3] = {i,n,f};
+            const size_t j = global_offset(dind, sizes, iflags);
+
+            // get the weight / output / time / sample indices
+            k = 0, l = 0; r = 0; u = 0; v = 0;
+            # pragma unroll
+            for(size_t s = 0; s < S; ++s){ // for each dimension s
+                // the indexing stride for dimension s, i.e. the
+                // product of the sizes of all prior dimensions
+                sz = 1;
+                for(size_t sp = 0; sp < s; ++sp)
+                    sz *= sizes[sp];
+
+                const size_t js = ((j / sz) % sizes[s]); // index for this dimension
+                k += js * dstride[0 + 5*s]; // add pitched index for this dim (weights)
+                l += js * dstride[1 + 5*s]; // add pitched index for this dim (outputs)
+                r += js * dstride[2 + 5*s]; // add pitched index for this dim (time-1)
+                u += js * dstride[3 + 5*s]; // add pitched index for this dim (time-2)
+                v += js * dstride[4 + 5*s]; // add pitched index for this dim (samples)
+            }
+            const U t = tau1[r] + tau2[u]; // time
+            const T2 a = (omega != 0) ? (T2)(cos(omega * t), sin(omega * t)) : (T2)(1.f, 0.f); // modulation phasor
+            const T2 val = cmul(a, cmul(w[k], sample(&x[(v)*T], (V)t, flag, no_v))); // weighted sample
+# if QUPS_PRECISION == 64
+            atomicAddStore(&y[l], val); // store
+# elif QUPS_PRECISION == 32
+            atomicAddStoref(&y[l], val); // store
+# elif QUPS_PRECISION == 16
+            atomicAddStoreh(&y[l], val); // store
+# endif
+        }
+    }
+}
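The `dstride` tables above are read as a fixed number of strides per data dimension: 4 for `wsinterpd` and 5 for `wsinterpd2`. A hedged host-side sketch of that packing follows; the stride arrays (`wstr`, `ystr`, `tstr`, `xstr`) are hypothetical, and a stride of 0 in a given dimension would reuse the same element, i.e. broadcast that array along the dimension:

// hypothetical packing of dstride for wsinterpd (4 strides per dimension)
#include <CL/cl.h>

void pack_dstride(size_t S, const cl_ulong *wstr, const cl_ulong *ystr,
                  const cl_ulong *tstr, const cl_ulong *xstr, cl_ulong *dstride) {
    for (size_t s = 0; s < S; ++s) {
        dstride[0 + 4*s] = wstr[s]; // weights w
        dstride[1 + 4*s] = ystr[s]; // outputs y
        dstride[2 + 4*s] = tstr[s]; // sample times tau
        dstride[3 + 4*s] = xstr[s]; // input samples x (per T-length block)
    }
    // wsinterpd2 would use the same scheme with 5 entries per dimension:
    // {weights, outputs, time-1, time-2, samples}
}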
diff --git a/src/interpolators.cl b/src/interpolators.cl
new file mode 100644
index 00000000..faf59db7
--- /dev/null
+++ b/src/interpolators.cl
@@ -0,0 +1,162 @@
+/**
+ @file gpuBF/interp1d.cuh
+ @author Dongwoon Hyun (dongwoon.hyun@stanford.edu)
+ @date 2021-03-08
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Modified for use as a stand-alone OpenCL file (Thurston Brevett)
+// data is (T x N x F)
+// sample times are (I x N x M)
+
+// enable 64-bit integer atomics (must be supported by the device!)
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+
+// DEBUG: constant defines for debugging / if not specified at compile time
+// default precision
+// # ifndef QUPS_PRECISION
+// # define QUPS_PRECISION 32
+// # endif
+
+// DEBUG: to avoid linter errors
+/*
+# ifndef QUPS_T
+# define QUPS_T 1024
+# define QUPS_N 1
+# define QUPS_M 1
+# define QUPS_F 1
+# define QUPS_I 1
+# define QUPS_S 1024
+# endif
+*/
+
+// should be JIT-compiled with (T2, U, V) set to the complex-data, real-data, and time types
+# if QUPS_PRECISION == 32
+typedef float2 T2;
+typedef float  U ;
+typedef float  V ;
+# define PI_VAL M_PI_F
+# elif QUPS_PRECISION == 64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable // must enable double precision
+typedef double2 T2;
+typedef double  U ;
+typedef double  V ;
+# define PI_VAL M_PI
+# elif QUPS_PRECISION == 16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable // must enable half precision
+typedef half2 T2;
+typedef half  U ;
+typedef half  V ;
+# define PI_VAL M_PI_H
+# endif
+
+// complex multiplication
+T2 cmul(const T2 a, const T2 b){
+    return (T2)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
+}
+
+/// @brief Device function for nearest-neighbor interpolation
+T2 nearest(global const T2 * x, U tau, T2 no_v) {
+    const int ti = (int) round(tau); // round to nearest integer
+    return (0 <= ti && ti < QUPS_T) ? x[ti] : no_v;
+}
+
+T2 lerp(const T2 a, const T2 b, const U t) {
+    return a + t*(b-a);
+}
+
+/// @brief Device function for linear interpolation
+T2 linear(global const T2 * x, U tau, T2 no_v) {
+    U tif; // integer part
+
+    // fractional and integer part
+    const U tf = modf(tau, &tif);
+    const int ti = (int) tif;
+
+    // if in bounds, linearly interpolate by the fraction tf between time indices ti and ti+1
+    return (0 <= ti && ti + 1 < QUPS_T) ? lerp(x[ti], x[ti + 1], tf) : no_v;
+}
+
+/// @brief Device function for cubic Hermite interpolation
+T2 cubic(global const T2 * x, U tau, T2 no_v) {
+    U tf;
+    const U u = modf(tau, &tf); // u is the fractional part, tf the integer part
+    const int ti = (int) tf;
+
+    if (!(0 <= (ti - 1) && (ti + 2) < QUPS_T))
+        return no_v;
+
+    T2 s0 = x[ti - 1];
+    T2 s1 = x[ti + 0];
+    T2 s2 = x[ti + 1];
+    T2 s3 = x[ti + 2];
+
+    // Cubic Hermite interpolation (Catmull-Rom), in Horner form
+    // (the compiler can contract these into fused multiply-adds)
+    U a0 = 0 + u * (-1 + u * (+2 - 1 * u));
+    U a1 = 2 + u * (+0 + u * (-5 + 3 * u));
+    U a2 = 0 + u * (+1 + u * (+4 - 3 * u));
+    U a3 = 0 + u * (+0 + u * (-1 + 1 * u));
+    // // Cubic Hermite interpolation (naive, less precise implementation)
+    // float a0 = -1 * u * u * u + 2 * u * u - 1 * u + 0;
+    // float a1 = +3 * u * u * u - 5 * u * u + 0 * u + 2;
+    // float a2 = -3 * u * u * u + 4 * u * u + 1 * u + 0;
+    // float a3 = +1 * u * u * u - 1 * u * u + 0 * u + 0;
+    return (s0 * a0 + s1 * a1 + s2 * a2 + s3 * a3) * 0.5f;
+}
+
+/// @brief Inline helper code for Lanczos 3-lobe interpolation
+U lanczos_helper(U u, int a) {
+    return (u == 0.f ? 1.f : (2.f * sinpi(u)*sinpi(u/a) / (PI_VAL*PI_VAL * u*u)));
+}
+
+/// @brief Device function for Lanczos 3-lobe interpolation
+T2 lanczos3(global const T2 * x, U tau, T2 no_v) {
+    const int a = 2; // a=2 for 3-lobe Lanczos resampling
+    U tf;
+    const U u = modf(tau, &tf); // u is the fractional part, tf the integer part
+    const int ti = (int) tf;
+
+    if (!(0 <= (ti - 1) && (ti + 2) < QUPS_T))
+        return no_v;
+
+    T2 s0 = x[ti - 1];
+    T2 s1 = x[ti + 0];
+    T2 s2 = x[ti + 1];
+    T2 s3 = x[ti + 2];
+    U a0 = lanczos_helper(u + 1, a);
+    U a1 = lanczos_helper(u + 0, a);
+    U a2 = lanczos_helper(u - 1, a);
+    U a3 = lanczos_helper(u - 2, a);
+    return s0 * a0 + s1 * a1 + s2 * a2 + s3 * a3;
+}
+
+T2 sample(global const T2 * x, U tau, int flag, const T2 no_v){
+    // sample according to the flag
+    if (flag == 0)
+        return nearest (x, tau, no_v);
+    else if (flag == 1)
+        return linear  (x, tau, no_v);
+    else if (flag == 2)
+        return cubic   (x, tau, no_v);
+    else if (flag == 3)
+        return lanczos3(x, tau, no_v);
+    else if (flag == 4) // flag 4 also maps to linear interpolation
+        return linear  (x, tau, no_v);
+    else
+        return no_v;
+}
+
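The Horner-form Catmull-Rom weights in `cubic` above are algebraically equal to the commented-out naive polynomials (the active forms here are written to match them). A small host-side consistency check in plain C, under that assumption:

// compare Horner-form and naive Catmull-Rom weights over the unit interval
#include <assert.h>
#include <math.h>

int main(void) {
    for (int k = 0; k <= 8; ++k) {
        const double u = k / 8.0;
        const double h[4] = { // Horner form
            0 + u * (-1 + u * (+2 - 1 * u)),
            2 + u * (+0 + u * (-5 + 3 * u)),
            0 + u * (+1 + u * (+4 - 3 * u)),
            0 + u * (+0 + u * (-1 + 1 * u)),
        };
        const double p[4] = { // naive polynomial form
            -1*u*u*u + 2*u*u - 1*u + 0,
            +3*u*u*u - 5*u*u + 0*u + 2,
            -3*u*u*u + 4*u*u + 1*u + 0,
            +1*u*u*u - 1*u*u + 0*u + 0,
        };
        for (int w = 0; w < 4; ++w)
            assert(fabs(h[w] - p[w]) < 1e-12);
        // the weights sum to 2 before the final 0.5 scaling
        assert(fabs(h[0] + h[1] + h[2] + h[3] - 2.0) < 1e-12);
    }
    return 0;
}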
diff --git a/src/wbilerp.cl b/src/wbilerp.cl
new file mode 100644
index 00000000..d36c95ca
--- /dev/null
+++ b/src/wbilerp.cl
@@ -0,0 +1,105 @@
+// linter defaults (if not specified at compile time)
+# ifndef QUPS_PRECISION
+# define QUPS_PRECISION 32
+# endif
+
+# if QUPS_PRECISION == 32
+typedef float  U;
+typedef float2 U2;
+# elif QUPS_PRECISION == 64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable // must enable double precision
+typedef double  U;
+typedef double2 U2;
+# elif QUPS_PRECISION == 16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable // must enable half precision
+typedef half  U;
+typedef half2 U2;
+# endif
+
+// convert integer to U type
+inline U2 i2U(const int2 a){return (U2)(a.x, a.y);}
+
+kernel void wbilerp(
+    global int2 * ixy, global U * cxy, global const U2 * pall,
+    global const U * xg, global const U * yg,
+    const ulong X, const ulong Y, const ulong I,
+    global const U2 * pml, global const U2 * pmu
+    ){
+
+    const ulong i = get_global_id(0);
+    const int N = X + Y + 1; // number of line segments (pall holds N+1 points per batch)
+    int ix = 0; // grid x position (iterator)
+    int iy = 0; // grid y position (iterator)
+
+    if(i < I){
+        // index for this batch of pall
+        const int ip = i * (N + 1); // pall is length N+1
+
+        // compute all integrals one segment at a time
+        for(int n = 0; n < N; ++n){
+            // get both points (x/y by pair)
+            const U2 p1 = pall[ip + n    ];
+            const U2 p2 = pall[ip + n + 1];
+
+            // length of the line segment
+            // TODO: switch to hypot for numerical accuracy
+            const U l = length(p2 - p1); // hypot(p2(1) - p1(1), p2(2) - p1(2));
+
+            // whether both points are within the support of the grid / line
+            const bool val =
+                   pml[i].x <= p1.x & p1.x <= pmu[i].x
+                 & pml[i].y <= p1.y & p1.y <= pmu[i].y
+                 & pml[i].x <= p2.x & p2.x <= pmu[i].x
+                 & pml[i].y <= p2.y & p2.y <= pmu[i].y;
+
+            // skip the integral if out of the grid / line support, if the segment length is zero, or if co-located in x
+            if ((!val) || (l == (U)0) || (p2.x == p1.x)) continue;
+
+            // if any point is not in x/y-bounds, move to the next x/y-grid interval
+            while ((p1.x >= xg[ix+1]) && (p2.x >= xg[ix+1]) && (ix+2 < X)) ++ix;
+            while ((p1.y >= yg[iy+1]) && (p2.y >= yg[iy+1]) && (iy+2 < Y)) ++iy;
+
+            // get the line segment region's size
+            const U2 dp = (U2)(xg[ix+1] - xg[ix], yg[iy+1] - yg[iy]);
+
+            // get the indices for each of the four grid points affected by this line segment
+            # pragma unroll
+            for(int iyy = 0; iyy < 2; ++iyy){
+                # pragma unroll
+                for(int ixx = 0; ixx < 2; ++ixx){
+                    // ixloc = [ix, ix+1, ix,   ix+1]; // x index
+                    // iyloc = [iy, iy,   iy+1, iy+1]; // y index
+                    const int ixloc = ix + ixx; // x index
+                    const int iyloc = iy + iyy; // y index
+
+                    // grid points
+                    // q = [xg(ixloc)'; yg(iyloc)'];
+                    const U2 q = (U2)(xg[ixloc], yg[iyloc]);
+
+                    ////// inline the integral //////
+                    // s_d == -1 if in the positive quadrant of dimension d
+                    const int2 s = (int2)(
+                        ( (p1.x <= q.x) && (p2.x <= q.x) ) ? 1 : -1,
+                        ( (p1.y <= q.y) && (p2.y <= q.y) ) ? 1 : -1
+                    );
+
+                    // coefficients for the integral
+                    // TODO: use copysign instead of multiplying by +/- 1
+                    // TODO: write out the equations to avoid an accidental complex overload
+                    const U2 ONE = (U2)(1.f, 1.f);
+                    const U2 a = ONE + i2U(s) * (p1 - q ) / dp; // (non-complex) vector division
+                    const U2 b =       i2U(s) * (p1 - p2) / dp; // (non-complex) vector division
+
+                    // evaluation of the integral: subject to numerical precision issues
+                    const U c = a.x*a.y - (a.x*b.y + a.y*b.x) / (U)2 + (b.x*b.y) / (U)3;
+
+                    // apply the line integral formula w.r.t. the quadrant
+                    // TODO: redo the write indexing to vectorize writes
+                    ixy[i*4*N + 4*n + 2*iyy + ixx] = (int2)(ixloc, iyloc); // output indices
+                    cxy[i*4*N + 4*n + 2*iyy + ixx] = l * c; // segment length weighted by the bilinear integral
+                }
+            }
+        }
+    }
+}
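The closed-form coefficient `c` above appears to evaluate the segment integral of the bilinear weight, i.e. the integral over t in [0,1] of (a.x - b.x*t)*(a.y - b.y*t), whose exact value is a.x*a.y - (a.x*b.y + a.y*b.x)/2 + b.x*b.y/3. A host-side sketch comparing the closed form against numerical quadrature, with arbitrary made-up coefficients:

// check the closed-form segment integral against the midpoint rule
#include <stdio.h>

int main(void) {
    const double ax = 0.75, ay = 0.40, bx = 0.30, by = -0.20; // arbitrary
    const double closed = ax*ay - (ax*by + ay*bx)/2.0 + (bx*by)/3.0;
    double num = 0.0;
    const int K = 1000000;
    for (int k = 0; k < K; ++k) {
        const double t = (k + 0.5) / K; // midpoint of sub-interval k
        num += (ax - bx*t) * (ay - by*t) / K;
    }
    printf("closed = %.9f, numeric = %.9f\n", closed, num);
    return 0;
}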