diff --git a/README.md b/README.md
index 1cfdc46a..4d22e512 100644
--- a/README.md
+++ b/README.md
@@ -11,10 +11,10 @@ QUPS (pronounced "CUPS") is an abstract, lightweight, readable tool for prototyp
 - Arbitrary pixel locations and beamforming apodization
 - Performant:
-    - Hardware acceleartion via GPU or multi-threaded CPU
+    - Hardware acceleration via CUDA, OpenCL, and MATLAB parallel processing
     - Memory efficient data types
     - Beamform a 1024 x 1024 image for 256 x 256 transmits/receives in < 2 seconds (RTX 3070)
-    - Batch simulations locally via [`parcluster`](https://www.mathworks.com/help/parallel-computing/parcluster.html) and scale to a cluster via the [parallel server](https://www.mathworks.com/help/matlab-parallel-server/) toolbox
+    - Batch simulations locally via [`parcluster`](https://www.mathworks.com/help/parallel-computing/parcluster.html) or scale to a cluster via the [parallel server](https://www.mathworks.com/help/matlab-parallel-server/) toolbox
 - Modular:
     - Transducer, pulse sequence, pulse waveform, scan region etc. each defined separately
@@ -99,7 +99,8 @@ All extensions to QUPS are optional, but must be installed separately from their
 | [FieldII](https://www.field-ii.dk/) | point scatterer simulator | `addpath path/to/fieldII`|
 | k-Wave([base](http://www.k-wave.org/index.php), [extension](http://www.k-wave.org/forum/topic/alpha-version-of-kwavearray-off-grid-sources)) | distributed medium simulator | `addpath path/to/kWave, addpath path/to/kWaveArray` |
 | [MUST](https://www.biomecardio.com/MUST/documentation.html) | point scatterer simulator | `addpath path/to/MUST` (see issues[#2](https://github.com/thorstone25/qups/issues/2))|
-| CUDA([Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html),[Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)) | hardware acceleration| see [CUDA Extension](####CUDA-Extension) |
+| CUDA([Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html),[Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)) | hardware acceleration | see [CUDA Extension](####CUDA-Extension) |
+| [Matlab-OpenCL](https://github.com/thorstone25/Matlab-OpenCL) | hardware acceleration | (see [README](https://github.com/thorstone25/Matlab-OpenCL/blob/main/README.md))|

### CUDA Extension
diff --git a/kern/beamform.m b/kern/beamform.m
index 1a45f98d..09998c54 100644
--- a/kern/beamform.m
+++ b/kern/beamform.m
@@ -102,11 +102,6 @@
 else
     idataType = 'double'; % default
 end
-if any(cellfun(@(c) isType(c, 'single'), {Pi, Pr, Pv, Nv}))
-    posType = 'single';
-else
-    posType = 'double';
-end
 if gpuDeviceCount && any(cellfun(@(v)isa(v, 'gpuArray'), {Pi, Pr, Pv, Nv, x}))
     device = -1; % GPU
 else
@@ -127,9 +122,6 @@
         DV = true;
     case 'focused-waves'
         DV = false;
-    case 'position-precision'
-        n = n + 1;
-        posType = varargin{n};
     case 'input-precision'
         n = n + 1;
         idataType = varargin{n};
@@ -180,14 +172,26 @@
 % TODO: support doing this with non-scalar t0 definition
 x = permute(x, [1:3,(max(3,ndims(x))+[1,2]),4:ndims(x)]); % (T x N x M x 1 x 1 x F x ...)
 fsz = size(x, 6:max(6,ndims(x))); % frame size: starts at dim 6
-
-if device && logical(exist('bf.ptx', 'file')) % PTX track must be available
+
+% get devices: must have package, device, and kernel
+gdev = (exist('gpuDeviceCount','file') && gpuDeviceCount() && device <= gpuDeviceCount() && logical(exist('bf.ptx', 'file'))); ...
% PTX track available +odev = (exist('oclDeviceCount','file') && oclDeviceCount() && device <= oclDeviceCount() && logical(exist('bf.cl' , 'file'))); ... OpenCL kernel available +if odev % check for data type support + d = oclDevice(); + odev = ~isempty(d) ... % must have a device selected + && (((idataType == "double") && d.SupportsDouble) ... double support + || (idataType == "single") ... single always supported) + || ((idataType == "halfT" ) && d.SupportsHalf )); % half support +end + +% dispatch +if device && (gdev || odev) % warn if non-linear interp was requested switch interp_type - case "nearest",flagnum = 0; - case "linear", flagnum = 1; - case "cubic", flagnum = 2; + case "nearest", flagnum = 0; + case "linear", flagnum = 1; + case "cubic", flagnum = 2; case "lanczos3",flagnum = 3; otherwise error('QUPS:beamform:UnrecognizedInput', "Unrecognized interpolation of type " ... @@ -204,16 +208,13 @@ src_folder = fullfile(fileparts(mfilename('fullpath')), '..', 'src'); switch idataType - case 'halfT', postfix = 'h'; + case 'halfT', postfix = 'h'; case 'single', postfix = 'f'; case 'double', postfix = ''; end - % currently selected gpu device handle - g = gpuDevice(); - % reselect gpu device and copy over inputs if necessary - if device > 0 && g.Index ~= device + if device > 0 && gdev && getfield(gpuDevice(), 'Index') ~= device inps = {Pi, Pr, Pv, Nv, x, t0, fs, cinv}; oclass = cellfun(@class, inps, per_cell{:}); tmp = cellfun(@gather, inps, per_cell{:}); @@ -221,23 +222,14 @@ tmp = cellfun(@(inp, cls)cast(inp, cls), tmp, oclass, per_cell{:}); [Pi, Pr, Pv, Nv, x, t0, fs, cinv] = deal(tmp{:}); end - - % If bf.ptx isn't there, ask the user to generate it - if ~exist('bf.ptx', 'file') - error('The source code is not compiled for MATLAB on this system. Try using UltrasoundSystem.recompileCUDA() to compile it.') - end - - k = parallel.gpu.CUDAKernel(... - 'bf.ptx',... - fullfile(src_folder, 'bf.cu'),... - ['DAS', postfix]); % use the same kernel, just modify the flag here. - + % send constant data to GPU - typefun = str2func(idataType); % function to cast types - ptypefun = @(x) gpuArray(typefun(x)); + ptypefun = str2func(idataType); % function to cast types + if gdev, ptypefun = @(x) gpuArray(ptypefun(x)); end dtypefun = @(x) complex(ptypefun(x)); if idataType == "halfT" - ptypefun = @(x) gpuArray(single(x)); + if gdev, ptypefun = @(x) gpuArray(single(x)); end + if odev, ptypefun = @(x) single(x) ; end dtypefun = @(x) complex(halfT(x)); end [x, apod] = dealfun(dtypefun, x, apod); @@ -269,36 +261,77 @@ % get output data type switch fun case {'DAS', 'SYN', 'BF', 'MUL'} - obuftypefun = dtypefun; + obufproto = x; case {'delays'} - obuftypefun = @(x)real(dtypefun(x)); + obufproto = real(x([])); end - - % kernel sizes - nThreads = k.MaxThreadsPerBlock; % threads per block - nBlocks = min([... - g.MaxThreadBlockSize(1), ... max cause device reqs - ceil(I / nThreads)... max cause number of pixels - ceil(g.AvailableMemory / (2^8) / (prod(osize{:}) * nThreads)),... 
max cause GPU memory reqs (empirical)
-        ]); % blocks per frame
-
-    % constant arg type casting
-    tmp = cellfun(@uint64, {T,M,N,I,Isz}, per_cell{:});
-    [T, M, N, I, Isz] = deal(tmp{:});
-    % set constant args
-    k.setConstantMemory('QUPS_I', I); % gauranteed
-    try k.setConstantMemory('QUPS_T', T); end % if not const compiled with ChannelData
-    try k.setConstantMemory('QUPS_M', M, 'QUPS_N', N, 'QUPS_VS', VS, 'QUPS_DV', DV, 'QUPS_I1', Isz(1), 'QUPS_I2', Isz(2), 'QUPS_I3', Isz(3)); end % if not const compiled
+    if gdev
+        % reference selected device
+        g = gpuDevice();
+
+        % load kernel
+        k = parallel.gpu.CUDAKernel(...
+            'bf.ptx',...
+            fullfile(src_folder, 'bf.cu'),...
+            ['DAS', postfix]); % use the same kernel, just modify the flag here.
+
+        % constant arg type casting
+        tmp = cellfun(@uint64, {T,M,N,I,Isz}, per_cell{:});
+        [T, M, N, I, Isz] = deal(tmp{:});
+
+        % set constant args
+        k.setConstantMemory('QUPS_I', I); % guaranteed
+        try k.setConstantMemory('QUPS_T', T); end % if not const compiled with ChannelData
+        try k.setConstantMemory('QUPS_M', M, 'QUPS_N', N, 'QUPS_VS', VS, 'QUPS_DV', DV, 'QUPS_I1', Isz(1), 'QUPS_I2', Isz(2), 'QUPS_I3', Isz(3)); end % if not const compiled
+
+        % kernel sizes
+        k.ThreadBlockSize(1) = k.MaxThreadsPerBlock; % threads per block
+        k.GridSize(1) = min([...
+            g.MaxThreadBlockSize(1), ... max cause device reqs
+            ceil(I / k.ThreadBlockSize(1))... max cause number of pixels
+            ceil(g.AvailableMemory / (2^8) / (prod(osize{:}) * k.ThreadBlockSize(1))),... max cause GPU memory reqs (empirical)
+            ]); % blocks per frame
+
+    elseif odev
+
+        % load kernel reference
+        k = oclKernel(which('bf.cl'), 'DAS');
+
+        % select device - no risk of resetting anything
+        if device > 0, oclDevice(device); end
+        k.Device = oclDevice();
+
+        % set precision
+        switch idataType
+            case "single", prc = 32;
+            case "double", prc = 64;
+            otherwise, error("Not implemented :(");
+        end
+        k.defineTypes(repmat(idataType,[1,3]), ["V","T","U"]); % time / data / position
+
+        % set constant args
+        k.macros = [k.macros, ("QUPS_" + ["I","T","M","N","VS","DV","I1","I2","I3"] + "=" + [I,T,M,N,VS,DV,Isz])];
+        k.macros = [k.macros, ("QUPS_" + ["F","S"] + "=" + [1,M])]; % unused ...
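+        % (e.g. for a 512 x 512 image with 128 x 128 tx/rx, the two lines
+        % above expand to entries like "QUPS_I=262144" and "QUPS_M=128";
+        % values here are illustrative only)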
+ k.macros = [k.macros, ("QUPS_BF_" + ["FLAG"] + "=" + [flagnum])]; + k.macros(end+1) = "QUPS_PRECISION="+prc; + k.opts = ["-cl-mad-enable"];%, "-cl-fp32-correctly-rounded-divide-sqrt", "-cl-opt-disable"]; + + % compile kernel + k.build(); - % set kernel size - k.ThreadBlockSize = nThreads; - k.GridSize = nBlocks; + % set kernel size + k.ThreadBlockSize(1) = min(I, k.MaxThreadsPerBlock); + k.GridSize(1) = ceil(I / k.ThreadBlockSize(1)); + + % expand inputs to 4D - in OpenCL, all 3D vectors interpreted are 4D underlying + [Pi(4,:), Pr(4,:), Pv(4,:), Nv(4,:)] = deal(0); + end % allocate output data buffer osize = cat(2, {I}, osize); osize = cellfun(@uint64, osize, per_cell{:}); - yg = repmat(obuftypefun(zeros(1)), [osize{:}]); + yg = zeros([osize{:}], 'like', obufproto); % for half types, alias the weights/data, recast the positions as % single @@ -307,7 +340,7 @@ [yg, apod, x] = dealfun(@(x)getfield(alias(x),'val'), yg, apod, x); end - % combine timing infor with the transmit positions + % combine timing info with the transmit positions Pv(4,:) = t0; % partition data per frame @@ -316,6 +349,7 @@ % for each data frame, run the kernel switch fun case {'DAS','SYN','BF','MUL'} + if ~isreal(obufproto), yg = complex(yg); end % force complex if x is % I [x N [x M]] x 1 x 1 x {F x ...} for f = F:-1:1 % beamform each data frame y{f} = k.feval(yg, Pi, Pr, Pv, Nv, apod, cinv, [astride, cstride], x(:,:,:,f), flagnum, [fs, fmod]); diff --git a/kern/convd.m b/kern/convd.m index b6023079..146b37bf 100644 --- a/kern/convd.m +++ b/kern/convd.m @@ -10,10 +10,13 @@ % C = CONVD(A, B, dim, shape) selects the shape of the output. Must be one % of {'full'*|'same'|'valid'}. The default is 'full'. % -% C = CONVD(..., 'gpu', false) selects whether to use a gpu. If true, -% a ptx-file will be used if compiled. If false or if no ptx-file is -% available, native MATLAB code is used. The default is true if x or y is a -% gpuArray. +% C = CONVD(..., 'gpu', true) selects whether to use a gpu. A ptx-file will +% be used if compiled. The default is true if x or y is a gpuArray. +% +% C = CONVD(..., 'ocl', true) selects whether to use an OpenCL device if +% OpenCL support is available. If the currently selected device does not +% support the precision of the data, this argument is ignored. The default +% is true if OpenCL support is available via Matlab-OpenCL. % % C = CONVD(..., 'parenv', clu) or C = CONVD(..., 'parenv', pool) or % performs the convolution in parallel on the parcluster clu or the parpool @@ -24,35 +27,32 @@ % [C, lags] = CONVD(...) returns the lags of y in the same dimension as the % computation. % -% % Example: -% % Compute and plot the cross-correlation of two 16-sample -% % exponential sequences +% Example: +% % Create two 16 x 4 exponential sequences +% n = (1:16)-1; +% m = (1:4)'-1; +% xa = single(0.84).^(n+m); +% xb = single(0.92).^(n+m); % -% N = 16; -% M = 4; -% n = (0:N-1); -% m = (0:M-1)'; -% a = 0.84; -% b = 0.92; -% xa = a.^(n+m); -% xb = b.^(n+m); +% % Compute iteratively % z0 = 0*xa; % initialize -% for i = 1:M +% for i = 1:4 % z0(i,:) = conv(xa(i,:), xb(i,:), 'same'); % end -% z0 -% z = convd(xa,xb,2,'same') -% isequal(z, z0) +% +% % Compute with convd +% z = convd(xa,xb,2,'same'); +% isalmostn(z, z0) % % See also CONV CONV2 CONVN -% TODO: update this for tall variables? 
arguments
-    x {mustBeNumeric}
-    y {mustBeNumeric}
+    x {mustBeFloat}
+    y {mustBeFloat}
     dim (1,1) {mustBePositive, mustBeInteger} = findSingletonDim(x, y)
     shape (1,1) string {mustBeMember(shape, ["full", "same", "valid"])} = 'full'
     kwargs.gpu (1,1) logical = isa(x, 'gpuArray') || isa(y, 'gpuArray')
+    kwargs.ocl (1,1) logical = exist('oclDevice', 'file') && ~isempty(oclDevice())
     kwargs.parenv {mustBeScalarOrEmpty, mustBeA(kwargs.parenv, ["parallel.Cluster", "parallel.Pool", "double"])} = gcp('nocreate') % parallel environment
     kwargs.lowmem (1,1) logical = 16 * max(numel(x), numel(y)) > 16 * 2^30; % default true if complex double storage of either argument exceeds 16GB
 end
@@ -87,11 +87,11 @@
 % get the proper lags for this computation
 switch shape
     case 'full'
-        lags = colon(-(N - 1), M - 1).';
+        lags = colon(-(N - 1), M - 1);
     case 'same'
-        lags = colon(0, M - 1).' - floor((N-1)/2);
+        lags = colon(0, M - 1) - floor((N-1)/2);
     case 'valid'
-        lags = colon(0, M - N).';
+        lags = colon(0, M - N);
 end

 % get kernel sizing info
@@ -105,37 +105,20 @@
 [xstr, ystr, zstr] = deal(C);
 sizes = [xstr, M, ystr, N, zstr, L, C];

-% whether/how to operate on GPU
-if kwargs.gpu && exist('convd.ptx', 'file') && exist('convd.cu', 'file')
-    % get the kernel suffix
-    suffix = '';
-    if complex_type, suffix = [suffix 'c']; end
+% whether/how to operate with CUDA/OpenCL
+use_gdev = kwargs.gpu && exist('convd.ptx', 'file') && exist('convd.cu', 'file');
+use_odev = kwargs.ocl && exist('oclDevice', 'file') && ~isempty(oclDevice()) && exist('convd.cl', 'file');
+if use_odev % validate the device supports the type
+    dev = oclDevice(); % get device
     switch dtype
-        case 'single', suffix = [suffix 'f'];
-        case 'halfT', suffix = [suffix 'h'];
-    end
-
-    % specify the kernel
-    kern = parallel.gpu.CUDAKernel( ...
-        'convd.ptx', 'convd.cu', ['conv' suffix] ...
-        );
-
-    % setup the execution size
-    if C == 1
-        blk = [1 min(L, kern.MaxThreadsPerBlock) 1];
-        grd = ceil([1 L S] ./ blk);
-    else
-        blk = [min(C, kern.MaxThreadsPerBlock), 1, 1];
-        grd = ceil([C L S] ./ blk);
+        case 'double', use_odev = dev.SupportsDouble;
+        case 'halfT' , use_odev = dev.SupportsHalf;
     end
-    kern.ThreadBlockSize = blk;
-    kern.GridSize = grd;
-
-    % define the constant data parameters: care must be taken to match the
-    % datatype here: for NVIDIA gpus, size_t <-> uint64
-    kern.setConstantMemory('L0', int32(l0));
-
-    % if complex, ensure both all arguments complex
+end
+
+% dispatch based on device (native vs. kernel)
+if use_gdev || use_odev
+    % if complex, ensure all arguments are complex
     if complex_type && isreal(x), x = complex(x); end
     if complex_type && isreal(y), y = complex(y); end
@@ -154,8 +137,53 @@
         z = x; x = sub(z, 1:M, dim); % (shared-copy ?)
     end
+
+    % get the kernel
+    if use_gdev
+        % get the kernel suffix
+        suffix = '';
+        if complex_type, suffix = [suffix 'c']; end
+        switch dtype
+            case 'single', suffix = [suffix 'f'];
+            case 'halfT', suffix = [suffix 'h'];
+        end
+
+        % specify the kernel
+        kern = parallel.gpu.CUDAKernel( ...
+            'convd.ptx', 'convd.cu', ['conv' suffix] ...
+ ); + + % define the constant data parameters: care must be taken to match the + % datatype here: for NVIDIA gpus, size_t <-> uint64 + kern.setConstantMemory('L0', int32(l0)); + + else + % reference kernel + kern = oclKernel('convd.cl'); + + % define typing + switch dtype + case 'single', prc = 32; typ = 'single'; + case 'double', prc = 64; typ = 'double'; + case 'halfT' , prc = 16; typ = 'half' ; + end + kern.defineTypes({typ}); % {T} + kern.macros = [kern.macros, "QUPS_PRECISION="+prc]; + if complex_type, kern.macros = [kern.macros, "QUPS_COMPLEX"]; end + + % define the constant data parameters + kern.macros = [kern.macros, "QUPS_L0="+int32(l0)]; + end + + % setup the execution size + if C == 1, blk = [1 min(L, kern.MaxThreadsPerBlock) 1]; + else, blk = [min(C, kern.MaxThreadsPerBlock), 1, 1]; + end + kern.ThreadBlockSize = blk; + kern.GridSize = ceil([C L S] ./ blk); + % run the kernel - z = kern.feval(x,y,z,sizes); + z = kern.feval(x,y,z,sizes); else % vectorized MATLAB on CPU - perform convolution manually for vectors @@ -201,13 +229,12 @@ % return to original sizing sz(dim) = L; z = reshape(z, sz); - end % cast to output type / dimensions - try to use implicit in-place assignment % z = cast(z, 'like', To); C = z; % shared-copy -lags = shiftdim(lags, 1-dim); % put lags in same dimensions as operation +lags = swapdim(lags, 2, dim); % put lags in same dimensions as operation function d = findSingletonDim(A, B) dA = find(size(A) ~= 1,1,'first'); diff --git a/kern/interpd.m b/kern/interpd.m index b5eee836..99b15c2f 100644 --- a/kern/interpd.m +++ b/kern/interpd.m @@ -55,9 +55,9 @@ % get the data sizing T = size(x, dim); I = size(t, dim); -if isempty(mdms), N = 1; else, N = prod(size(t, mdms)); end -if isempty(rdms), M = 1; else, M = prod(size(t, rdms)); end -if isempty(rdms), F = 1; else, F = prod(size(x, rdms)); end +N = prod(esize(t, mdms)); +M = prod(esize(t, rdms)); +F = prod(esize(x, rdms)); % move the input data to the proper dimensions for the GPU kernel if ~isequal(ord, 1:length(ord)) % avoid data copy if already ordered @@ -68,29 +68,34 @@ % function to determine type isftype = @(x,T) strcmp(class(x), T) || any(arrayfun(@(c)isa(x,c),["tall", "gpuArray"])) && strcmp(classUnderlying(x), T); +if exist('oclDeviceCount','file') && oclDeviceCount(), ocl_dev = oclDevice(); else, ocl_dev = []; end % get oclDevice if supported + +use_gdev = exist('interpd.ptx', 'file') ... + && ( isa(x, 'gpuArray') || isa(t, 'gpuArray') || isa(x, 'halfT') && x.gtype) ... + && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])); ... + +use_odev = exist('interpd.cl', 'file') ... + && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) ... + && ~isempty(ocl_dev) && ... + ( (isftype(x,'double') && ocl_dev.SupportsDouble) ... + || (isftype(x,'single') ) ... always supported + || (isftype(x,'halfT' ) && ocl_dev.SupportsHalf )); + % use ptx on gpu if available or use native MATLAB -if exist('interpd.ptx', 'file') ... - && ( isa(x, 'gpuArray') || isa(t, 'gpuArray') ) ... 
- && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) +if use_gdev || use_odev % determine the data type if isftype(x, 'double') - suffix = "" ; [x,t] = dealfun(@double, x, t); + suffix = "" ; prc = 64; [x,t] = dealfun(@double, x, t); elseif isftype(x, 'single') - suffix = "f"; [x,t] = dealfun(@single, x, t); - elseif isftype(x, 'halfT' ) - suffix = "h"; [x,t] = dealfun(@(x)gpuArray(halfT(x)), x, t); % custom type + suffix = "f"; prc = 32; [x,t] = dealfun(@single, x, t); + elseif isftype(x, 'halfT' ) + suffix = "h"; prc = 16; [x,t] = dealfun(@(x)gpuArray(halfT(x)), x, t); % custom type else + suffix = "f"; prc = 32; warning("Datatype " + class(x) + " not recognized as a GPU compatible type."); - suffix = "f" ; end - % grab the kernel reference - k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'interpd' + suffix); - k.setConstantMemory('QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_N', uint64(N), 'QUPS_M', uint64(M), 'QUPS_F', uint64(F)); - k.ThreadBlockSize = min(k.MaxThreadsPerBlock,M*I); % I and M are indexed together - k.GridSize = [ceil(I*M ./ k.ThreadBlockSize(1)), N, 1]; - % translate the interp flag switch interp case "nearest", flagnum = 0; @@ -100,6 +105,9 @@ otherwise, error('Interp option not recognized: ' + string(interp)); end + % cache data prototype + x0 = zeros(0, 'like', x); + % condition inputs/outputs osz = [I, max(size(t,2:maxdims), size(x,2:maxdims))]; x = complex(x); % enforce complex type @@ -108,15 +116,43 @@ y = complex(gpuArray(halfT(repelem(extrapval,osz)))); [y_,x_,t_] = dealfun(@(x)x.val, y,x,t); otherwise % others - y = repmat(cast(extrapval, 'like', x), osz); + y = complex(repmat(cast(extrapval, 'like', x), osz)); [y_,x_,t_] = deal(y,x,t); end - + + if use_gdev + % grab the kernel reference + k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'interpd' + suffix); + + % set constants + k.setConstantMemory('QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_N', uint64(N), 'QUPS_M', uint64(M), 'QUPS_F', uint64(F)); + + elseif use_odev + % get the kernel reference + k = oclKernel(which('interpd.cl'), 'interpd'); + + % set the data types + switch prc, case 16, tp = 'uint16'; case 32, tp = 'single'; case 64, tp = 'double'; end + k.defineTypes({tp,tp}); % all aliases are this type + + % configure the kernel sizing and options + k.macros = "QUPS_" + ["I", "T", "S", "N", "F"] + "=" + uint64([I T M N F]); % size constants + k.macros = [k.macros, "QUPS_INTERPD_" + ["NO_V","FLAG"] ... 
+ + "=" + ["0."+suffix, flagnum]]; % input constants + k.macros(end+1) = "QUPS_PRECISION="+prc; + % k.opts = ["-cl-fp32-correctly-rounded-divide-sqrt", "-cl-mad-enable"]; + end + + % kernel sizing + k.ThreadBlockSize(1) = min(k.MaxThreadsPerBlock,M*I); % I and M are indexed together + k.GridSize = [ceil(I*M ./ k.ThreadBlockSize(1)), N, 1]; + % sample y_ = k.feval(y_, x_, t_, flagnum); % compute % restore type - switch suffix, case "h", y.val = y; otherwise y = y_; end + if isreal(x0), y_ = real(y_); end + switch suffix, case "h", y.val = y; otherwise, y = y_; end else % get new dimension mapping [~, tmp] = cellfun(@(x) ismember(x, ord), {dim, mdms, rdmst, rdmsx}, 'UniformOutput',false); @@ -154,4 +190,4 @@ y = ipermute(y, ord); end - +function sz = esize(x, sz), if ~isempty(sz), sz = size(x, sz); end diff --git a/kern/wbilerpg.m b/kern/wbilerpg.m index b795c330..ddb7e2d2 100644 --- a/kern/wbilerpg.m +++ b/kern/wbilerpg.m @@ -1,42 +1,45 @@ function [cxy, ixo, iyo] = wbilerpg(x, y, xa, ya, xb, yb) -% WBILERPG - Weighted bilinear interpolation weights (GPU-enabled) +% WBILERPG - Weighted bilinear interpolation weights (GPU/OpenCL-enabled) % -% [cxy, ixo, iyo] = WBILERPG(x, y, xa, ya, xb, yb) takes in a pair of grid -% vectors x, y and an array of pairs of endpoints (xa,ya) to (xb,yb) and +% [cxy, ixo, iyo] = WBILERPG(x, y, xa, ya, xb, yb) takes in a pair of grid +% vectors x, y and an array of pairs of endpoints (xa,ya) to (xb,yb) and % returns the interpolation weights cxy, the x-indices ixo, and the % y-indices iyo that describe the bi-linear inteprolation weights for each % segment of the line from (xa,xb) to (xb,yb) as it intersects the x-grid % and y-grid and their indices on the x-grid and y-grid. The arrays xa, ya, % xb, yb must all be the same size. -% +% % Each output is size [4 x (X + Y + 1)] x size(xa, 1:ndims(xa)) where X and % Y are the length of the x and y vectors. The indices ixo and iyo are % 1-based. If the index is invalid, ixo and iyo are 0. Multiple idenitcal % indices represent contributions from different line segments intersecting % the same grid index. These can be added afterwards to form the full % weights. +% +% Note: This function requires that either a CUDA-enabled gpu or a working +% implementation of OpenCL via Matlab-OpenCL be available. % % The following example shows how a sparse matrix can be generated by % implicitly adding overlapping pixel weights. 
% % Example:
% % Create a grid
-% [x, y] = deal(-5:5, -5:5); % 11 x 11 grid
+% [x, y ] = deal(single(-5:5), single(-5:5)); % 11 x 11 grid
% [xa, ya] = deal(-4, +1);
% [xb, yb] = deal(+3, -2);
%
% % Get the interpolation weights
% [cxy, ixo, iyo] = wbilerpg(x, y, xa, ya, xb, yb); % get the line segment weights
-%
+%
% % Create a sparse matrix, implicitly summing weights from neighboring
% % line segments
% val = (ixo ~= 0) & (iyo ~= 0); % filter out invalid indices
-% s = sparse(ixo(val), iyo(val), cxy(val), numel(x), numel(y));
-%
+% s = sparse(ixo(val), iyo(val), double(cxy(val)), numel(x), numel(y));
+%
% figure;
% pcolor(x, y, full(s)')
% title("Weights from ("+xa+","+ya+"), to ("+xb+","+yb+").");
-%
+%
% See also XIAOLINWU_K_SCALED SPARSE WBILERP

 arguments
@@ -68,16 +71,45 @@
 [cxy] = deal(zeros([4*N,Ia], 'like', xa));

 % determine the suffix based on the type
-uclass = class(xa);
+uclass = class(xa); if uclass == "gpuArray", uclass = classUnderlying(xa); end
 switch uclass
-    case "double",suffix = "";
-    case "single",suffix = "f";
-    case "halfT", suffix = "h"; cxy = alias(cxy); % ensure aliased - we can index directly
+    case "double",prc = 64; suffix = "";
+    case "single",prc = 32; suffix = "f";
+    case "halfT", prc = 16; suffix = "h"; cxy = alias(cxy); % ensure aliased - we can index directly
 end

+% parse device
+use_gdev = logical(gpuDeviceCount());
+use_odev = exist('oclDevice','file') && ~isempty(oclDevice());
+
 % specify the kernel (must be on the path)
-kern = parallel.gpu.CUDAKernel('wbilerp.ptx', 'wbilerp.cu', "wbilerp" + suffix);
+if use_gdev
+    % reference kernel
+    kern = parallel.gpu.CUDAKernel('wbilerp.ptx', 'wbilerp.cu', "wbilerp" + suffix);
+elseif use_odev
+    % reference kernel
+    kern = oclKernel('wbilerp.cl');
+
+    % specify aliased types
+    kern.defineTypes({uclass});
+    kern.macros(end+1) = "QUPS_PRECISION="+prc;
+
+    % validate precision support
+    if (uclass == "double") && (~kern.Device.SupportsDouble) ...
+    || (uclass == "halfT") && (~kern.Device.SupportsHalf)
+        error("QUPS:wbilerpg:precisionNotSupported", ...
+            "Device " + kern.Device.Index + " (" + kern.Device.Name + ") " + ...
+            "does not support " + uclass + " precision." ...
+            );
+    end
+else
+    % no kernel device found
+    error("QUPS:wbilerpg:supportedDeviceNotFound", ...
+        "wbilerpg() requires either a supported CUDA enabled GPU " + ...
+        "or a working OpenCL implementation via Matlab-OpenCL. Use wbilerp instead." ...
+        );
+end

 % proceed by which transform should be applied
 for ineg = [false, true], for isteep = [false, true], for irev = [false, true]
@@ -85,7 +117,7 @@
     % indices matching this transform
     ind = yneg == ineg & isteep == steep & irev == reverse;
     I = nnz(ind);
-    
+
     % if no data follows this transform, skip this batch
     if I == 0, continue; end
@@ -106,7 +138,7 @@
     % sort in x, then y, so that we have a non-decreasing line
     [ux, uy, vx, vy, m, mi] = dealfun(@(x) shiftdim(x(:), -2), ux, uy, vx, vy, m, mi); % move to 1 x 1 x I
-    
+
     % all points ([N+1] x 2 x I)
     pall = [...
         ux, uy; ... endpoint
@@ -132,7 +164,7 @@
     ixyo = complex(ixyo, ixyo); % alias (x,y) -> (real, imag)

     % setup the execution size
-    kern.ThreadBlockSize = min(I,kern.MaxThreadsPerBlock);
+    kern.ThreadBlockSize(1) = min(I,kern.MaxThreadsPerBlock);
     kern.GridSize = [ceil(I / kern.ThreadBlockSize(1)), 1, 1];

     % run the kernel
@@ -144,7 +176,7 @@
         ixyo, cxyo.val, pall.val, ...
         xg.val, yg.val, X, Y, I, ...
         pml.val, pmu.val ...
- ); + ); end % set correct output for x and y in original coordinates diff --git a/kern/wsinterpd.m b/kern/wsinterpd.m index 5ca99d96..4d4b1654 100644 --- a/kern/wsinterpd.m +++ b/kern/wsinterpd.m @@ -88,11 +88,23 @@ % function to determine type isftype = @(x,T) strcmp(class(x), T) || any(arrayfun(@(c)isa(x,c),["tall", "gpuArray"])) && strcmp(classUnderlying(x), T); -% use ptx on gpu if available or use native MATLAB -if exist('interpd.ptx', 'file') ... +if exist('oclDeviceCount','file') && oclDeviceCount(), ocl_dev = oclDevice(); else, ocl_dev = []; end % get oclDevice if supported + +use_gdev = exist('interpd.ptx', 'file') ... && ( isa(x, 'gpuArray') || isa(t, 'gpuArray') || isa(x, 'halfT') && x.gtype) ... && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) ... - && real(omega) == 0 + && real(omega) == 0; + +use_odev = exist('interpd.cl', 'file') ... + && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) ... + && real(omega) == 0 ... + && ~isempty(ocl_dev) && ... + ( (isftype(x,'double') && ocl_dev.SupportsDouble) ... + || (isftype(x,'single') ) ... always supported + || (isftype(x,'halfT' ) && ocl_dev.SupportsHalf )); + +% use ptx on gpu if available or use native MATLAB +if use_gdev || use_odev % get stride for weighting wstride = size(w,1:S); @@ -112,14 +124,14 @@ % determine the data type if isftype(x, 'double') - suffix = "" ; [x,t,w] = dealfun(@double, x, t, w); + suffix = "" ; prc = 64; [x,t,w] = dealfun(@double, x, t, w); elseif isftype(x, 'single') - suffix = "f"; [x,t,w] = dealfun(@single, x, t, w); - elseif isftype(x, 'halfT' ) - suffix = "h"; [x,t,w] = dealfun(@(x)gpuArray(halfT(x)), x, t, w); % custom type + suffix = "f"; prc = 32; [x,t,w] = dealfun(@single, x, t, w); + elseif isftype(x, 'halfT' ) + suffix = "h"; prc = 16; [x,t,w] = dealfun(@(x)gpuArray(halfT(x)), x, t, w); % custom type else + suffix = "f"; prc = 32; warning("Datatype " + class(x) + " not recognized as a GPU compatible type."); - suffix = "f" ; end % get the data sizing @@ -128,15 +140,6 @@ N = prod(esize(t, mdms)); F = prod(esize(x, rdmsx)); - % grab the kernel reference - k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'wsinterpd' + suffix); - k.setConstantMemory( ... - 'QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_S', uint64(S), ... - 'QUPS_N', uint64(N), 'QUPS_F', uint64(F) ... - ); - k.ThreadBlockSize = min(k.MaxThreadsPerBlock,I); % I and M are indexed together - k.GridSize = [ceil(I ./ k.ThreadBlockSize(1)), min(N, 2^15), ceil(N/2^15)]; - % translate the interp flag switch interp case "nearest", flagnum = 0; @@ -146,9 +149,9 @@ otherwise, error('Interp option not recognized: ' + string(interp)); end - % enforce complex type for gpu data - if isa(x, 'gpuArray') || isa(x,'halfT') && isa(x.val, 'gpuArray'), x = complex(x); end - if isa(w, 'gpuArray') || isa(w,'halfT') && isa(w.val, 'gpuArray'), w = complex(w); end + % enforce complex type for ocl or gpu data + if use_odev || isa(x, 'gpuArray') || (isa(x,'halfT') && isa(x.val, 'gpuArray')), x = complex(x); end + if use_odev || isa(w, 'gpuArray') || (isa(w,'halfT') && isa(w.val, 'gpuArray')), w = complex(w); end switch suffix case "h", y = complex(gpuArray(halfT(zeros(osz)))); @@ -158,7 +161,38 @@ [w_,x_,t_] = deal(w,x,t); % copy data y_ = zeros(osz, 'like', x_); % pre-allocate output end + % zeros: uint16(0) == storedInteger(half(0)), so this is okay + if use_gdev + % grab the kernel reference + k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'wsinterpd' + suffix); + k.setConstantMemory( ... 
+ 'QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_S', uint64(S), ... + 'QUPS_N', uint64(N), 'QUPS_F', uint64(F) ... + ); + cargs = {flagnum, imag(omega)}; % extra const arguments + elseif use_odev + + % get the kernel reference + k = oclKernel(which('interpd.cl'), 'wsinterpd'); + + % set the data types + switch prc, case 16, t = 'uint16'; case 32, t = 'single'; case 64, t = 'double'; end + k.defineTypes({t,t}); % all aliases are this type + + % configure the kernel sizing and options + k.macros = "QUPS_" + ["I", "T", "S", "N", "F"] + "=" + uint64([I T S N F]); % size constants + k.macros = [k.macros, "QUPS_INTERPD_" + ["NO_V","FLAG","OMEGA"] ... + + "=" + ["0."+suffix, flagnum, imag(omega)]]; % input constants + k.macros(end+1) = "QUPS_PRECISION="+prc; + % k.opts = ["-cl-fp32-correctly-rounded-divide-sqrt", "-cl-mad-enable"]; + cargs = {}; + end + + % kernel sizing + k.ThreadBlockSize(1) = min(k.MaxThreadsPerBlock,I); % local group size + k.GridSize = [ceil(I ./ k.ThreadBlockSize(1)), min(N, 2^15), ceil(N/2^15)]; + % index label flags iflags = zeros([1 maxdims], 'uint8'); iflags(mdms) = 1; @@ -168,10 +202,11 @@ strides = cat(1,wstride,ystride,tstride,xstride); % compute - y_ = k.feval(y_, w_, x_, t_, dsizes, iflags, strides, flagnum, imag(omega)); + y_ = k.feval(y_, w_, x_, t_, dsizes, iflags, strides, cargs{:}); % for halfT, store the data back in the output switch suffix, case "h", y.val = y_; otherwise, y = y_; end + else % promote half types diff --git a/kern/wsinterpd2.m b/kern/wsinterpd2.m index 1519a146..5ea4b47b 100644 --- a/kern/wsinterpd2.m +++ b/kern/wsinterpd2.m @@ -95,11 +95,23 @@ % function to determine type isftype = @(x,T) strcmp(class(x), T) || any(arrayfun(@(c)isa(x,c),["tall", "gpuArray"])) && strcmp(classUnderlying(x), T); -% use ptx on gpu if available or use native MATLAB -if exist('interpd.ptx', 'file') ... +if exist('oclDeviceCount','file') && oclDeviceCount(), ocl_dev = oclDevice(); else, ocl_dev = []; end % get oclDevice if supported + +use_gdev = exist('interpd.ptx', 'file') ... && ( isa(x, 'gpuArray') || isa(t1, 'gpuArray') || isa(t2, 'gpuArray') || isa(x, 'halfT') && x.gtype) ... && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) ... - && real(omega) == 0 + && real(omega) == 0; + +use_odev = exist('interpd.cl', 'file') ... + && (ismember(interp, ["nearest", "linear", "cubic", "lanczos3"])) ... + && real(omega) == 0 ... + && ~isempty(ocl_dev) && ... + ( (isftype(x,'double') && ocl_dev.SupportsDouble) ... + || (isftype(x,'single') ) ... 
always supported + || (isftype(x,'halfT' ) && ocl_dev.SupportsHalf )); + +% use ptx on gpu if available or use native MATLAB +if use_gdev || use_odev % get stride for weighting wstride = sz2stride(size(w,1:S)); @@ -117,14 +129,14 @@ % determine the data type if isftype(x, 'double') - suffix = "" ; [x,t1,t2,w] = dealfun(@double, x, t1, t2, w); + suffix = "" ; prc = 64; [x,t1,t2,w] = dealfun(@double, x, t1, t2, w); elseif isftype(x, 'single') - suffix = "f"; [x,t1,t2,w] = dealfun(@single, x, t1, t2, w); + suffix = "f"; prc = 32; [x,t1,t2,w] = dealfun(@single, x, t1, t2, w); elseif isftype(x, 'halfT' ) - suffix = "h"; [x,t1,t2,w] = dealfun(@(x)gpuArray(halfT(x)), x, t1, t2, w); % custom type + suffix = "h"; prc = 16; [x,t1,t2,w] = dealfun(@(x)gpuArray(halfT(x)), x, t1, t2, w); % custom type else + suffix = "f" ; prc = 32; warning("Datatype " + class(x) + " not recognized as a GPU compatible type."); - suffix = "f" ; end % get the data sizing @@ -133,15 +145,6 @@ N = prod(esize(x, mdms)); F = prod(esize(x, rdmsx)); - % grab the kernel reference - k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'wsinterpd2' + suffix); - k.setConstantMemory( ... - 'QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_S', uint64(S), ... - 'QUPS_N', uint64(N), 'QUPS_F', uint64(F) ... - ); - k.ThreadBlockSize = min(k.MaxThreadsPerBlock,I); % I and M are indexed together - k.GridSize = [ceil(I ./ k.ThreadBlockSize(1)), min(N, 2^15), ceil(N/2^15)]; - % translate the interp flag switch interp case "nearest", flagnum = 0; @@ -152,8 +155,8 @@ end % enforce complex type for gpu data - if isa(x, 'gpuArray') || isa(x, 'halfT'), x = complex(x); end - if isa(w, 'gpuArray') || isa(w, 'halfT'), w = complex(w); end + if use_odev || isa(x, 'gpuArray') || isa(x, 'halfT'), x = complex(x); end + if use_odev || isa(w, 'gpuArray') || isa(w, 'halfT'), w = complex(w); end switch suffix case "h", y = complex(gpuArray(halfT(zeros(osz)))); @@ -163,7 +166,37 @@ [w_,x_,t1_,t2_] = deal(w,x,t1,t2); % copy data y_ = zeros(osz, 'like', x_); % pre-allocate output end - % zeros: uint16(0) == storedInteger(half(0)), so this is okay + % zeros: uint16(0) == storedInteger(half(0)), so this is okay + + if use_gdev + % grab the kernel reference + k = parallel.gpu.CUDAKernel('interpd.ptx', 'interpd.cu', 'wsinterpd2' + suffix); + k.setConstantMemory( ... + 'QUPS_I', uint64(I), 'QUPS_T', uint64(T), 'QUPS_S', uint64(S), ... + 'QUPS_N', uint64(N), 'QUPS_F', uint64(F) ... + ); + cargs = {flagnum, imag(omega)}; % constant arguments + elseif use_odev + % get the kernel reference + k = oclKernel(which('interpd.cl'), 'wsinterpd2'); + + % set the data types + switch prc, case 16, t = 'uint16'; case 32, t = 'single'; case 64, t = 'double'; end + k.defineTypes({t,t}); % all aliases are this type + + % configure the kernel sizing and options + k.macros = "QUPS_" + ["I", "T", "S", "N", "F"] + "=" + uint64([I T S N F]); % size constants + k.macros = [k.macros, "QUPS_INTERPD_" + ["NO_V","FLAG","OMEGA"] ... 
+ + "=" + ["0."+suffix, flagnum, imag(omega)]]; % input constants + k.macros(end+1) = "QUPS_PRECISION="+prc; + % k.opts = ["-cl-fp32-correctly-rounded-divide-sqrt", "-cl-mad-enable"]; + cargs = {}; + end + + % kernel sizing + k.ThreadBlockSize(1) = min(k.MaxThreadsPerBlock,I); % I and M are indexed together + k.GridSize = [ceil(I ./ k.ThreadBlockSize(1)), min(N, 2^15), ceil(N/2^15)]; + % index label flags iflags = zeros([1 maxdims], 'uint8'); % increase index i iflags(mdms) = 1; % increase index n @@ -173,7 +206,7 @@ strides = cat(1,wstride,ystride,t1stride,t2stride,xstride); % compute - y_ = k.feval(y_, w_, x_, t1_, t2_, dsizes, iflags, strides, flagnum, imag(omega)); + y_ = k.feval(y_, w_, x_, t1_, t2_, dsizes, iflags, strides, cargs{:}); % for halfT, store the data back in the output switch suffix, case "h", y.val = y_; otherwise, y = y_; end diff --git a/src/UltrasoundSystem.m b/src/UltrasoundSystem.m index 795e3fd3..29626574 100644 --- a/src/UltrasoundSystem.m +++ b/src/UltrasoundSystem.m @@ -499,7 +499,7 @@ function delete(self) element_subdivisions (1,2) double {mustBeInteger, mustBePositive} = [1,1] end arguments - kwargs.device (1,1) {mustBeInteger} = -logical(gpuDeviceCount()); % gpu device + kwargs.device (1,1) {mustBeInteger} = -1 * (logical(gpuDeviceCount()) || (exist('oclDevice','file') && ~isempty(oclDevice()))) kwargs.interp (1,1) string {mustBeMember(kwargs.interp, ["linear", "nearest", "next", "previous", "spline", "pchip", "cubic", "makima", "freq", "lanczos3"])} = 'cubic' kwargs.tall (1,1) logical = false; % whether to use a tall type kwargs.bsize (1,1) {mustBeInteger, mustBePositive} = 1; % number of simulataneous scatterers @@ -545,6 +545,18 @@ function delete(self) wv.fs = kwargs.fsk; kern = wv.samples; + % choose device + use_dev = kwargs.device && ismember(kwargs.interp, ["nearest", "linear", "cubic", "lanczos3"]); + use_gdev = use_dev && exist('greens.ptx', 'file') && gpuDeviceCount(); % use the GPU kernel + use_odev = use_dev && exist('oclDeviceCount','file') && oclDeviceCount(); % use the OpenCL kernel + if use_odev + if kwargs.device > 0, oclDevice(kwargs.device); end % select device + dev = oclDevice(); % reference + use_odev = isUnderlyingType(kern, "single") ... + || (isUnderlyingType(kern, "double") && dev.SupportsDouble) ... + || (isUnderlyingType(kern, "half" ) && dev.SupportsHalf); + end + F = numel(scat); % if kwargs.verbose, hw = waitbar(0); end @@ -569,15 +581,15 @@ function delete(self) pos = scat(f).pos; % 3 x S amp = scat(f).amp; % 1 x S fso = self.fs; % output channel data sampling frequency - if kwargs.device && exist('greens.ptx', 'file') ... 
% use the GPU kernel - && (ismember(kwargs.interp, ["nearest", "linear", "cubic", "lanczos3"])) + + if use_gdev || use_odev % function to determine type isftype = @(x,T) strcmp(class(x), T) || any(arrayfun(@(c)isa(x,c),["tall", "gpuArray"])) && strcmp(classUnderlying(x), T); % determine the data type - if isftype(kern, 'double'), suffix = "" ; cfun = @double; - elseif isftype(kern, 'single'), suffix = "f"; cfun = @single; - elseif isftype(kern, 'halfT' ), suffix = "h"; cfun = @(x) alias(halfT(x)); + if isftype(kern, 'double'), typ = "double"; prc = 64; suffix = "" ; cfun = @double; + elseif isftype(kern, 'single'), typ = "single"; prc = 32; suffix = "f"; cfun = @single; + elseif isftype(kern, 'halfT' ), typ = "halfT" ; prc = 16; suffix = "h"; cfun = @(x) alias(halfT(x)); else, error("Datatype " + class(kern) + " not recognized as a GPU compatible type."); end @@ -599,7 +611,7 @@ function delete(self) [QI, QS, QT, QN, QM] = deal(scat(f).numScat, length(t), length(kern), N, M); % compute the minimum and maximum distance for each scatterer - ps = gpuArray(ps); + if use_gdev, ps = gpuArray(ps); end [rminrx, rmaxrx, rmintx, rmaxtx] = deal(+inf, -inf, +inf, -inf); for p = self.tx.positions, rminrx = min(rminrx, vecnorm(ps - p, 2, 1)); end % minimum scat time for p = self.tx.positions, rmaxrx = max(rmaxrx, vecnorm(ps - p, 2, 1)); end % maximum scat time @@ -615,13 +627,29 @@ function delete(self) sb = ([rmin; rmax] ./ c0 + t0x - t0k) * fso + [0; QT]; % grab the kernel reference - k = parallel.gpu.CUDAKernel('greens.ptx', 'greens.cu', 'greens' + suffix); - k.setConstantMemory( 'QUPS_S', uint64(QS) ); % always set S - try k.setConstantMemory('QUPS_T', uint64(QT), ... - 'QUPS_N', uint64(QN), 'QUPS_M', uint64(QM) ... - ); end % already set by const compiler - try k.setConstantMemory('QUPS_I', uint64(QI)); end % might be already set by const compiler - k.ThreadBlockSize = min([k.MaxThreadsPerBlock, 32]); % smaller is better + if use_gdev + k = parallel.gpu.CUDAKernel('greens.ptx', 'greens.cu', 'greens' + suffix); + k.setConstantMemory( 'QUPS_S', uint64(QS) ); % always set S + try k.setConstantMemory('QUPS_T', uint64(QT), ... + 'QUPS_N', uint64(QN), 'QUPS_M', uint64(QM) ... 
+ ); end % already set by const compiler + try k.setConstantMemory('QUPS_I', uint64(QI)); end % might be already set by const compiler + elseif use_odev + k = oclKernel('greens.cl'); + k.macros = [k.macros, "QUPS_"+["S","T","N","M","I"]+"="+uint64([QS,QT,QN,QM,QI])]; % set constants + k.macros(end+1) = "QUPS_PRECISION="+prc; + k.defineTypes(repmat(typ,[1,3]), ["V","T","U"]); % time / data / position + + % enforce complexity + [x, as, kn] = dealfun(@complex, x, as, kn); + + % expand positions 3D -> 4D + [ps(4,:), pn(1,4,:), pv(1,4,:)] = deal(0); + + end + + % set kernel sizing + k.ThreadBlockSize(1) = min([k.MaxThreadsPerBlock, 32]); % smaller is better k.GridSize = [ceil(QS ./ k.ThreadBlockSize(1)), N, M]; % get the computational bounds @@ -2485,7 +2513,7 @@ function delete(self) c0(:,:,:,1,1) {mustBeNumeric} = self.seq.c0 kwargs.fmod (1,1) {mustBeNumeric} = 0 kwargs.prec (1,1) string {mustBeMember(kwargs.prec, ["single", "double", "halfT"])} = "single" - kwargs.device (1,1) {mustBeInteger} = -1 * logical(gpuDeviceCount) + kwargs.device (1,1) {mustBeInteger} = -1 * (logical(gpuDeviceCount()) || (exist('oclDevice','file') && ~isempty(oclDevice()))) kwargs.apod {mustBeNumericOrLogical} = 1 kwargs.interp (1,1) string {mustBeMember(kwargs.interp, ["linear", "nearest", "next", "previous", "spline", "pchip", "cubic", "makima", "freq", "lanczos3"])} = 'cubic' kwargs.keep_tx (1,1) logical = false @@ -2530,7 +2558,7 @@ function delete(self) if ~isequal(ord, 1:3), chd = rectifyDims(chd); end % reorder if necessary % get the beamformer arguments - dat_args = {chd.data, gather(chd.t0), gather(chd.fs), c0, 'device', kwargs.device, 'position-precision', kwargs.prec}; % data args + dat_args = {chd.data, gather(chd.t0), gather(chd.fs), c0, 'device', kwargs.device, 'input-precision', kwargs.prec}; % data args if isfield(kwargs, 'interp'), interp_args = {'interp', kwargs.interp}; else, interp_args = {}; end ext_args = [interp_args, apod_args]; % extra args diff --git a/src/bf.cl b/src/bf.cl new file mode 100644 index 00000000..53be7ff0 --- /dev/null +++ b/src/bf.cl @@ -0,0 +1,138 @@ +// # include "helper_math.h" // vector math + +// # include "sizes.cu" // size defines + +# include "interpolators.cl" // samplers using constant sizing + +// # include "half2_math.h" // vector math for half types only + +/* +* Delay and sum the given data at the given pixels +* +* Given a set of pixels, (virtual or plane wave) transmitter locations, +* receiver locations, as well as a datacube equipped with a time, +* transmitter and receiver axis, perform simple delay-and-sum beamforming. +* The data is linearly interpolated at the sample time. An image is +* generated for each receiver element. Summation across the transmitters +* and receivers is implicit. +* +* All positions are in vector coordinates. +* +* If the virtual transmitter +* normal has a fourth component that is 0, this indicates that the +* transmission should be treated as a plane wave transmission instead of a +* virtual source transmission. +* +* The value of t = 0 must be the time when the peak of the wavefront +* reaches the virtual source location. 
Because this time must be the same +* for all transmits, the datacube must be stitched together in such a way +* that for all transmits, the same time axis is used +* +* Inputs: +* y: complex pixel values per channel (I) +* Pi: pixel positions (3 x I) +* Pr: receiver positions (3 x N) +* Pv: (virtual) transmitter positions (3 x M) +* Nv: (virtual) transmitter normal (3 x M) +* x: datacube of complex sample values (T x M x N) +* t0: initial time for the data +* fs: sampling frequency of the data +* cinv: inverse of the speed of sound used for beamforming +* +* I -> pixels, M -> transmitters, N -> receivers, T -> time samples +* +*/ + +# if QUPS_PRECISION == 32 +typedef float4 U4; +# elif QUPS_PRECISION == 64 +typedef double4 U4; +# elif QUPS_PRECISION == 16 +typedef float4 U4; +# endif + +kernel void DAS(volatile global T2 * y, + const global U4 * pi, const global U4 * pr, + const global U4 * pv, const global U4 * nv, + const global T2 * a, const global V * cinv, const global ulong * acstride, + const global T2 * x, const int flag, const global V t0fsfc[2]) { + + // unpack inputs + #ifndef QUPS_BF_FLAG + const int QUPS_BF_FLAG = flag; // set only if not fixed at compile time + #endif + // const V t0 = t0fsfc[0]; // start time + const V fs = t0fsfc[0]; // sampling frequency + const V fc = t0fsfc[1]; // modulation frequency + const global ulong * astride = acstride; + const global ulong * cstride = acstride + 5; + + // rename for readability + const size_t N = QUPS_N, M = QUPS_M, T = QUPS_T, I = QUPS_I; + const size_t I1 = QUPS_I1, I2 = QUPS_I2, I3 = QUPS_I3; + + // get starting index of this pixel + const size_t tid = get_global_id(0); // pixel index + const size_t kI = get_global_size(0); // number of pixels per call + + // temp vars + const T2 zero_v = (T2)(0, 0); + T2 w = (T2)(1, 0); + V dv, dr, tau; + T2 val, pix; + U4 rv = (U4)(0); + + // if valid pixel, for each tx/rx + for(size_t i = tid; i < I; i += kI){ + // get image coordinates + const size_t i1 = (i % QUPS_I1); // index in I1 + const size_t i2 = (i / QUPS_I1) % QUPS_I2 ; // index in I2 + const size_t i3 = (i / (I1 * I2) % QUPS_I3); // index in I3 + const size_t abase = i1 * astride[0] + i2 * astride[1] + i3 * astride[2]; // base index for this pixel + const size_t cbase = i1 * cstride[0] + i2 * cstride[1] + i3 * cstride[2]; // base index for this pixel + + // reset accumulator + pix = zero_v; + + for(size_t m = 0; m < M; ++m){ + for(size_t n = 0; n < N; ++n){ + // 2-way virtual path distance + // const U3 pvm = {pv[m].x,pv[m].y,pv[m].z}; // declared for MSVC (2019) + rv.xyz = pi[i].xyz - pv[m].xyz; // (virtual) transmit to pixel vector + + dv = QUPS_VS ? // tx path length + copysign(length(rv), (QUPS_DV ? 1.f : dot(rv.xyz, nv[m].xyz))) // virtual source + : dot(rv.xyz, nv[m].xyz); // plane wave + + dr = length(pi[i].xyz - pr[n].xyz); // rx path length + + // data/time index number + const V ci = cinv[cbase + n * cstride[3] + m * cstride[4]]; + tau = (ci * (dv + dr) - pv[m].w); + + // apply demodulation if non zero + if (fc) {w.x = cospi(2*fc*tau); w.y = sinpi(2*fc*tau);} + + // sample the trace + val = sample(&x[(n + m * N) * T], tau * fs, flag & 7, zero_v); // out of bounds: extrap 0 + // const int t = (int) (tau * fs); + // val = (0 <= t & t < T) ? 
x[(size_t) (t + (n + m * N) * T)] : zero_v; + + // apply apodization (requires complex multiplication) + val = cmul(val, cmul(w, a[abase + n * astride[3] + m * astride[4]])); + + // choose the accumulation + const int sflag = ((int)QUPS_BF_FLAG) & 24; // extract bits 5,4 + if(sflag == 8) + y[i + n*I] += val; // sum over tx, store over rx + else if (sflag == 16) + y[i + m*I] += val; // sum over rx, store over tx + else if (sflag == 24) + y[i + n*I + m*N*I] = val; // store over tx/rx + else + pix += val; // sum over all + } + } + if (!(((int)QUPS_BF_FLAG) & 24)) y[i] = pix; // output value here if accumulating over all + } +} diff --git a/src/bf.cu b/src/bf.cu index d44551f6..4eba16eb 100644 --- a/src/bf.cu +++ b/src/bf.cu @@ -47,8 +47,7 @@ void __device__ DAS_temp(U2 * __restrict__ y, const U * __restrict__ Pi, const U * __restrict__ Pr, const U * __restrict__ Pv, const U * __restrict__ Nv, const U2 * __restrict__ a, const U * __restrict__ cinv, const size_t * acstride, - const U2 * __restrict__ x, - const U t0fsfc[3], const int flag) { + const U2 * __restrict__ x, const U t0fsfc[2], const int flag) { // unpack // const U t0 = t0fsfc[0]; // start time @@ -138,7 +137,7 @@ __global__ void DAS(double2 * __restrict__ y, const double * __restrict__ Pv, const double * __restrict__ Nv, const double2 * __restrict__ a, const double * __restrict__ cinv, const size_t * acstride, const double2 * __restrict__ x, const int iflag, - const double tvars[3]) { + const double tvars[2]) { DAS_temp(y, Pi, Pr, Pv, Nv, a, cinv, acstride, x, tvars, iflag); } @@ -147,7 +146,7 @@ __global__ void DASf(float2 * __restrict__ y, const float * __restrict__ Pv, const float * __restrict__ Nv, const float2 * __restrict__ a, const float * __restrict__ cinv, const size_t * acstride, const float2 * __restrict__ x, const int iflag, - const float tvars[3]) { + const float tvars[2]) { DAS_temp(y, Pi, Pr, Pv, Nv, a, cinv, acstride, x, tvars, iflag); } @@ -158,7 +157,7 @@ __global__ void DASh(ushort2 * __restrict__ y, const float * __restrict__ Pv, const float * __restrict__ Nv, const ushort2 * __restrict__ a, const float * __restrict__ cinv, const size_t * acstride, const ushort2 * __restrict__ x, const int iflag, - const float tvars[3]) { + const float tvars[2]) { DAS_temp((half2 *)y, Pi, Pr, Pv, Nv, (const half2 *)a, cinv, acstride, (const half2 *)x, tvars, iflag); } #endif @@ -210,7 +209,7 @@ __global__ void delaysf(float * __restrict__ tau, // reinterpret inputs as vector pointers (makes loading faster and indexing easier) const float3 * pi = reinterpret_cast(Pi); // 3 x I const float3 * pr = reinterpret_cast(Pr); // 3 x N - const float3 * pv = reinterpret_cast(Pv); // 3 x M + const float4 * pv = reinterpret_cast(Pv); // 4 x M const float3 * nv = reinterpret_cast(Nv); // 3 x M // rename for readability @@ -227,7 +226,8 @@ __global__ void delaysf(float * __restrict__ tau, # pragma unroll for(size_t n = 0; n < N; ++n){ // 2-way virtual path distance - rv = pi[i] - pv[m]; // (virtual) transmit to pixel vector + const float3 pvm = {pv[m].x,pv[m].y,pv[m].z}; // declared for MSVC (2019) + rv = pi[i] - pvm; // (virtual) transmit to pixel vector dv = QUPS_VS ? // tx path length copysign(length(rv), (QUPS_DV ? 
1.f : dot(rv, nv[m]))) // virtual source @@ -255,7 +255,7 @@ __global__ void delays(double * __restrict__ tau, // reinterpret inputs as vector pointers (makes loading faster and indexing easier) const double3 * pi = reinterpret_cast(Pi); // 3 x I const double3 * pr = reinterpret_cast(Pr); // 3 x N - const double3 * pv = reinterpret_cast(Pv); // 3 x M + const double4 * pv = reinterpret_cast(Pv); // 4 x M const double3 * nv = reinterpret_cast(Nv); // 3 x M // rename for readability @@ -272,7 +272,8 @@ __global__ void delays(double * __restrict__ tau, # pragma unroll for(size_t n = 0; n < N; ++n){ // 2-way virtual path distance - rv = pi[i] - pv[m]; // (virtual) transmit to pixel vector + const double3 pvm = {pv[m].x,pv[m].y,pv[m].z}; // declared for MSVC (2019) + rv = pi[i] - pvm; // (virtual) transmit to pixel vector dv = QUPS_VS ? // tx path length copysign(length(rv), (QUPS_DV ? 1.f : dot(rv, nv[m]))) // virtual source diff --git a/src/conv_cuda.cu b/src/conv_cuda.cu deleted file mode 100644 index cbb25ec2..00000000 --- a/src/conv_cuda.cu +++ /dev/null @@ -1,152 +0,0 @@ -#if (__CUDA_ARCH__ >= 530) -#define __CUDA_NO_HALF2_OPERATORS__ // block half2 vector math operators -#include // define half/half2 types, without half2 operators -#endif - -// real/complex conjugation -inline __host__ __device__ float conj(const float a) { - return a; -} -inline __host__ __device__ double conj(const double a) { - return a; -} -#if (__CUDA_ARCH__ >= 530) -inline __host__ __device__ half conj(const half a) { - return a; -} -#endif - -inline __host__ __device__ float2 conj(const float2 a) { - return make_float2(a.x, -a.y); -} -inline __host__ __device__ double2 conj(const double2 a) { - return make_double2(a.x, -a.y); -} -#if (__CUDA_ARCH__ >= 530) -inline __host__ __device__ half2 conj(const half2 a) { - return make_half2(a.x, -a.y); -} -#endif - - -// complex multiplication -inline __host__ __device__ float2 operator*(const float2 a, const float2 b) { - return make_float2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); -} -inline __host__ __device__ double2 operator*(const double2 a, const double2 b) { - return make_double2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); -} -#if (__CUDA_ARCH__ >= 530) -inline __host__ __device__ half2 operator*(const half2 a, const half2 b) { - return make_half2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); -} -#endif - -inline __host__ __device__ float2 operator*(const float2 a, const float b){ - return make_float2(b*a.x, b*a.y); -} -inline __host__ __device__ double2 operator*(const double2 a, const double b){ - return make_double2(b*a.x, b*a.y); -} -#if (__CUDA_ARCH__ >= 530) -inline __host__ __device__ half2 operator*(const half2 a, const half b){ - return make_half2(b*a.x, b*a.y); -} -#endif - -// real/complex addition/assignment -inline __host__ __device__ void operator+=(float2 &a, const float2 b){ - a.x += b.x; - a.y += b.y; -} -inline __host__ __device__ void operator+=(double2 &a, const double2 b){ - a.x += b.x; - a.y += b.y; -} -#if (__CUDA_ARCH__ >= 530) -inline __host__ __device__ void operator+=(half2 &a, const half2 b){ - a.x += b.x; - a.y += b.y; -} -#endif - - -/* -* Compute the cross correlation of two sets of data. The data will be -* correlated in the first dimension. M >= N must be satisfied. 
-* -* Inputs: -* x: first signal (M x S) -* y: second signal (N x S) -* -* Outputs: -* z: resulting cross correlation -* -* -*/ -# ifndef M -__constant__ size_t M; -# endif -# ifndef N -__constant__ size_t N; -# endif -# ifndef L -__constant__ size_t L; // total number of lags -# endif -# ifndef L0 -__constant__ int L0; // starting lag -# endif - -// xcorr template -template -inline __device__ void conv_temp(const T* x, const T* y, T* z, T za){ - /* xcorr_temp(const T* x, const T* y, T* z, T za) - x, y: input array pointer(s) - za: 0 value for the data type - z: output array pointer - cross correlation - */ - - // get lag and stride indices - const int l = threadIdx.x + blockDim.x*blockIdx.x; // lag index - const size_t s = threadIdx.y + blockDim.y*blockIdx.y; // slice - - // if valid lag indices, multiply and accumulate in-place - if(l < L) - # pragma unroll - for(int i = 0, j = L0 - l; i < M || j < N; ++i, ++j) - if(0 <= i && i < M && 0 <= j && j < N) // signal in bounds - za += x[i+s*M] * conj(y[j+s*N]); // accum the cross product - - // output result: - if(l < L) - z[l+s*L] = za; -} - -// xcorr kernels -__global__ void convf(const float* x, const float* y, float* z){ - conv_temp(x, y, z, 0.0f); -} - -__global__ void conv(const double* x, const double* y, double* z){ - conv_temp(x, y, z, 0.0); -} -#if (__CUDA_ARCH__ >= 530) -__global__ void convh(const ushort* x, const ushort* y, ushort* z){ - conv_temp((half*)x, (half*)y, (half*)z, 0.0f); -} -#endif -__global__ void convcf(const float2* x, const float2* y, float2* z){ - conv_temp(x, y, z, make_float2(0.0f,0.0f)); -} - -__global__ void convc(const double2* x, const double2* y, double2* z){ - conv_temp(x, y, z, make_double2(0.0,0.0)); -} -#if (__CUDA_ARCH__ >= 530) -__global__ void convch(const ushort2* x, const ushort2* y, ushort2* z){ - conv_temp((half2*)x, (half2*)y, (half2*)z, make_half2(0.0f, 0.0f)); -} -#endif - - diff --git a/src/convd.cl b/src/convd.cl new file mode 100644 index 00000000..ccaacb3a --- /dev/null +++ b/src/convd.cl @@ -0,0 +1,91 @@ +// DEBUG: for linter purposes +/* +# ifndef QUPS_PRECISION +# define QUPS_PRECISION 32 +# endif + +# ifndef QUPS_COMPLEX +# define QUPS_COMPLEX +# endif +*/ + +# if QUPS_PRECISION == 32 + # ifdef QUPS_COMPLEX +typedef float2 T; + # else +typedef float T; + # endif +# elif QUPS_PRECISION == 16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable // must enable half precision + # ifdef QUPS_COMPLEX +typedef half2 T; + # else +typedef half T; + # endif +# elif QUPS_PRECISION == 64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable // must enable double precision + # ifdef QUPS_COMPLEX +typedef double2 T; + # else +typedef double T; + # endif +# endif + + + +// complex multiplication +# ifdef QUPS_COMPLEX +inline T cmul(const T a, const T b){ return (T)(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);} +# else +inline T cmul(const T a, const T b){ return a * b;} +# endif + +/* +* Compute the cross correlation of two sets of data. The data will be +* correlated in the first dimension. M >= N must be satisfied. 
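+*
+* The sizes argument mirrors the host-side vector assembled in convd.m,
+* sizes = [xstr, M, ystr, N, zstr, L, C]: a (stride, length) pair for each
+* of x, y, and z, plus the channel count C.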
+* +* Inputs: +* x: first signal (M x S) +* y: second signal (N x S) +* sizes: sizing info (undocumented) +* +* Outputs: +* z: resulting cross correlation +* +* +*/ + +kernel void conv(global const T* x, global const T* y, global T* z, const global int * sizes){ + // get stride and len + const int istr = sizes[0]; + const int ilen = sizes[1]; + const int jstr = sizes[2]; + const int jlen = sizes[3]; + const int lstr = sizes[4]; + const int llen = sizes[5]; + const int clen = sizes[6]; + + const int I = ilen * clen; + const int J = jlen * clen; + const int L = llen * clen; + + const int c = get_global_id(0); // column output index + const int l = get_global_id(1); // strided output index + const int s = get_global_id(2); // batch index + + // accumulator + T za = (T) 0.f; + + // if valid lag indices, multiply and accumulate in-place + // TODO: apply work-groups and striding for faster accumulation + if(l < L && c < clen) + # pragma unroll + for(int i = 0, j = (int)(QUPS_L0) - l, jr = jlen - 1 - j; i < ilen || jr >= 0; ++i, --jr) // forward and reverse iterate over both signals + if(0 <= i && i < ilen && 0 <= jr && jr < jlen) // if both signals in bounds + za += cmul(x[c + i*istr + s*I] , y[c + jr*jstr + s*J]); // accumulate the cross product + + // output result: + if(l < L && c < clen) + z[c + l*lstr + s*L] = za; +} + diff --git a/src/greens.cl b/src/greens.cl new file mode 100644 index 00000000..75dcbdf6 --- /dev/null +++ b/src/greens.cl @@ -0,0 +1,75 @@ +// # include "helper_math.h" // vector math + +// # include "sizes.cu" // size defines + +# include "interpolators.cl" // samplers using constant sizing + +# if QUPS_PRECISION == 32 +typedef float4 U4; +# elif QUPS_PRECISION == 64 +typedef double4 U4; +# elif QUPS_PRECISION == 16 +typedef float4 U4; +# endif + +kernel void greens(global T2 * __restrict__ y, + global const U4 * __restrict__ pi, global const T2 * __restrict__ a, + global const U4 * __restrict__ pr, global const U4 * __restrict__ pv, + global const T2 * __restrict__ x, global const V * __restrict__ sb, global const ulong * iblock, + global const V s0t0fscinv[5], + global const int * E, const int iflag + ) { + + // extract time parameters + const V s0 = s0t0fscinv[0]; + const V t0 = s0t0fscinv[1]; + const V fs = s0t0fscinv[2]; + const V fsr = s0t0fscinv[3]; + const V cinv = s0t0fscinv[4]; + + // get starting index of this scatterer + const ulong s = get_global_id(0); // time + const ulong n = get_global_id(1); // rx + const ulong m = get_global_id(2); // tx + + // rename for readability + const ulong N = QUPS_N, S = QUPS_S, M = QUPS_M, I = QUPS_I; //, T = QUPS_T; + // rxs, num scat, output time size, txs, kernel time size, + // S is size of output, T is size of input kernel, I the number of scats + + // temp vars + // const float ts = s + s0*fs; // compute the time index for this thread + const T2 zero_v = (T2)(0, 0), fpow = (T2)(fsr, fsr); // OOB value, power scaling + V r, tau; // length, time (tmp values) + T2 val = zero_v; // accumulator + + // if valid scat, for each tx/rx + if(s < S){ + for(ulong i = iblock[2*get_group_id(0)+0]; i <= iblock[2*get_group_id(0)+1] && i < I; ++i){ // for each scatterer + if(s >= sb[2*i+0]){ // if within sampling window + # pragma unroll + for(ulong me = 0; me < E[1]; ++me){ // for each tx sub-aperture + # pragma unroll + for(ulong ne = 0; ne < E[0]; ++ne){ // for each rx sub-aperture + + // 2-way path distance + r = (length(pi[i] - pr[n + ne*N]) + length(pi[i] - pv[m + me*M])); // (virtual) transmit to pixel vector + + // get kernel delay 
diff --git a/src/greens.cl b/src/greens.cl
new file mode 100644
index 00000000..75dcbdf6
--- /dev/null
+++ b/src/greens.cl
@@ -0,0 +1,75 @@
+// # include "helper_math.h" // vector math
+
+// # include "sizes.cu" // size defines
+
+# include "interpolators.cl" // samplers using constant sizing
+
+# if QUPS_PRECISION == 32
+typedef float4 U4;
+# elif QUPS_PRECISION == 64
+typedef double4 U4;
+# elif QUPS_PRECISION == 16
+typedef float4 U4; // (positions stay in single precision for half data)
+# endif
+
+kernel void greens(global T2 * __restrict__ y,
+    global const U4 * __restrict__ pi, global const T2 * __restrict__ a,
+    global const U4 * __restrict__ pr, global const U4 * __restrict__ pv,
+    global const T2 * __restrict__ x, global const V * __restrict__ sb, global const ulong * iblock,
+    global const V s0t0fscinv[5],
+    global const int * E, const int iflag
+    ) {
+
+    // extract time parameters
+    const V s0   = s0t0fscinv[0];
+    const V t0   = s0t0fscinv[1];
+    const V fs   = s0t0fscinv[2];
+    const V fsr  = s0t0fscinv[3];
+    const V cinv = s0t0fscinv[4];
+
+    // get the global indices for this thread
+    const ulong s = get_global_id(0); // time
+    const ulong n = get_global_id(1); // rx
+    const ulong m = get_global_id(2); // tx
+
+    // rename for readability:
+    // N: # of rxs, S: output time length, M: # of txs, I: # of scatterers
+    // (T, unused here, is the input kernel time length)
+    const ulong N = QUPS_N, S = QUPS_S, M = QUPS_M, I = QUPS_I; //, T = QUPS_T;
+
+    // temp vars
+    // const float ts = s + s0*fs; // compute the time index for this thread
+    const T2 zero_v = (T2)(0, 0), fpow = (T2)(fsr, fsr); // OOB value, power scaling
+    V r, tau; // length, time (tmp values)
+    T2 val = zero_v; // accumulator
+
+    // if a valid output time sample, for each scatterer and tx/rx sub-aperture
+    if(s < S){
+        for(ulong i = iblock[2*get_group_id(0)+0]; i <= iblock[2*get_group_id(0)+1] && i < I; ++i){ // for each scatterer
+            if(s >= sb[2*i+0]){ // if within the sampling window
+                # pragma unroll
+                for(ulong me = 0; me < E[1]; ++me){ // for each tx sub-aperture
+                    # pragma unroll
+                    for(ulong ne = 0; ne < E[0]; ++ne){ // for each rx sub-aperture
+
+                        // 2-way path distance: scatterer to rx + scatterer to (virtual) tx
+                        r = (length(pi[i] - pr[n + ne*N]) + length(pi[i] - pv[m + me*M]));
+
+                        // get the kernel delay for the scatterer
+                        tau = (V)s - (cinv * r + t0 - s0)*fs;
+
+                        // sample the kernel and add to the signal at this time
+                        // fsr applies a 'stretch' operation to the sample time, because the
+                        // input data x is sampled at sampling frequency fsr * fs
+                        val += cmul(a[i], sample(x, fsr*tau, iflag, zero_v)); // out of bounds: extrap 0
+                    }
+                }
+            }
+        }
+
+        // output the signal once all scatterers and sub-apertures are sampled
+        // normalize by the sampling ratio (element-wise division scales both components by fsr)
+        y[s + n*S + m*N*S] = val / fpow;
+    }
+}
+
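The `QUPS_*` sizes and the `T2`/`U4`/`V` types referenced above are resolved at JIT-compile time rather than in the source. A hedged sketch of the corresponding host-side build step follows; the macro names are taken from the kernels, while the values and the include directory are placeholders:

// hypothetical JIT build of greens.cl; values are placeholders
const char *opts = "-Ipath/to/src "
                   "-DQUPS_PRECISION=32 -DQUPS_T=512 -DQUPS_S=2048 "
                   "-DQUPS_N=64 -DQUPS_M=64 -DQUPS_I=10000";
cl_int err = clBuildProgram(program, 1, &device, opts, NULL, NULL);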
diff --git a/src/interpd.cl b/src/interpd.cl
new file mode 100644
index 00000000..4888c3cb
--- /dev/null
+++ b/src/interpd.cl
@@ -0,0 +1,291 @@
+// defaults
+# ifndef QUPS_INTERPD_FLAG
+# define QUPS_INTERPD_FLAG 2
+# endif
+# ifndef QUPS_INTERPD_NO_V
+# define QUPS_INTERPD_NO_V 0.f
+# endif
+# ifndef QUPS_INTERPD_OMEGA
+# define QUPS_INTERPD_OMEGA 0
+# endif
+
+# include "interpolators.cl"
+
+// (T2, U, V) = channel data type, time data type, time-sampling type (set at JIT compile time)
+kernel void interpd(global T2 * y, global const T2 * x, global const U * tau, const int flag) {
+
+    // get sampling index
+    const size_t tid = get_global_id(0);
+    const size_t n   = get_global_id(1);
+    // (the m index is folded into work dimension 0 and recovered below)
+
+    // rename for readability
+    const size_t I = QUPS_I, M = QUPS_S, N = QUPS_N, T = QUPS_T, F = QUPS_F;
+    const T2 no_v = QUPS_INTERPD_NO_V;
+
+    // remap indices
+    const size_t i = tid % I;
+    const size_t m = tid / I;
+
+    // if a valid sample, for each frame
+    if(i < I && n < N && m < M){
+        # pragma unroll
+        for(size_t f = 0; f < F; ++f){ // per frame
+            y[i + n*I + m*N*I + f*M*N*I] = sample(&x[n*T + f*N*T], (V)(tau[i + n*I + m*I*N]), flag, no_v);
+        }
+    }
+}
+
+// reference CAS-loop implementations, kept for reference (unused) ----------
+/*
+void __attribute__((always_inline)) atomic_add_f(volatile global float* addr, const float val) {
+    union {
+        uint  u32;
+        float f32;
+    } next, expected, current;
+    current.f32 = *addr;
+    do {
+        next.f32    = (expected.f32=current.f32)+val; // ...*val for atomic_mul_f()
+        current.u32 = atomic_cmpxchg((volatile global uint*)addr, expected.u32, next.u32);
+    } while(current.u32!=expected.u32);
+}
+
+#ifdef cl_khr_int64_base_atomics
+// #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+void __attribute__((always_inline)) atomic_add_d(volatile global double* addr, const double val) {
+    union {
+        ulong  u64;
+        double f64;
+    } next, expected, current;
+    current.f64 = *addr;
+    do {
+        next.f64    = (expected.f64=current.f64)+val; // ...*val for atomic_mul_d()
+        current.u64 = atom_cmpxchg((volatile global ulong*)addr, expected.u64, next.u64);
+    } while(current.u64!=expected.u64);
+}
+#endif
+*/
+// --------------------------------------------------------------------
+
+// atomicAdd is not natively supported for floating point types
+// for double
+#ifdef cl_khr_fp64
+double atomicAdd(volatile global double* address, double val)
+{
+    // view data as a double (.f) or 64-bit integer (.i)
+    union {double f; long i;} old, curr, nxt;
+    old.f = *address; // previous value
+
+    do {
+        curr.i = old.i; // current value
+        nxt.f  = (curr.f + val); // desired next value
+        // attempt to swap (atom_cmpxchg is the 64-bit form, from cl_khr_int64_base_atomics)
+        old.i  = atom_cmpxchg((volatile global long *) address, curr.i, nxt.i);
+
+        // Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN)
+    } while (curr.i != old.i);
+
+    return old.f;
+}
+#endif
+
+// for float
+float atomicAddf(volatile global float* address, float val)
+{
+    // view data as a float (.f) or int (.i)
+    union {float f; int i;} old, curr, nxt;
+    old.f = *address; // previous value
+
+    do {
+        curr.i = old.i; // current value
+        nxt.f  = (curr.f + val); // desired next value
+        old.i  = atomic_cmpxchg((volatile global int *)address, curr.i, nxt.i); // attempt to swap
+
+        // Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN)
+    } while (curr.i != old.i);
+
+    return old.f;
+}
+
+// for half2
+#ifdef cl_khr_fp16
+half2 atomicAddh(volatile global half2* address, half2 val)
+{
+    // view data as a half2 (.f) or int (.i): both halves are swapped in one 32-bit CAS
+    union {half2 f; int i;} old, curr, nxt;
+    old.f = *address; // previous value
+
+    do {
+        curr.i = old.i; // current value
+        nxt.f  = (curr.f + val); // desired next value
+        old.i  = atomic_cmpxchg((volatile global int *) address, curr.i, nxt.i); // attempt to swap
+
+        // Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN)
+    } while (curr.i != old.i);
+
+    return old.f;
+}
+#endif
+
+// add using the atomic functions for each data type
+#ifdef cl_khr_fp16
+inline void atomicAddStoreh(volatile global half2 * y, const half2 val){
+    atomicAddh(y, val);
+}
+#endif
+inline void atomicAddStoref(volatile global float2 * y, const float2 val){
+    atomicAddf(((volatile global float*) y) + 0, val.x);
+    atomicAddf(((volatile global float*) y) + 1, val.y);
+}
+#ifdef cl_khr_fp64
+inline void atomicAddStore(volatile global double2 * y, const double2 val){
+    atomicAdd(((volatile global double*) y) + 0, val.x);
+    atomicAdd(((volatile global double*) y) + 1, val.y);
+}
+#endif
+
+// compute the global (flat) offset for the logical index dind = {i, n, f},
+// where iflags labels each of the QUPS_S data dimensions with 0 (I), 1 (N), or 2 (F)
+size_t global_offset(const size_t * dind, const global size_t * sizes, const global uchar * iflags){
+    // init
+    size_t dsz[3] = {1,1,1}; // {I,N,F} index cumulative sizes
+    size_t str, j = 0; // stride, output index
+
+    // find offset
+    # pragma unroll
+    for(char i = 0; i < 3; ++i){ // each label
+        str = 1; // reset stride
+        # pragma unroll
+        for(size_t s = 0; s < QUPS_S; ++s){ // for each data dimension
+            if(i == iflags[s]){ // matching label
+                const size_t k = (dind[i] / dsz[i]) % sizes[s]; // get sub-index
+                dsz[i] *= sizes[s]; // increase size for this label
+                j += str * k; // add offset
+            }
+            str *= sizes[s]; // increase indexing stride
+        }
+    }
+
+    return j;
+}
+
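An editor's illustration (not part of the patch): a plain-C transcription of `global_offset()` showing how `iflags` maps labelled data dimensions onto a flat offset. The sizes, labels, and indices below are made up for the example:

#include <stdio.h>

static size_t global_offset_ref(const size_t dind[3], const size_t *sizes,
                                const unsigned char *iflags, size_t S) {
    size_t dsz[3] = {1, 1, 1}; // cumulative size seen so far per label
    size_t j = 0;              // flat output offset
    for (int i = 0; i < 3; ++i) {      // for each label {I=0, N=1, F=2}
        size_t str = 1;                // stride of data dimension s
        for (size_t s = 0; s < S; ++s) {
            if (iflags[s] == i) {      // dimension s carries label i
                size_t k = (dind[i] / dsz[i]) % sizes[s]; // sub-index
                dsz[i] *= sizes[s];
                j += str * k;
            }
            str *= sizes[s];
        }
    }
    return j;
}

int main(void) {
    // data laid out as (4 x 8 x 2), labelled (I, N, F)
    size_t sizes[3] = {4, 8, 2};
    unsigned char iflags[3] = {0, 1, 2};
    size_t dind[3] = {3, 5, 1}; // logical (i, n, f)
    // expect 3 + 5*4 + 1*4*8 = 55
    printf("%zu\n", global_offset_ref(dind, sizes, iflags, 3));
    return 0;
}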
+// (T2, U, V) = channel data type, time data type, time-sampling type (set at JIT compile time)
+kernel void wsinterpd(volatile global T2 * y,
+    const global T2 * w, const global T2 * x,
+    const global U * tau, const global ulong * sizes,
+    const global uchar * iflags, const global ulong * dstride){
+    // , const int flag, const T2 no_v, const U omega){
+
+    // alias constant inputs
+    const int flag  = QUPS_INTERPD_FLAG;
+    const T2  no_v  = QUPS_INTERPD_NO_V;
+    const U   omega = QUPS_INTERPD_OMEGA;
+
+    // get sampling index
+    const ulong i = get_global_id(0);
+    const ulong n = get_global_id(1);
+
+    // rename for readability
+    const ulong I = QUPS_I, N = QUPS_N, F = QUPS_F, S = QUPS_S, T = QUPS_T; // , M = QUPS_M
+    ulong u,v,k,l,sz; // weighting / output indexing
+
+    // if a valid sample, per each (i,n,f)
+    if(i < I && n < N){
+        # pragma unroll
+        for(ulong f = 0; f < F; ++f){ // for each frame f
+            // global index
+            const ulong dind[3] = {i,n,f};
+            const ulong j = global_offset(dind, sizes, iflags);
+
+            // get the weight / output / time / sample indices
+            k = 0, l = 0; u = 0; v = 0;
+            # pragma unroll
+            for(ulong s = 0; s < S; ++s){ // for each dimension s
+                // the indexing stride for dimension s, i.e. the
+                // product of the sizes of all prior dimensions
+                sz = 1;
+                # pragma unroll
+                for(ulong sp = 0; sp < s; ++sp)
+                    sz *= sizes[sp];
+
+                const ulong js = ((j / sz) % sizes[s]); // index for this dimension
+                k += js * dstride[0 + 4*s]; // add pitched index for this dim (weights)
+                l += js * dstride[1 + 4*s]; // add pitched index for this dim (outputs)
+                u += js * dstride[2 + 4*s]; // add pitched index for this dim (time)
+                v += js * dstride[3 + 4*s]; // add pitched index for this dim (samples)
+            }
+
+            const T2 a = (omega != 0) ? (T2)(cos(omega * tau[u]), sin(omega * tau[u])) : (T2)(1.f, 0.f); // modulation phasor
+            const T2 val = cmul(a, cmul(w[k], sample(&x[(v)*T], (V)tau[u], flag, no_v))); // weighted sample
+
+# if QUPS_PRECISION == 64
+            atomicAddStore(y + l, val); // store
+# elif QUPS_PRECISION == 32
+            atomicAddStoref(y + l, val); // store
+# elif QUPS_PRECISION == 16
+            atomicAddStoreh(y + l, val); // store
+# endif
+
+        }
+    }
+}
+
+// (T2, U, V) = channel data type, time data type, time-sampling type (set at JIT compile time)
+kernel void wsinterpd2(volatile global T2 * y,
+    const global T2 * w, const global T2 * x,
+    const global U * tau1, const global U * tau2, const global ulong * sizes,
+    const global uchar * iflags, const global ulong * dstride){
+    // , const int flag, const T2 no_v, const U omega) {
+
+    // alias constant inputs
+    const int flag  = QUPS_INTERPD_FLAG;
+    const T2  no_v  = QUPS_INTERPD_NO_V;
+    const U   omega = QUPS_INTERPD_OMEGA;
+
+    // get sampling index
+    const size_t i = get_global_id(0);
+    const size_t n = get_global_id(1);
+
+    // rename for readability
+    const size_t I = QUPS_I, N = QUPS_N, F = QUPS_F, S = QUPS_S, T = QUPS_T; // , M = QUPS_M
+    size_t r,u,v,k,l,sz; // weighting / output indexing
+
+    // if a valid sample, per each (i,n,f)
+    if(i < I && n < N){
+        # pragma unroll
+        for(size_t f = 0; f < F; ++f){ // for each frame f
+            // global index
+            size_t dind[3] = {i,n,f};
+            const size_t j = global_offset(dind, sizes, iflags);
+
+            // get the weight / output / time / sample indices
+            k = 0, l = 0; r = 0; u = 0; v = 0;
+            # pragma unroll
+            for(size_t s = 0; s < S; ++s){ // for each dimension s
+                // the indexing stride for dimension s, i.e. the
+                // product of the sizes of all prior dimensions
+                sz = 1;
+                for(size_t sp = 0; sp < s; ++sp)
+                    sz *= sizes[sp];
+
+                const size_t js = ((j / sz) % sizes[s]); // index for this dimension
+                k += js * dstride[0 + 5*s]; // add pitched index for this dim (weights)
+                l += js * dstride[1 + 5*s]; // add pitched index for this dim (outputs)
+                r += js * dstride[2 + 5*s]; // add pitched index for this dim (time-1)
+                u += js * dstride[3 + 5*s]; // add pitched index for this dim (time-2)
+                v += js * dstride[4 + 5*s]; // add pitched index for this dim (samples)
+            }
+            const U t = tau1[r] + tau2[u]; // time
+            const T2 a = (omega != 0) ? (T2)(cos(omega * t), sin(omega * t)) : (T2)(1.f, 0.f); // modulation phasor
+            const T2 val = cmul(a, cmul(w[k], sample(&x[(v)*T], (V)t, flag, no_v))); // weighted sample
+# if QUPS_PRECISION == 64
+            atomicAddStore(&y[l], val); // store
+# elif QUPS_PRECISION == 32
+            atomicAddStoref(&y[l], val); // store
+# elif QUPS_PRECISION == 16
+            atomicAddStoreh(&y[l], val); // store
+# endif
+        }
+    }
+}
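The `dstride` tables above are read as a fixed number of strides per data dimension: 4 for `wsinterpd` and 5 for `wsinterpd2`. A hedged host-side sketch of that packing follows; the stride arrays (`wstr`, `ystr`, `tstr`, `xstr`) are hypothetical, and a stride of 0 in a given dimension would reuse the same element, i.e. broadcast that array along the dimension:

// hypothetical packing of dstride for wsinterpd (4 strides per dimension)
#include <CL/cl.h>

void pack_dstride(size_t S, const cl_ulong *wstr, const cl_ulong *ystr,
                  const cl_ulong *tstr, const cl_ulong *xstr, cl_ulong *dstride) {
    for (size_t s = 0; s < S; ++s) {
        dstride[0 + 4*s] = wstr[s]; // weights w
        dstride[1 + 4*s] = ystr[s]; // outputs y
        dstride[2 + 4*s] = tstr[s]; // sample times tau
        dstride[3 + 4*s] = xstr[s]; // input samples x (per T-length block)
    }
    // wsinterpd2 would use the same scheme with 5 entries per dimension:
    // {weights, outputs, time-1, time-2, samples}
}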
diff --git a/src/interpolators.cl b/src/interpolators.cl
new file mode 100644
index 00000000..faf59db7
--- /dev/null
+++ b/src/interpolators.cl
@@ -0,0 +1,162 @@
+/**
+ @file gpuBF/interp1d.cuh
+ @author Dongwoon Hyun (dongwoon.hyun@stanford.edu)
+ @date 2021-03-08
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Modified for use as a stand-alone OpenCL file (Thurston Brevett)
+// data is (T x N x F)
+// sample times are (I x N x M)
+
+// enable 64-bit integer atomics (must be supported by the device!)
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+
+// DEBUG: constant defines for debugging / if not specified at compile time
+// default precision
+// # ifndef QUPS_PRECISION
+// # define QUPS_PRECISION 32
+// # endif
+
+// DEBUG: to avoid linter errors
+/*
+# ifndef QUPS_T
+# define QUPS_T 1024
+# define QUPS_N 1
+# define QUPS_M 1
+# define QUPS_F 1
+# define QUPS_I 1
+# define QUPS_S 1024
+# endif
+*/
+
+// should be JIT-compiled with (T2, U, V) set to the complex-data, real-data, and time types
+# if QUPS_PRECISION == 32
+typedef float2 T2;
+typedef float  U ;
+typedef float  V ;
+# define PI_VAL M_PI_F
+# elif QUPS_PRECISION == 64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable // must enable double precision
+typedef double2 T2;
+typedef double  U ;
+typedef double  V ;
+# define PI_VAL M_PI
+# elif QUPS_PRECISION == 16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable // must enable half precision
+typedef half2 T2;
+typedef half  U ;
+typedef half  V ;
+# define PI_VAL M_PI_H
+# endif
+
+// complex multiplication
+T2 cmul(const T2 a, const T2 b){
+    return (T2)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
+}
+
+/// @brief Device function for nearest-neighbor interpolation
+T2 nearest(global const T2 * x, U tau, T2 no_v) {
+    const int ti = (int) round(tau); // round to nearest integer
+    return (0 <= ti && ti < QUPS_T) ? x[ti] : no_v;
+}
+
+T2 lerp(const T2 a, const T2 b, const U t) {
+    return a + t*(b-a);
+}
+
+/// @brief Device function for linear interpolation
+T2 linear(global const T2 * x, U tau, T2 no_v) {
+    U tif; // integer part
+
+    // fractional and integer part
+    const U tf = modf(tau, &tif);
+    const int ti = (int) tif;
+
+    // if in bounds, linearly interpolate by the fraction tf between time indices ti and ti+1
+    return (0 <= ti && ti + 1 < QUPS_T) ? lerp(x[ti], x[ti + 1], tf) : no_v;
+}
+
+/// @brief Device function for cubic Hermite interpolation
+T2 cubic(global const T2 * x, U tau, T2 no_v) {
+    U tf;
+    const U u = modf(tau, &tf); // u is the fractional part, tf the integer part
+    const int ti = (int) tf;
+
+    if (!(0 <= (ti - 1) && (ti + 2) < QUPS_T))
+        return no_v;
+
+    T2 s0 = x[ti - 1];
+    T2 s1 = x[ti + 0];
+    T2 s2 = x[ti + 1];
+    T2 s3 = x[ti + 2];
+
+    // Cubic Hermite interpolation (Catmull-Rom), in Horner form
+    // (the compiler can contract these into fused multiply-adds)
+    U a0 = 0 + u * (-1 + u * (+2 - 1 * u));
+    U a1 = 2 + u * (+0 + u * (-5 + 3 * u));
+    U a2 = 0 + u * (+1 + u * (+4 - 3 * u));
+    U a3 = 0 + u * (+0 + u * (-1 + 1 * u));
+    // // Cubic Hermite interpolation (naive, less precise implementation)
+    // float a0 = -1 * u * u * u + 2 * u * u - 1 * u + 0;
+    // float a1 = +3 * u * u * u - 5 * u * u + 0 * u + 2;
+    // float a2 = -3 * u * u * u + 4 * u * u + 1 * u + 0;
+    // float a3 = +1 * u * u * u - 1 * u * u + 0 * u + 0;
+    return (s0 * a0 + s1 * a1 + s2 * a2 + s3 * a3) * 0.5f;
+}
+
+/// @brief Inline helper code for Lanczos 3-lobe interpolation
+U lanczos_helper(U u, int a) {
+    return (u == 0.f ? 1.f : (2.f * sinpi(u)*sinpi(u/a) / (PI_VAL*PI_VAL * u*u)));
+}
+
+/// @brief Device function for Lanczos 3-lobe interpolation
+T2 lanczos3(global const T2 * x, U tau, T2 no_v) {
+    const int a = 2; // a=2 for 3-lobe Lanczos resampling
+    U tf;
+    const U u = modf(tau, &tf); // u is the fractional part, tf the integer part
+    const int ti = (int) tf;
+
+    if (!(0 <= (ti - 1) && (ti + 2) < QUPS_T))
+        return no_v;
+
+    T2 s0 = x[ti - 1];
+    T2 s1 = x[ti + 0];
+    T2 s2 = x[ti + 1];
+    T2 s3 = x[ti + 2];
+    U a0 = lanczos_helper(u + 1, a);
+    U a1 = lanczos_helper(u + 0, a);
+    U a2 = lanczos_helper(u - 1, a);
+    U a3 = lanczos_helper(u - 2, a);
+    return s0 * a0 + s1 * a1 + s2 * a2 + s3 * a3;
+}
+
+T2 sample(global const T2 * x, U tau, int flag, const T2 no_v){
+    // sample according to the flag
+    if (flag == 0)
+        return nearest (x, tau, no_v);
+    else if (flag == 1)
+        return linear  (x, tau, no_v);
+    else if (flag == 2)
+        return cubic   (x, tau, no_v);
+    else if (flag == 3)
+        return lanczos3(x, tau, no_v);
+    else if (flag == 4) // flag 4 also maps to linear interpolation
+        return linear  (x, tau, no_v);
+    else
+        return no_v;
+}
+
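The Horner-form Catmull-Rom weights in `cubic` above are algebraically equal to the commented-out naive polynomials (the active forms here are written to match them). A small host-side consistency check in plain C, under that assumption:

// compare Horner-form and naive Catmull-Rom weights over the unit interval
#include <assert.h>
#include <math.h>

int main(void) {
    for (int k = 0; k <= 8; ++k) {
        const double u = k / 8.0;
        const double h[4] = { // Horner form
            0 + u * (-1 + u * (+2 - 1 * u)),
            2 + u * (+0 + u * (-5 + 3 * u)),
            0 + u * (+1 + u * (+4 - 3 * u)),
            0 + u * (+0 + u * (-1 + 1 * u)),
        };
        const double p[4] = { // naive polynomial form
            -1*u*u*u + 2*u*u - 1*u + 0,
            +3*u*u*u - 5*u*u + 0*u + 2,
            -3*u*u*u + 4*u*u + 1*u + 0,
            +1*u*u*u - 1*u*u + 0*u + 0,
        };
        for (int w = 0; w < 4; ++w)
            assert(fabs(h[w] - p[w]) < 1e-12);
        // the weights sum to 2 before the final 0.5 scaling
        assert(fabs(h[0] + h[1] + h[2] + h[3] - 2.0) < 1e-12);
    }
    return 0;
}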
diff --git a/src/wbilerp.cl b/src/wbilerp.cl
new file mode 100644
index 00000000..d36c95ca
--- /dev/null
+++ b/src/wbilerp.cl
@@ -0,0 +1,105 @@
+// linter defaults (if not specified at compile time)
+# ifndef QUPS_PRECISION
+# define QUPS_PRECISION 32
+# endif
+
+# if QUPS_PRECISION == 32
+typedef float  U;
+typedef float2 U2;
+# elif QUPS_PRECISION == 64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable // must enable double precision
+typedef double  U;
+typedef double2 U2;
+# elif QUPS_PRECISION == 16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable // must enable half precision
+typedef half  U;
+typedef half2 U2;
+# endif
+
+// convert integer to U type
+inline U2 i2U(const int2 a){return (U2)(a.x, a.y);}
+
+kernel void wbilerp(
+    global int2 * ixy, global U * cxy, global const U2 * pall,
+    global const U * xg, global const U * yg,
+    const ulong X, const ulong Y, const ulong I,
+    global const U2 * pml, global const U2 * pmu
+    ){
+
+    const ulong i = get_global_id(0);
+    const int N = X + Y + 1; // number of line segments (pall holds N+1 points per batch)
+    int ix = 0; // grid x position (iterator)
+    int iy = 0; // grid y position (iterator)
+
+    if(i < I){
+        // index for this batch of pall
+        const int ip = i * (N + 1); // pall is length N+1
+
+        // compute all integrals one segment at a time
+        for(int n = 0; n < N; ++n){
+            // get both points (x/y by pair)
+            const U2 p1 = pall[ip + n    ];
+            const U2 p2 = pall[ip + n + 1];
+
+            // length of the line segment
+            // TODO: switch to hypot for numerical accuracy
+            const U l = length(p2 - p1); // hypot(p2(1) - p1(1), p2(2) - p1(2));
+
+            // whether both points are within the support of the grid / line
+            const bool val =
+                   pml[i].x <= p1.x & p1.x <= pmu[i].x
+                 & pml[i].y <= p1.y & p1.y <= pmu[i].y
+                 & pml[i].x <= p2.x & p2.x <= pmu[i].x
+                 & pml[i].y <= p2.y & p2.y <= pmu[i].y;
+
+            // skip the integral if out of the grid / line support, if the segment length is zero, or if co-located in x
+            if ((!val) || (l == (U)0) || (p2.x == p1.x)) continue;
+
+            // if any point is not in x/y-bounds, move to the next x/y-grid interval
+            while ((p1.x >= xg[ix+1]) && (p2.x >= xg[ix+1]) && (ix+2 < X)) ++ix;
+            while ((p1.y >= yg[iy+1]) && (p2.y >= yg[iy+1]) && (iy+2 < Y)) ++iy;
+
+            // get the line segment region's size
+            const U2 dp = (U2)(xg[ix+1] - xg[ix], yg[iy+1] - yg[iy]);
+
+            // get the indices for each of the four grid points affected by this line segment
+            # pragma unroll
+            for(int iyy = 0; iyy < 2; ++iyy){
+                # pragma unroll
+                for(int ixx = 0; ixx < 2; ++ixx){
+                    // ixloc = [ix, ix+1, ix,   ix+1]; // x index
+                    // iyloc = [iy, iy,   iy+1, iy+1]; // y index
+                    const int ixloc = ix + ixx; // x index
+                    const int iyloc = iy + iyy; // y index
+
+                    // grid points
+                    // q = [xg(ixloc)'; yg(iyloc)'];
+                    const U2 q = (U2)(xg[ixloc], yg[iyloc]);
+
+                    ////// inline the integral //////
+                    // s_d == -1 if in the positive quadrant of dimension d
+                    const int2 s = (int2)(
+                        ( (p1.x <= q.x) && (p2.x <= q.x) ) ? 1 : -1,
+                        ( (p1.y <= q.y) && (p2.y <= q.y) ) ? 1 : -1
+                    );
+
+                    // coefficients for the integral
+                    // TODO: use copysign instead of multiplying by +/- 1
+                    // TODO: write out the equations to avoid an accidental complex overload
+                    const U2 ONE = (U2)(1.f, 1.f);
+                    const U2 a = ONE + i2U(s) * (p1 - q ) / dp; // (non-complex) vector division
+                    const U2 b =       i2U(s) * (p1 - p2) / dp; // (non-complex) vector division
+
+                    // evaluation of the integral: subject to numerical precision issues
+                    const U c = a.x*a.y - (a.x*b.y + a.y*b.x) / (U)2 + (b.x*b.y) / (U)3;
+
+                    // apply the line integral formula w.r.t. the quadrant
+                    // TODO: redo the write indexing to vectorize writes
+                    ixy[i*4*N + 4*n + 2*iyy + ixx] = (int2)(ixloc, iyloc); // output indices
+                    cxy[i*4*N + 4*n + 2*iyy + ixx] = l * c; // segment length weighted by the bilinear integral
+                }
+            }
+        }
+    }
+}
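The closed-form coefficient `c` above appears to evaluate the segment integral of the bilinear weight, i.e. the integral over t in [0,1] of (a.x - b.x*t)*(a.y - b.y*t), whose exact value is a.x*a.y - (a.x*b.y + a.y*b.x)/2 + b.x*b.y/3. A host-side sketch comparing the closed form against numerical quadrature, with arbitrary made-up coefficients:

// check the closed-form segment integral against the midpoint rule
#include <stdio.h>

int main(void) {
    const double ax = 0.75, ay = 0.40, bx = 0.30, by = -0.20; // arbitrary
    const double closed = ax*ay - (ax*by + ay*bx)/2.0 + (bx*by)/3.0;
    double num = 0.0;
    const int K = 1000000;
    for (int k = 0; k < K; ++k) {
        const double t = (k + 0.5) / K; // midpoint of sub-interval k
        num += (ax - bx*t) * (ay - by*t) / K;
    }
    printf("closed = %.9f, numeric = %.9f\n", closed, num);
    return 0;
}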