From 60eadc4b414bfd048ad3d0982099998c221a22fe Mon Sep 17 00:00:00 2001 From: rodrigovimieiro Date: Wed, 16 Feb 2022 13:23:46 -0500 Subject: [PATCH] CUDA Branchless Distance Driven codes --- README.md | 6 + pydbt/functions/FBP.py | 5 +- pydbt/functions/SIRT.py | 13 +- pydbt/functions/projection_operators.py | 162 ++++ pydbt/functions/utilities2setuptools.py | 103 +++ .../backprojectionDD/backprojectionDD.cpp | 29 +- .../backprojectionDDb/backprojectionDDb.cpp | 47 +- .../backprojectionDDb/backprojectionDDb.hpp | 15 +- .../backprojectionDDb_cuda.cpp | 132 +++ .../backprojectionDDb_cuda.hpp | 85 ++ .../sources/backprojectionDDb_cuda/kernel.cu | 751 ++++++++++++++++++ pydbt/sources/projectionDD/projectionDD.cpp | 28 +- pydbt/sources/projectionDDb/projectionDDb.cpp | 47 +- pydbt/sources/projectionDDb/projectionDDb.hpp | 17 +- pydbt/sources/projectionDDb_cuda/kernel.cu | 723 +++++++++++++++++ .../projectionDDb_cuda/projectionDDb_cuda.cpp | 131 +++ .../projectionDDb_cuda/projectionDDb_cuda.hpp | 86 ++ setup.py | 50 +- 18 files changed, 2357 insertions(+), 73 deletions(-) create mode 100644 pydbt/functions/projection_operators.py create mode 100644 pydbt/functions/utilities2setuptools.py create mode 100644 pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.cpp create mode 100644 pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.hpp create mode 100644 pydbt/sources/backprojectionDDb_cuda/kernel.cu create mode 100644 pydbt/sources/projectionDDb_cuda/kernel.cu create mode 100644 pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.cpp create mode 100644 pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.hpp diff --git a/README.md b/README.md index 66db6a7..54badd0 100644 --- a/README.md +++ b/README.md @@ -15,10 +15,16 @@ This repository is a python extension of the [DBT toolbox](https://github.com/LA * ```cd pyDBT``` + 3. Clone NVIDIA cuda-samples directory inside pyDBT: + + * ```git clone https://github.com/NVIDIA/cuda-samples``` + 3. Install the package: * ```python3 setup.py install``` + 3. If you have problems with `arch=sm_XX`, modify it in the `setup.py` accordingly to your NVIDIA-GPU architecture. This [link](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) has some references. + 4. run the example: * ```cd pydbt && python3 example.py``` diff --git a/pydbt/functions/FBP.py b/pydbt/functions/FBP.py index bf33115..31c33ed 100755 --- a/pydbt/functions/FBP.py +++ b/pydbt/functions/FBP.py @@ -7,8 +7,7 @@ """ import numpy as np -from .backprojectionDDb import backprojectionDDb -from .backprojectionDD import backprojectionDD +from .projection_operators import backprojectionDD def FDK(proj, geo, filterType, cutoff, libFiles): @@ -20,7 +19,7 @@ def FDK(proj, geo, filterType, cutoff, libFiles): else: raise ValueError('Unknown filter type.') - vol = backprojectionDD(proj, geo, libFiles) + vol = backprojectionDD(proj, geo, -1, libFiles) return vol diff --git a/pydbt/functions/SIRT.py b/pydbt/functions/SIRT.py index 324559d..d543248 100755 --- a/pydbt/functions/SIRT.py +++ b/pydbt/functions/SIRT.py @@ -9,8 +9,7 @@ import numpy as np import time -from .backprojectionDDb import backprojectionDDb -from .projectionDDb import projectionDDb +from .projection_operators import backprojectionDDb, projectionDDb def SIRT(proj, geo, nIter, libFiles): @@ -20,10 +19,12 @@ def SIRT(proj, geo, nIter, libFiles): reconData3d = np.zeros([geo.ny, geo.nx, geo.nz]) # Pre calculation of Projection normalization - proj_norm = projectionDDb(np.ones([geo.ny, geo.nx, geo.nz]), geo, libFiles) + proj_norm = projectionDDb(np.ones([geo.ny, geo.nx, geo.nz]), geo, -1, libFiles) + proj_norm[proj_norm == 0] = 1 # Pre calculation of Backprojection normalization - vol_norm = backprojectionDDb(np.ones([geo.nv, geo.nu, geo.nProj]), geo, libFiles) + vol_norm = backprojectionDDb(np.ones([geo.nv, geo.nu, geo.nProj]), geo, -1, libFiles) + vol_norm[vol_norm == 0] = 1 print('----------------\nStarting SIRT Iterations... \n\n') @@ -33,14 +34,14 @@ def SIRT(proj, geo, nIter, libFiles): startTime = time.time() # Error between raw data and projection of estimated data - proj_diff = proj - projectionDDb(reconData3d, geo, libFiles) + proj_diff = proj - projectionDDb(reconData3d, geo, -1, libFiles) # Projection normalization proj_diff = proj_diff / proj_norm proj_diff[np.isnan(proj_diff)] = 0 proj_diff[np.isinf(proj_diff)] = 0 - upt_term = backprojectionDDb(proj_diff, geo, libFiles) + upt_term = backprojectionDDb(proj_diff, geo, -1, libFiles) upt_term = upt_term / vol_norm # Volume normalization upt_term[np.isnan(upt_term)] = 0 upt_term[np.isinf(upt_term)] = 0 diff --git a/pydbt/functions/projection_operators.py b/pydbt/functions/projection_operators.py new file mode 100644 index 0000000..491f787 --- /dev/null +++ b/pydbt/functions/projection_operators.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Feb 16 10:29:14 2022 + +@author: rodrigo +""" + +import numpy as np +import numpy.ctypeslib as ctl +import ctypes +import warnings + +from .utilities import findAndLoadLibray +from .utilities import geoAsNp + +############################################################################## +# BackProjection # +############################################################################## + +def backprojectionDD(proj, geo, proj_num, libFiles): + + operator_type = 'backprojectionDD' + + vol = _backprojection(proj, geo, proj_num, libFiles, operator_type) + + return vol + +def backprojectionDDb(proj, geo, proj_num, libFiles): + + operator_type = 'backprojectionDDb' + + vol = _backprojection(proj, geo, proj_num, libFiles, operator_type) + + return vol + +def backprojectionDDb_cuda(proj, geo, proj_num, libFiles): + + operator_type = 'backprojectionDDb_cuda' + + vol = _backprojection(proj, geo, proj_num, libFiles, operator_type) + + return vol + +def _backprojection(proj, geo, proj_num, libFiles, operator_type): + + # Check if the input is in proper size + if not (proj.shape[0] == geo.nv and proj.shape[1] == geo.nu and proj.shape[2] == geo.nProj): + raise ValueError('First argument needs to have the same number of rows, cols and slices as in the configuration file.') + + check_parameters(operator_type, proj_num, geo) + + # Find and library + lib = findAndLoadLibray(libFiles, operator_type) + + + backprojection = getattr(lib, operator_type + '_lib') + + backprojection.argtypes = [ctl.ndpointer(np.float64, flags='aligned, c_contiguous'), + ctl.ndpointer(np.float64, flags='aligned, c_contiguous'), + ctl.ndpointer(np.float32, flags='aligned, c_contiguous'), + ctypes.c_long] + + + # Transform geo class in numpy array + geoNp = geoAsNp(geo) + + + vol_transp = np.empty([geo.nz, geo.nx, geo.ny], dtype=np.float64) + + proj_transp = np.transpose(proj, (2, 1, 0)).copy() + + backprojection(proj_transp, vol_transp, geoNp, proj_num) + + vol = np.transpose(vol_transp, (2, 1, 0)).copy() + + + return vol + + +############################################################################## +# Projection # +############################################################################## + +def projectionDD(vol, geo, proj_num, libFiles): + + operator_type = 'projectionDD' + + proj = _projection(vol, geo, proj_num, libFiles, operator_type) + + return proj + +def projectionDDb(vol, geo, proj_num, libFiles): + + operator_type = 'projectionDDb' + + proj = _projection(vol, geo, proj_num, libFiles, operator_type) + + return proj + +def projectionDDb_cuda(vol, geo, proj_num, libFiles): + + operator_type = 'projectionDDb_cuda' + + proj = _projection(vol, geo, proj_num, libFiles, operator_type) + + return proj + +def _projection(vol, geo, proj_num, libFiles, operator_type): + + # Check if the input is in proper size + if not (vol.shape[0] == geo.ny and vol.shape[1] == geo.nx and vol.shape[2] == geo.nz): + raise ValueError('First argument needs to have the same number of rows, cols and slices as in the configuration file.') + + check_parameters(operator_type, proj_num, geo) + + # Find and library + lib = findAndLoadLibray(libFiles, operator_type) + + projection = getattr(lib, operator_type + '_lib') + + projection.argtypes = [ctl.ndpointer(np.float64, flags='aligned, c_contiguous'), + ctl.ndpointer(np.float64, flags='aligned, c_contiguous'), + ctl.ndpointer(np.float32, flags='aligned, c_contiguous'), + ctypes.c_long] + + + # Transform geo class in numpy array + geoNp = geoAsNp(geo) + + + proj_transp = np.empty([geo.nProj, geo.nu, geo.nv], dtype=np.float64) + + vol_transp = np.transpose(vol, (2, 1, 0)).copy() + + projection(vol_transp, proj_transp, geoNp, proj_num) + + proj = np.transpose(proj_transp, (2, 1, 0)).copy() + + return proj + +############################################################################## +# Auxiliar functions # +############################################################################## + + +def check_parameters(operator_type, proj_num, geo): + + if proj_num < -1 and proj_num >= geo.nProj: + raise ValueError('Projection number needs to be between 0-(geo.nProj-1). If you want to operate on all projections, set it to -1') + + if geo.detAngle != 0 and 'DDb' in operator_type: + warnings.warn('Your detector rotates, its not recomended to used branchless projectors. Use DD only') + + if (geo.x_offset != 0 or geo.y_offset != 0) and ('DDb' in operator_type): + raise ValueError('Branchless operators dont work with volume offsets') + + return + + + + diff --git a/pydbt/functions/utilities2setuptools.py b/pydbt/functions/utilities2setuptools.py new file mode 100644 index 0000000..6487049 --- /dev/null +++ b/pydbt/functions/utilities2setuptools.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Feb 15 16:41:20 2022 + +@author: Rodrigo + +Reference: https://stackoverflow.com/a/13300714/8682939 + +""" +import os + +from os.path import join as pjoin +from distutils.command.build_ext import build_ext + + +def find_in_path(name, path): + "Find a file in a search path" + #adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ + for dir in path.split(os.pathsep): + binpath = pjoin(dir, name) + if os.path.exists(binpath): + return os.path.abspath(binpath) + return None + +def locate_cuda(): + """Locate the CUDA environment on the system + + Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' + and values giving the absolute path to each directory. + + Starts by looking for the CUDAHOME env variable. If not found, everything + is based on finding 'nvcc' in the PATH. + """ + + # first check if the CUDAHOME env variable is in use + if 'CUDAHOME' in os.environ: + home = os.environ['CUDAHOME'] + nvcc = pjoin(home, 'bin', 'nvcc') + else: + # otherwise, search the PATH for NVCC + nvcc = find_in_path('nvcc', os.environ['PATH']) + if nvcc is None: + raise EnvironmentError('The nvcc binary could not be ' + 'located in your $PATH. Either add it to your path, or set $CUDAHOME') + home = os.path.dirname(os.path.dirname(nvcc)) + + cudaconfig = {'home':home, 'nvcc':nvcc, + 'include': pjoin(home, 'include'), + 'lib64': pjoin(home, 'lib64')} + + for k, v in cudaconfig.items(): + if not os.path.exists(v): + raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) + + return cudaconfig + +def get_custom_build_ext(cuda_env): + + def customize_compiler_for_nvcc(self): + """inject deep into distutils to customize how the dispatch + to gcc/nvcc works. + + If you subclass UnixCCompiler, it's not trivial to get your subclass + injected in, and still have the right customizations (i.e. + distutils.sysconfig.customize_compiler) run on it. So instead of going + the OO route, I have this. Note, it's kindof like a wierd functional + subclassing going on.""" + + # tell the compiler it can processes .cu + self.src_extensions.append('.cu') + + # save references to the default compiler_so and _comple methods + default_compiler_so = self.compiler_so + super = self._compile + + # now redefine the _compile method. This gets executed for each + # object but distutils doesn't have the ability to change compilers + # based on source extension: we add it. + def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + if os.path.splitext(src)[1] == '.cu': + # use the cuda for .cu files + self.set_executable('compiler_so', cuda_env['nvcc']) + # use only a subset of the extra_postargs, which are 1-1 translated + # from the extra_compile_args in the Extension class + postargs = extra_postargs['nvcc'] + else: + postargs = extra_postargs['gcc'] + + super(obj, src, ext, cc_args, postargs, pp_opts) + # reset the default compiler_so, which we might have changed for cuda + self.compiler_so = default_compiler_so + + # inject our redefined _compile method into the class + self._compile = _compile + + # run the customize_compiler + class custom_build_ext(build_ext): + def build_extensions(self): + customize_compiler_for_nvcc(self.compiler) + build_ext.build_extensions(self) + + return custom_build_ext \ No newline at end of file diff --git a/pydbt/sources/backprojectionDD/backprojectionDD.cpp b/pydbt/sources/backprojectionDD/backprojectionDD.cpp index 88e699b..6ccd5e1 100644 --- a/pydbt/sources/backprojectionDD/backprojectionDD.cpp +++ b/pydbt/sources/backprojectionDD/backprojectionDD.cpp @@ -5,7 +5,7 @@ % ========================================================================= %{ % ------------------------------------------------------------------------- -% backprojectionDD_lib(pProj, pVolume, pGeo) +% backprojectionDD_lib(pProj, pVolume, pGeo, idXProj) % ------------------------------------------------------------------------- % DESCRIPTION: % This function reconstruct the 3D volume from projections, based on @@ -18,6 +18,7 @@ % % - pProj = Pointer to 2D projections for each angle % - pGeo = Pointer to parameters of all pGeometry +% - idXProj = projection number to be projected % % OUTPUT: % @@ -54,7 +55,8 @@ extern "C" void backprojectionDD_lib(double* const pProj, double* const pVolume, - float* const pGeo){ + float* const pGeo, + const signed int idXProj){ const int unsigned nPixX = (const int)pGeo[0]; @@ -81,8 +83,8 @@ extern "C" void backprojectionDD_lib(double* const pProj, double* const pTubeAngle = (double*)malloc(nProj * sizeof(double)); double* const pDetAngle = (double*)malloc(nProj * sizeof(double)); - const int x_offset = (const int)pGeo[18]; - const int y_offset = (const int)pGeo[19]; + const double x_offset = (const double)pGeo[18]; + const double y_offset = (const double)pGeo[19]; linspace(-tubeAngle/2, tubeAngle/2, nProj, pTubeAngle); linspace(-detAngle/2, detAngle/2, nProj, pDetAngle); @@ -158,8 +160,23 @@ extern "C" void backprojectionDD_lib(double* const pProj, pProjf[(p*nDetX*nDetY) + (x*nDetY) + y] = pProj[(p*nDetX*nDetY) + (x_inv*nDetY) + y]; + + // Test if we will loop over all projs or not + unsigned int projIni, projEnd, nProj2Run; + if(idXProj == -1){ + projIni = 0; + projEnd = nProj; + nProj2Run = nProj; + } + else{ + nProj2Run = 1; + projIni = (unsigned int) idXProj; + projEnd = (unsigned int) idXProj + 1; + } + + // For each projection - for (int p = 0; p < nProj; p++) { + for (unsigned int p = projIni; p < projEnd; p++) { // Get specif tube angle for the projection double theta = pTubeAngle[p] * M_PI / 180.0; @@ -378,7 +395,7 @@ extern "C" void backprojectionDD_lib(double* const pProj, for (int z = 0; z < nSlices; z++) for (int x = 0, x_inv = nPixX - 1; x < nPixX; x++, x_inv--) for (int y = 0; y < nPixY; y++) - pVolume[(z*nPixX*nPixY) + (x*nPixY) + y] = pVolumet[(z*nPixX*nPixY) + (x_inv*nPixY) + y] / (double) nProj; + pVolume[(z*nPixX*nPixY) + (x*nPixY) + y] = pVolumet[(z*nPixX*nPixY) + (x_inv*nPixY) + y] / (double) nProj2Run; free(pVolumet); diff --git a/pydbt/sources/backprojectionDDb/backprojectionDDb.cpp b/pydbt/sources/backprojectionDDb/backprojectionDDb.cpp index 1c83b6e..0110fcc 100644 --- a/pydbt/sources/backprojectionDDb/backprojectionDDb.cpp +++ b/pydbt/sources/backprojectionDDb/backprojectionDDb.cpp @@ -5,7 +5,7 @@ % ========================================================================= %{ % ------------------------------------------------------------------------- -% backprojectionDDb_lib(pProj, pVolume, pGeo) +% backprojectionDDb_lib(pProj, pVolume, pGeo, idXProj) % ------------------------------------------------------------------------- % DESCRIPTION: % This function reconstruct the 3D volume from projections, based on @@ -18,6 +18,7 @@ % % - pProj = Pointer to 2D projections for each angle % - pGeo = Pointer to parameters of all pGeometry +% - idXProj = projection number to be projected % % OUTPUT: % @@ -56,7 +57,8 @@ extern "C" void backprojectionDDb_lib(double* const pProj, double* const pVolume, - float* const pGeo){ + float* const pGeo, + const signed int idXProj){ const int unsigned nPixX = (const int)pGeo[0]; @@ -83,15 +85,15 @@ extern "C" void backprojectionDDb_lib(double* const pProj, double* const pTubeAngle = (double*)malloc(nProj * sizeof(double)); double* const pDetAngle = (double*)malloc(nProj * sizeof(double)); - const int x_offset = (const int)pGeo[18]; - const int y_offset = (const int)pGeo[19]; + const double x_offset = (const double)pGeo[18]; + const double y_offset = (const double)pGeo[19]; linspace(-tubeAngle/2, tubeAngle/2, nProj, pTubeAngle); linspace(-detAngle/2, detAngle/2, nProj, pDetAngle); // printf("Nx:%d Ny:%d Nz:%d \nNu:%d Nv:%d \nDx:%.2f Dy:%.2f Dz:%.2f \nDu:%.2f Dv:%.2f \nDSD:%.2f DDR:%.2f \nTube angle:%.2f \nDet angle:%.2f", nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, tubeAngle, detAngle); - backprojectionDDb(pVolume, pProj, pTubeAngle, pDetAngle, x_offset, y_offset, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, DAG); + backprojectionDDb(pVolume, pProj, pTubeAngle, pDetAngle, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, idXProj, x_offset, y_offset, dx, dy, dz, du, dv, DSD, DDR, DAG); free(pTubeAngle); free(pDetAngle); @@ -106,14 +108,15 @@ void backprojectionDDb(double* const pVolume, double* const pProj, double* const pTubeAngle, double* const pDetAngle, - const int x_offset, - const int y_offset, - const int nProj, - const int nPixX, - const int nPixY, - const int nSlices, - const int nDetX, - const int nDetY, + const unsigned int nProj, + const unsigned int nPixX, + const unsigned int nPixY, + const unsigned int nSlices, + const unsigned int nDetX, + const unsigned int nDetY, + const signed int idXProj, + const double x_offset, + const double y_offset, const double dx, const double dy, const double dz, @@ -250,8 +253,22 @@ void backprojectionDDb(double* const pVolume, double* pDetmX_tmp = pDetmX + (nDetYMap * (nDetXMap - 2)); + // Test if we will loop over all projs or not + unsigned int projIni, projEnd, nProj2Run; + if(idXProj == -1){ + projIni = 0; + projEnd = nProj; + nProj2Run = nProj; + } + else{ + nProj2Run = 1; + projIni = (unsigned int) idXProj; + projEnd = (unsigned int) idXProj + 1; + } + + // For each projection - for (int p = 0; p < nProj; p++) { + for (unsigned int p = projIni; p < projEnd; p++) { // Get specif tube angle for the projection double theta = pTubeAngle[p] * M_PI / 180.0; @@ -309,7 +326,7 @@ void backprojectionDDb(double* const pVolume, for (int nz = 0; nz < nSlices; nz++) for (int x = 0; x < nPixX; x++) for (int y = 0; y < nPixY; y++) - pVolume[(nz*nPixX*nPixY) + (x*nPixY) + y] = pVolumet[(nz*nPixX*nPixY) + (x*nPixY) + y] / (double) nProj; + pVolume[(nz*nPixX*nPixY) + (x*nPixY) + y] = pVolumet[(nz*nPixX*nPixY) + (x*nPixY) + y] / (double) nProj2Run; free(pProjt); diff --git a/pydbt/sources/backprojectionDDb/backprojectionDDb.hpp b/pydbt/sources/backprojectionDDb/backprojectionDDb.hpp index 8539a9a..c3d477c 100644 --- a/pydbt/sources/backprojectionDDb/backprojectionDDb.hpp +++ b/pydbt/sources/backprojectionDDb/backprojectionDDb.hpp @@ -47,12 +47,15 @@ void backprojectionDDb(double* const pVolume, double* const pProj, double* const pTubeAngle, double* const pDetAngle, - const int nProj, - const int nPixX, - const int nPixY, - const int nSlices, - const int nDetX, - const int nDetY, + const unsigned int nProj, + const unsigned int nPixX, + const unsigned int nPixY, + const unsigned int nSlices, + const unsigned int nDetX, + const unsigned int nDetY, + const signed int idXProj, + const double x_offset, + const double y_offset, const double dx, const double dy, const double dz, diff --git a/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.cpp b/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.cpp new file mode 100644 index 0000000..0f1b693 --- /dev/null +++ b/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.cpp @@ -0,0 +1,132 @@ +/* +%% Author: Rodrigo de Barros Vimieiro +% Date: Feb, 2022 +% rodrigo.vimieiro@gmail.com +% ========================================================================= +%{ +% ------------------------------------------------------------------------- +% backprojectionDDb_cuda_lib(pProj, pVolume, pGeo, idXProj) +% ------------------------------------------------------------------------- +% DESCRIPTION: +% This function reconstruct the 3D volume from projections, based on +% the Distance-Driven principle. It works by calculating the overlap +% in X and Y axis of the volume and the detector boundaries. +% The geometry is for DBT with half cone-beam. All parameters are set +% in "ParameterSettings" code. +% +% INPUT: +% +% - pProj = Pointer to 2D projections for each angle +% - pGeo = Pointer to parameters of all pGeometry +% - idXProj = projection number to be projected +% +% OUTPUT: +% +% - pVolume = Pointer to reconstructed volume. +% +% Reference: +% - Branchless Distance Driven Projection and Backprojection, +% Samit Basu and Bruno De Man (2006) +% - GPU Acceleration of Branchless Distance Driven Projection and +% Backprojection, Liu et al (2016) +% - GPU-Based Branchless Distance-Driven Projection and Backprojection, +% Liu et al (2017) +% - A GPU Implementation of Distance-Driven Computed Tomography, +% Ryan D. Wagner (2017) +% --------------------------------------------------------------------- +% Copyright (C) <2022> +% +% This program is free software: you can redistribute it and/or modify +% it under the terms of the GNU General Public License as published by +% the Free Software Foundation, either version 3 of the License, or +% (at your option) any later version. +% +% This program is distributed in the hope that it will be useful, +% but WITHOUT ANY WARRANTY; without even the implied warranty of +% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +% GNU General Public License for more details. +% +% You should have received a copy of the GNU General Public License +% along with this program. If not, see . +%} +% ========================================================================= +%% 3-D Distance Driven Back-projection Code +*/ + +#include "backprojectionDDb_cuda.hpp" + +extern "C" void backprojectionDDb_cuda_lib(double* const pProj, + double* const pVolume, + float* const pGeo, + const signed int idXProj){ + + + const int unsigned nPixX = (const int)pGeo[0]; + const int unsigned nPixY = (const int)pGeo[1]; + const int unsigned nSlices = (const int)pGeo[2]; + const int unsigned nDetX = (const int)pGeo[3]; + const int unsigned nDetY = (const int)pGeo[4]; + + const double dx = (const double)pGeo[5]; + const double dy = (const double)pGeo[6]; + const double dz = (const double)pGeo[7]; + const double du = (const double)pGeo[8]; + const double dv = (const double)pGeo[9]; + + const double DSD = (const double)pGeo[10]; + const double DDR = (const double)pGeo[12]; + const double DAG = (const double)pGeo[14]; + + const int unsigned nProj = (const int)pGeo[15]; + + const double tubeAngle = (const double)pGeo[16]; + const double detAngle = (const double)pGeo[17]; + + double* const pTubeAngle = (double*)malloc(nProj * sizeof(double)); + double* const pDetAngle = (double*)malloc(nProj * sizeof(double)); + + const double x_offset = (const double)pGeo[18]; + const double y_offset = (const double)pGeo[19]; + + linspace(-tubeAngle/2, tubeAngle/2, nProj, pTubeAngle); + linspace(-detAngle/2, detAngle/2, nProj, pDetAngle); + + // printf("Nx:%d Ny:%d Nz:%d \nNu:%d Nv:%d \nDx:%.2f Dy:%.2f Dz:%.2f \nDu:%.2f Dv:%.2f \nDSD:%.2f DDR:%.2f \nTube angle:%.2f \nDet angle:%.2f", nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, tubeAngle, detAngle); + + backprojectionDDb(pVolume, pProj, pTubeAngle, pDetAngle, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, idXProj, x_offset, y_offset, dx, dy, dz, du, dv, DSD, DDR, DAG); + + free(pTubeAngle); + free(pDetAngle); + + return; + +} + +// Linear spaced vector +// Ref: https://stackoverflow.com/a/27030598/8682939 +void linspace(double start, + double end, + int num, + double* pLinspaced){ + + double delta; + + if(num == 0) + delta = 0; + else{ + if(num == 1) + delta = 1; + else + delta = (end - start) / (num - 1); + } + + if((abs(start) < 0.00001) && (abs(end) < 0.00001)) + delta = 0; + + + for (int k = 0; k < num; k++){ + pLinspaced[k] = start + k * delta; + } + + return; +} \ No newline at end of file diff --git a/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.hpp b/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.hpp new file mode 100644 index 0000000..1d0dc26 --- /dev/null +++ b/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.hpp @@ -0,0 +1,85 @@ +/* +%% Author: Rodrigo de Barros Vimieiro +% Date: Feb, 2022 +% rodrigo.vimieiro@gmail.com +% ========================================================================= +%{ +% ------------------------------------------------------------------------- +% +% ------------------------------------------------------------------------- +% DESCRIPTION: +% This is the header function +% --------------------------------------------------------------------- +% Copyright (C) <2022> +% +% This program is free software: you can redistribute it and/or modify +% it under the terms of the GNU General Public License as published by +% the Free Software Foundation, either version 3 of the License, or +% (at your option) any later version. +% +% This program is distributed in the hope that it will be useful, +% but WITHOUT ANY WARRANTY; without even the implied warranty of +% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +% GNU General Public License for more details. +% +% You should have received a copy of the GNU General Public License +% along with this program. If not, see . +%} +% ========================================================================= +%% 3-D Distance Driven Back-projection header +*/ + +#include +#define _USE_MATH_DEFINES +#include + +// Includes CUDA +#include + +// Utilities and timing functions +#include // includes cuda.h and cuda_runtime_api.h + +// CUDA helper functions +#include // helper functions for CUDA error check + +#define integrateXcoord 1 +#define integrateYcoord 0 + + +/* +Reference: TIGRE - https://github.com/CERN/TIGRE +*/ +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("%s \n",msg);\ + } \ +} while (0) + +void linspace(double start, + double end, + int num, + double* pLinspaced); + +void backprojectionDDb(double* const h_pVolume, + double* const h_pProj, + double* const h_pTubeAngle, + double* const h_pDetAngle, + const unsigned int nProj, + const unsigned int nPixX, + const unsigned int nPixY, + const unsigned int nSlices, + const unsigned int nDetX, + const unsigned int nDetY, + const signed int idXProj, + const double x_offset, + const double y_offset, + const double dx, + const double dy, + const double dz, + const double du, + const double dv, + const double DSD, + const double DDR, + const double DAG); diff --git a/pydbt/sources/backprojectionDDb_cuda/kernel.cu b/pydbt/sources/backprojectionDDb_cuda/kernel.cu new file mode 100644 index 0000000..d54f463 --- /dev/null +++ b/pydbt/sources/backprojectionDDb_cuda/kernel.cu @@ -0,0 +1,751 @@ +/* +%% Author: Rodrigo de Barros Vimieiro +% Date: Feb, 2022 +% rodrigo.vimieiro@gmail.com +% ========================================================================= +%{ +% ------------------------------------------------------------------------- +% backprojectionDDb_cuda_lib(pProj, pVolume, pGeo, idXProj) +% ------------------------------------------------------------------------- +% DESCRIPTION: +% This function reconstruct the 3D volume from projections, based on +% the Distance-Driven principle. It works by calculating the overlap +% in X and Y axis of the volume and the detector boundaries. +% The geometry is for DBT with half cone-beam. All parameters are set +% in "ParameterSettings" code. +% +% INPUT: +% +% - pProj = Pointer to 2D projections for each angle +% - pGeo = Pointer to parameters of all pGeometry +% - idXProj = projection number to be projected +% +% OUTPUT: +% +% - pVolume = Pointer to reconstructed volume. +% +% Reference: +% - Branchless Distance Driven Projection and Backprojection, +% Samit Basu and Bruno De Man (2006) +% - GPU Acceleration of Branchless Distance Driven Projection and +% Backprojection, Liu et al (2016) +% - GPU-Based Branchless Distance-Driven Projection and Backprojection, +% Liu et al (2017) +% - A GPU Implementation of Distance-Driven Computed Tomography, +% Ryan D. Wagner (2017) +% --------------------------------------------------------------------- +% Copyright (C) <2022> +% +% This program is free software: you can redistribute it and/or modify +% it under the terms of the GNU General Public License as published by +% the Free Software Foundation, either version 3 of the License, or +% (at your option) any later version. +% +% This program is distributed in the hope that it will be useful, +% but WITHOUT ANY WARRANTY; without even the implied warranty of +% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +% GNU General Public License for more details. +% +% You should have received a copy of the GNU General Public License +% along with this program. If not, see . +%} +% ========================================================================= +%% 3-D Distance Driven Back-projection Code +*/ + +#include "backprojectionDDb_cuda.hpp" + + +/**************************************************************************** +* CUDA Kernels * +****************************************************************************/ +__global__ void pad_projections_kernel(double* d_img, const int nDetXMap, const int nDetYMap, const int nElem, unsigned int np) { + + const int threadGId = blockIdx.x * blockDim.x + threadIdx.x; + + // Make sure we don't try and access memory outside + // by having any threads mapped there return early + if (threadGId >= nElem) + return; + + d_img[(np*nDetYMap *nDetXMap) + (threadGId*nDetYMap)] = 0; + + return; +} + +__global__ void map_boudaries_kernel(double* d_pBound, const int nElem, const double valueLeftBound, const double sizeElem, const double offset) { + + const int threadGId = blockIdx.x * blockDim.x + threadIdx.x; + + // Make sure we don't try and access memory outside + // by having any threads mapped there return early + if (threadGId >= nElem) + return; + + d_pBound[threadGId] = (threadGId - valueLeftBound) * sizeElem + offset; + + return; +} + +__global__ void rot_detector_kernel(double* d_pRdetY, double* d_pRdetZ, double* d_pYcoord, double* d_pZcoord, const double yOffset, const double zOffset, + const double phi, const int nElem) { + + const int threadGId = blockIdx.x * blockDim.x + threadIdx.x; + + // Make sure we don't try and access memory outside + // by having any threads mapped there return early + if (threadGId >= nElem) + return; + + // cos and sin are in measured in radians. + + d_pRdetY[threadGId] = ((d_pYcoord[threadGId] - yOffset)* cos(phi) - (d_pZcoord[threadGId] - zOffset)* sin(phi)) + yOffset; + d_pRdetZ[threadGId] = ((d_pYcoord[threadGId] - yOffset)* sin(phi) + (d_pZcoord[threadGId] - zOffset)* cos(phi)) + zOffset; + + return; + +} + +__global__ void mapDet2Slice_kernel(double* const pXmapp, double* const pYmapp, double tubeX, double tubeY, double tubeZ, double* const pXcoord, + double* const pYcoord, double* const pZcoord, double* const pZSlicecoord, const int nDetXMap, const int nDetYMap, const unsigned int nz) { + + /* + + Note: Matlab has linear indexing as a column of elements, i.e, the elements are actually stored in memory as queued columns. + So threads in X are launched in the column direction (DBT Y coord), and threads in Y are launched in the row direction (DBT X coord). + + */ + + const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y); + + const int thread_1D_pos = thread_2D_pos.y * nDetYMap + thread_2D_pos.x; + + // Make sure we don't try and access memory outside the detector + // by having any threads mapped there return early + if (thread_2D_pos.x >= nDetYMap || thread_2D_pos.y >= nDetXMap) + return; + + pXmapp[thread_1D_pos] = ((pXcoord[thread_2D_pos.y] - tubeX)*(pZSlicecoord[nz] - pZcoord[thread_2D_pos.x]) - (pXcoord[thread_2D_pos.y] * tubeZ) + (pXcoord[thread_2D_pos.y] * pZcoord[thread_2D_pos.x])) / (-tubeZ + pZcoord[thread_2D_pos.x]); + + if (thread_2D_pos.y == 0) + pYmapp[thread_2D_pos.x] = ((pYcoord[thread_2D_pos.x] - tubeY)*(pZSlicecoord[nz] - pZcoord[thread_2D_pos.x]) - (pYcoord[thread_2D_pos.x] * tubeZ) + (pYcoord[thread_2D_pos.x] * pZcoord[thread_2D_pos.x])) / (-tubeZ + pZcoord[thread_2D_pos.x]); + + return; +} + +__global__ void img_integration_kernel(double* d_img, const int nPixX, const int nPixY, bool direction, unsigned int offsetX, unsigned int offsetY, unsigned int nSlices) { + + /* + + Integration of 2D slices over the whole volume + + (S.1.Integration. - Liu et al(2017)) + + ** Perfom an inclusive scan ** + + */ + + const int3 memory_2D_pos = make_int3(blockIdx.x * blockDim.x + threadIdx.x + offsetX, + blockIdx.y * blockDim.y + threadIdx.y + offsetY, + blockIdx.z * blockDim.z + threadIdx.z); + + const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y); + + + // Make sure we don't try and access memory outside the detector + // by having any threads mapped there return early + if (memory_2D_pos.x >= nPixY || memory_2D_pos.y >= nPixX || memory_2D_pos.z >= nSlices) + return; + + + if (direction == integrateXcoord) { + + for (int s = 1; s <= blockDim.y; s *= 2) { + + int spot = thread_2D_pos.y - s; + + double val = 0; + + if (spot >= 0) { + val = d_img[(memory_2D_pos.z*nPixY*nPixX) + (offsetY + spot) * nPixY + memory_2D_pos.x]; + } + __syncthreads(); + + if (spot >= 0) { + d_img[(memory_2D_pos.z*nPixY*nPixX) + (memory_2D_pos.y * nPixY) + memory_2D_pos.x] += val; + } + __syncthreads(); + } + } + else + { + + for (int s = 1; s <= blockDim.x; s *= 2) { + + int spot = thread_2D_pos.x - s; + + double val = 0; + + if (spot >= 0) { + val = d_img[(memory_2D_pos.z*nPixY*nPixX) + memory_2D_pos.y * nPixY + spot + offsetX]; + } + __syncthreads(); + + if (spot >= 0) { + d_img[(memory_2D_pos.z*nPixY*nPixX) + (memory_2D_pos.y * nPixY) + memory_2D_pos.x] += val; + } + __syncthreads(); + } + + } + return; +} + +__global__ void bilinear_interpolation_kernel_GPU(double* d_sliceI, double* d_pProj, double* d_pObjX, double* d_pObjY, double* d_pDetmX, double* d_pDetmY, const int nPixXMap, const int nPixYMap, + const int nDetXMap, const int nDetYMap, const int nDetX, const int nDetY, const unsigned int np) { + + const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y); + + // Make sure we don't try and access memory outside the detector + // by having any threads mapped there return early + if (thread_2D_pos.x >= nPixYMap || thread_2D_pos.y >= nPixXMap) + return; + + /* + + S.2. Interpolation - Liu et al (2017) + + Reference: + - https://en.wikipedia.org/wiki/Bilinear_interpolation + - https://stackoverflow.com/questions/21128731/bilinear-interpolation-in-c-c-and-cuda + + Note: *** We are using the Unit Square equation *** + + alpha = X - X1 + 1-alpha = X2 - X + + beta = Y - Y1 + 1-beta = Y2 - Y + + ----> Xcoord (thread_2D_pos.y) + | 0___________ + v |_d00_|_d10_| + Ycoord (thread_2D_pos.x) |_d01_|_d11_| + + Matlab code -- + objX = nDetX - (objX ./ detmX(1, end - 1)); + objY = (objY ./ detmX(1, end - 1)) - (detmY(1) / detmX(1, end - 1)); + + */ + + // Adjust the mapped coordinates to cross the range of (0-nDetX).*duMap + // Divide by pixelSize to get a unitary pixel size + const double xNormData = nDetX - d_pObjX[thread_2D_pos.y] / d_pDetmX[0]; + const signed int xData = floor(xNormData); + const double alpha = xNormData - xData; + + // Adjust the mapped coordinates to cross the range of (0-nDetY).*dyMap + // Divide by pixelSize to get a unitary pixel size + const double yNormData = (d_pObjY[thread_2D_pos.x] / d_pDetmX[0]) - (d_pDetmY[0] / d_pDetmX[0]); + const signed int yData = floor(yNormData); + const double beta = yNormData - yData; + + double d00, d01, d10, d11; + if (((xNormData) >= 0) && ((xNormData) <= nDetX) && ((yNormData) >= 0) && ((yNormData) <= nDetY)) d00 = d_pProj[(np*nDetYMap*nDetXMap) + (xData*nDetYMap + yData)]; else d00 = 0.0; + if (((xData + 1) > 0) && ((xData + 1) <= nDetX) && ((yNormData) >= 0) && ((yNormData) <= nDetY)) d10 = d_pProj[(np*nDetYMap*nDetXMap) + ((xData + 1)*nDetYMap + yData)]; else d10 = 0.0; + if (((xNormData) >= 0) && ((xNormData) <= nDetX) && ((yData + 1) > 0) && ((yData + 1) <= nDetY)) d01 = d_pProj[(np*nDetYMap*nDetXMap) + (xData*nDetYMap + yData + 1)]; else d01 = 0.0; + if (((xData + 1) > 0) && ((xData + 1) <= nDetX) && ((yData + 1) > 0) && ((yData + 1) <= nDetY)) d11 = d_pProj[(np*nDetYMap*nDetXMap) + ((xData + 1)*nDetYMap + yData + 1)]; else d11 = 0.0; + + double result_temp1 = alpha * d10 + (-d00 * alpha + d00); + double result_temp2 = alpha * d11 + (-d01 * alpha + d01); + + d_sliceI[thread_2D_pos.y * nPixYMap + thread_2D_pos.x] = beta * result_temp2 + (-result_temp1 * beta + result_temp1); + +} + +__global__ void differentiation_kernel(double* d_pVolume, double* d_sliceI, double tubeX, double rtubeY, double rtubeZ, + double* const d_pObjX, double* const d_pObjY, double* const d_pObjZ, const int nPixX, const int nPixY, const int nPixXMap, + const int nPixYMap, const double du, const double dv, const double dx, const double dy, const double dz, const unsigned int nz) { + + const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y); + + const int thread_1D_pos = (nPixX*nPixY*nz) + (thread_2D_pos.y * nPixY) + thread_2D_pos.x; + + // Make sure we don't try and access memory outside the detector + // by having any threads mapped there return early + if (thread_2D_pos.x >= nPixY || thread_2D_pos.y >= nPixX) + return; + + /* + + S.3. Differentiation - Eq. 24 - Liu et al (2017) + + Detector integral projection + ___________ + |_A_|_B_|___| + |_C_|_D_|___| + |___|___|___| + + + (thread_2D_pos.x,thread_2D_pos.y) + ________________ + |_A_|__B__|_____| + |_C_|(0,0)|(0,1)| + |___|(1,0)|(1,1)| + + Threads are lauched from D up to nPixX (thread_2D_pos.y) and nPixY (thread_2D_pos.x) + i.e., they are running on the detector image. Thread (0,0) is on D. + + Coordinates on intergal projection: + + A = thread_2D_pos.y * nPixYMap + thread_2D_pos.x + B = ((thread_2D_pos.y+1) * nPixYMap) + thread_2D_pos.x + C = thread_2D_pos.y * nPixYMap + thread_2D_pos.x + 1 + D = ((thread_2D_pos.y+1) * nPixYMap) + thread_2D_pos.x + 1 + + */ + + unsigned int coordA = thread_2D_pos.y * nPixYMap + thread_2D_pos.x; + unsigned int coordB = ((thread_2D_pos.y + 1) * nPixYMap) + thread_2D_pos.x; + unsigned int coordC = coordA + 1; + unsigned int coordD = coordB + 1; + + // x - ray angle in X coord + double gamma = atan((d_pObjX[thread_2D_pos.y] + (dx / 2.0) - tubeX) / (rtubeZ - d_pObjZ[nz])); + + // x - ray angle in Y coord + double alpha = atan((d_pObjY[thread_2D_pos.x] + (dy / 2.0) - rtubeY) / (rtubeZ - d_pObjZ[nz])); + + + double dA, dB, dC, dD; + + dA = d_sliceI[coordA]; + dB = d_sliceI[coordB]; + dC = d_sliceI[coordC]; + dD = d_sliceI[coordD]; + + // Treat border of interpolated integral detector + if (dC == 0 && dD == 0) { + dC = dA; + dD = dB; + } + + + // S.3.Differentiation - Eq. 24 - Liu et al(2017) + d_pVolume[thread_1D_pos] += ((dD - dC - dB + dA)*(du*dv*dz / (cos(alpha)*cos(gamma)*dx*dy))); + + return; +} + +__global__ void division_kernel(double* d_img, const int nPixX, const int nPixY, unsigned int nSlices, unsigned int nProj){ + + const int3 thread_2D_pos = make_int3(blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y, + blockIdx.z * blockDim.z + threadIdx.z); + + const int thread_1D_pos = (nPixX*nPixY*thread_2D_pos.z) + (thread_2D_pos.y * nPixY) + thread_2D_pos.x; + + // Make sure we don't try and access memory outside the detector + // by having any threads mapped there return early + if (thread_2D_pos.x >= nPixY || thread_2D_pos.y >= nPixX || thread_2D_pos.z >= nSlices) + return; + + d_img[thread_1D_pos] /= (double) nProj; + +} + + +/**************************************************************************** +* function: backprojectionDDb() - CUDA backprojection Branchless Distance Driven. * +****************************************************************************/ + +void backprojectionDDb(double* const h_pVolume, + double* const h_pProj, + double* const h_pTubeAngle, + double* const h_pDetAngle, + const unsigned int nProj, + const unsigned int nPixX, + const unsigned int nPixY, + const unsigned int nSlices, + const unsigned int nDetX, + const unsigned int nDetY, + const signed int idXProj, + const double x_offset, + const double y_offset, + const double dx, + const double dy, + const double dz, + const double du, + const double dv, + const double DSD, + const double DDR, + const double DAG) +{ + + // Unique GPU + int devID = 0; + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, devID); + cudaCheckErrors("cudaGetDeviceProperties"); + + const unsigned int maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + + + printf("GPU Device %d: \"%s\" with compute capability %d.%d has %d Multi-Processors and %zu bytes of global memory\n\n", devID, + deviceProp.name, deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount, deviceProp.totalGlobalMem); + + + // Create timmer variables and start it + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + + //cudaStream_t stream1; + //cudaStreamCreate(&stream1); + + dim3 threadsPerBlock(1, 1, 1); + dim3 blockSize(1, 1, 1); + + + // Number of mapped detector and pixels + const int nDetXMap = nDetX + 1; + const int nDetYMap = nDetY + 1; + const int nPixXMap = nPixX + 1; + const int nPixYMap = nPixY + 1; + + + // Convention: h_ variables live on host + // Convention: d_ variables live on device (GPU global mem) + double* d_pProj; + double* d_sliceI; + double* d_pVolume; + double* d_pTubeAngle; + double* d_pDetAngle; + + + // Allocate global memory on the device, place result in "d_----" + cudaMalloc((void **)&d_pProj, nDetXMap*nDetYMap*nProj * sizeof(double)); + cudaMalloc((void **)&d_sliceI, nPixXMap*nPixYMap * sizeof(double)); + cudaMalloc((void **)&d_pVolume, nPixX*nPixY*nSlices * sizeof(double)); + cudaMalloc((void **)&d_pTubeAngle, nProj * sizeof(double)); + cudaMalloc((void **)&d_pDetAngle, nProj * sizeof(double)); + + cudaCheckErrors("cudaMalloc Initial"); + + // Copy data from host memory "h_----" to device memory "d_----" + cudaMemcpy((void *)d_pTubeAngle, (void *)h_pTubeAngle, nProj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy((void *)d_pDetAngle, (void *)h_pDetAngle, nProj * sizeof(double), cudaMemcpyHostToDevice); + + cudaCheckErrors("cudaMemcpy Angles"); + + + /* + Copy projection data from host memory "h_----" to device memory "d_----" padding with zeros for image integation + */ + + + // Initialize first column and row with zeros + double* h_pProj_tmp; + double* d_pProj_tmp; + + threadsPerBlock.x = maxThreadsPerBlock; + blockSize.x = (nDetXMap / maxThreadsPerBlock) + 1; + + for (unsigned int np = 0; np < nProj; np++) { + + // Pad on X coord direction + pad_projections_kernel << > > (d_pProj, nDetXMap, nDetYMap, nDetXMap, np); + + // Pad on Y coord direction + d_pProj_tmp = d_pProj + (nDetXMap*nDetYMap*np) + 1; + cudaMemset(d_pProj_tmp, 0, nPixY * sizeof(double)); + cudaCheckErrors("cudaMemset Padding Projections"); + } + + + // Copy projections data from host memory + for (unsigned int np = 0; np < nProj; np++) + for (unsigned int c = 0; c < nDetX; c++) { + + h_pProj_tmp = h_pProj + (c *nDetY) + (nDetX*nDetY*np); + d_pProj_tmp = d_pProj + (((c + 1) *nDetYMap) + 1) + (nDetXMap*nDetYMap*np); + + cudaMemcpy((void *)d_pProj_tmp, (void *)h_pProj_tmp, nDetY * sizeof(double), cudaMemcpyHostToDevice); + cudaCheckErrors("cudaMemcpy Projections"); + } + + + + // Pointer for projections coordinates + double* d_pDetX; + double* d_pDetY; + double* d_pDetZ; + double* d_pObjX; + double* d_pObjY; + double* d_pObjZ; + + // Allocate global memory on the device for projections coordinates + cudaMalloc((void **)&d_pDetX, nDetXMap * sizeof(double)); + cudaMalloc((void **)&d_pDetY, nDetYMap * sizeof(double)); + cudaMalloc((void **)&d_pDetZ, nDetYMap * sizeof(double)); + cudaMalloc((void **)&d_pObjX, nPixXMap * sizeof(double)); + cudaMalloc((void **)&d_pObjY, nPixYMap * sizeof(double)); + cudaMalloc((void **)&d_pObjZ, nSlices * sizeof(double)); + + cudaCheckErrors("cudaMalloc Coordinates"); + + + + // Pointer for mapped coordinates + double* d_pDetmY; + double* d_pDetmX; + + + // Allocate global memory on the device for mapped coordinates + cudaMalloc((void **)&d_pDetmY, nDetYMap * sizeof(double)); + cudaMalloc((void **)&d_pDetmX, nDetYMap * nDetXMap * sizeof(double)); + + cudaCheckErrors("cudaMalloc Map-Coordinates"); + + + // Pointer for rotated detector coords + double* d_pRdetY; + double* d_pRdetZ; + + // Allocate global memory on the device for for rotated detector coords + cudaMalloc((void **)&d_pRdetY, nDetYMap * sizeof(double)); + cudaMalloc((void **)&d_pRdetZ, nDetYMap * sizeof(double)); + + cudaCheckErrors("cudaMalloc Rot-Coordinates"); + + // Generate detector and object boudaries + + threadsPerBlock.x = maxThreadsPerBlock; + + blockSize.x = (nDetX / maxThreadsPerBlock) + 1; + + map_boudaries_kernel << > > (d_pDetX, nDetXMap, (double)nDetX, -du, 0.0); + + blockSize.x = (nDetY / maxThreadsPerBlock) + 1; + + map_boudaries_kernel << > > (d_pDetY, nDetYMap, nDetY / 2.0, dv, 0.0); + + blockSize.x = (nPixX / maxThreadsPerBlock) + 1; + + map_boudaries_kernel << > > (d_pObjX, nPixXMap, (double)nPixX, -dx, 0.0); + + blockSize.x = (nPixY / maxThreadsPerBlock) + 1; + + map_boudaries_kernel << > > (d_pObjY, nPixYMap, nPixY / 2.0, dy, 0.0); + + blockSize.x = (nSlices / maxThreadsPerBlock) + 1; + + map_boudaries_kernel << > > (d_pObjZ, nSlices, 0.0, dz, DAG + (dz / 2.0)); + + + //printf("Map Boundaries -- Threads:%d Blocks:%d \n", threads, blocks); + + + // Initiate variables value with 0 + cudaMemset(d_pDetZ, 0, nDetYMap * sizeof(double)); + cudaMemset(d_pVolume, 0, nPixX * nPixY * nSlices * sizeof(double)); + + cudaCheckErrors("cudaMemset Zeros"); + + //cudaDeviceSynchronize(); + + // X - ray tube initial position + double tubeX = 0; + double tubeY = 0; + double tubeZ = DSD; + + // Iso - center position + double isoY = 0; + double isoZ = DDR; + + + // Integration of 2D projection over the whole projections + // (S.1.Integration. - Liu et al(2017)) + + // Naive integration o the X coord + threadsPerBlock.x = 10; + threadsPerBlock.y = 10; + threadsPerBlock.z = 10; + + blockSize.x = (unsigned int)ceil(((double)nDetYMap / (threadsPerBlock.x - 1))); + blockSize.y = 1; + blockSize.z = (unsigned int)ceil((double)nProj / threadsPerBlock.z); + + for (int k = 0; k < ceil((double)nDetXMap / (threadsPerBlock.x - 1)); k++) { + + img_integration_kernel << > > (d_pProj, nDetXMap, nDetYMap, integrateXcoord, 0, k * 9, nProj); + + //printf("integration kernel -- threadsX:%d blocksX:%d threadsY:%d blocksY:%d \n", threadsPerBlock.x, blockSize.x, threadsPerBlock.y, blockSize.y); + + } + + // Naive integration o the Y coord + threadsPerBlock.x = 10; + threadsPerBlock.y = 10; + threadsPerBlock.z = 10; + + blockSize.x = 1; + blockSize.y = (unsigned int)ceil((double)nDetXMap / (threadsPerBlock.y - 1)); + blockSize.z = (unsigned int)ceil((double)nProj / threadsPerBlock.z); + + + for (int k = 0; k < ceil((double)nDetYMap / (threadsPerBlock.y - 1)); k++) { + + img_integration_kernel << > > (d_pProj, nDetXMap, nDetYMap, integrateYcoord, k * 9, 0, nProj); + + //printf("integration kernel -- threadsX:%d blocksX:%d threadsY:%d blocksY:%d \n", threadsPerBlock.x, blockSize.x, threadsPerBlock.y, blockSize.y); + + } + + double* d_pDetmX_tmp = d_pDetmX + (nDetYMap * (nDetXMap-2)); + + // Test if we will loop over all projs or not + unsigned int projIni, projEnd, nProj2Run; + if(idXProj == -1){ + projIni = 0; + projEnd = nProj; + nProj2Run = nProj; + } + else{ + nProj2Run = 1; + projIni = (unsigned int) idXProj; + projEnd = (unsigned int) idXProj + 1; + } + + + // For each projection + for (unsigned int p = projIni; p < projEnd; p++) { + + // Get specif tube angle for the projection + double theta = h_pTubeAngle[p] * M_PI / 180.0; + + // Get specif detector angle for the projection + double phi = h_pDetAngle[p] * M_PI / 180.0; + + //printf("Tube angle:%f Det angle:%f\n", theta, phi); + + // Tube rotation + double rtubeY = ((tubeY - isoY)*cos(theta) - (tubeZ - isoZ)*sin(theta)) + isoY; + double rtubeZ = ((tubeY - isoY)*sin(theta) + (tubeZ - isoZ)*cos(theta)) + isoZ; + + //printf("R tube Y:%f R tube Z:%f\n", rtubeY, rtubeZ); + + // Detector rotation + threadsPerBlock.x = maxThreadsPerBlock; + threadsPerBlock.y = 1; + threadsPerBlock.z = 1; + + blockSize.x = (nDetYMap / maxThreadsPerBlock) + 1; + blockSize.y = 1; + blockSize.z = 1; + + rot_detector_kernel << > > (d_pRdetY, d_pRdetZ, d_pDetY, d_pDetZ, isoY, isoZ, phi, nDetYMap); + + //cudaDeviceSynchronize(); + + //printf("Detector rotation -- Threads:%d Blocks:%d \n", threads, blocks); + + + // For each slice + for (unsigned int nz = 0; nz < nSlices; nz++) { + + /* + + Map detector onto XY plane(Inside proj loop in case detector rotates) + + *** Note: Matlab has linear indexing as a column of elements, i.e, the elements are actually stored in memory as queued columns. + So threads in X are launched in the column direction (DBT Y coord), and threads in Y are launched in the row direction (DBT X coord). + + */ + + threadsPerBlock.x = 32; + threadsPerBlock.y = 32; + threadsPerBlock.z = 1; + + blockSize.x = (nDetYMap / threadsPerBlock.x) + 1; + blockSize.y = (nDetXMap / threadsPerBlock.y) + 1; + blockSize.z = 1; + + + mapDet2Slice_kernel << > > (d_pDetmX, d_pDetmY, tubeX, rtubeY, rtubeZ, d_pDetX, d_pRdetY, d_pRdetZ, d_pObjZ, nDetXMap, nDetYMap, nz); + //cudaDeviceSynchronize(); + + //printf("Map detector onto XY plane -- ThreadsX:%d ThreadsY:%d BlocksX:%d BlocksY:%d\n", threadsPerBlock.x, threadsPerBlock.y, blockSizeX, blockSizeY); + + /* + S.2. Interpolation - Liu et al (2017) + */ + + blockSize.x = (nPixYMap / threadsPerBlock.x) + 1; + blockSize.y = (nPixXMap / threadsPerBlock.y) + 1; + + bilinear_interpolation_kernel_GPU << > > (d_sliceI, d_pProj, d_pObjX, d_pObjY, d_pDetmX_tmp, d_pDetmY, nPixXMap, nPixYMap, nDetXMap, nDetYMap, nDetX, nDetY, p); + + + /* + S.3. Differentiation - Eq. 24 - Liu et al (2017) + */ + + blockSize.x = (nPixY / threadsPerBlock.x) + 1; + blockSize.y = (nPixX / threadsPerBlock.y) + 1; + + differentiation_kernel << > > (d_pVolume, d_sliceI, tubeX, rtubeY, rtubeZ, d_pObjX, d_pObjY, d_pObjZ, nPixX, nPixY, nPixXMap, nPixYMap, du, dv, dx, dy, dz, nz); + + } // Loop end slices + + } // Loop end Projections + + + // Normalize volume dividing by the number of projs + threadsPerBlock.x = 10; + threadsPerBlock.y = 10; + threadsPerBlock.z = 10; + + blockSize.x = (nPixY / threadsPerBlock.x) + 1; + blockSize.y = (nPixX / threadsPerBlock.y) + 1; + blockSize.z = (nSlices / threadsPerBlock.z) + 1; + + division_kernel << > > (d_pVolume, nPixX, nPixY, nSlices, nProj2Run); + + // d_pVolume + cudaMemcpy((void *)h_pVolume, (void *)d_pVolume, nSlices* nPixX * nPixY * sizeof(double), cudaMemcpyDeviceToHost); + cudaCheckErrors("cudaMemcpy Final"); + + + cudaFree(d_pProj); + cudaFree(d_sliceI); + cudaFree(d_pVolume); + cudaFree(d_pTubeAngle); + cudaFree(d_pDetAngle); + cudaFree(d_pDetX); + cudaFree(d_pDetY); + cudaFree(d_pDetZ); + cudaFree(d_pObjX); + cudaFree(d_pObjY); + cudaFree(d_pObjZ); + cudaFree(d_pDetmY); + cudaFree(d_pDetmX); + cudaFree(d_pRdetY); + cudaFree(d_pRdetZ); + + cudaCheckErrors("cudaFree Final"); + + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + sdkDeleteTimer(&timer); + + cudaDeviceReset(); + + return; + +} \ No newline at end of file diff --git a/pydbt/sources/projectionDD/projectionDD.cpp b/pydbt/sources/projectionDD/projectionDD.cpp index e5d6c34..477aebc 100644 --- a/pydbt/sources/projectionDD/projectionDD.cpp +++ b/pydbt/sources/projectionDD/projectionDD.cpp @@ -5,7 +5,7 @@ % ========================================================================= %{ % ------------------------------------------------------------------------- -% projectionDD_lib(pVolume, pProj, ppGeo +% projectionDD_lib(pVolume, pProj, pGeo, idXProj) % ------------------------------------------------------------------------- % DESCRIPTION: % This function calculates the volume projection based on the @@ -16,8 +16,9 @@ % % INPUT: % -% - pVolume = pointer to 3D volume for projection -% - ppGeo = Parameter of all pGeometry +% - pVolume = 3D volume for projection +% - pGeo = Parameter of all geometry +% - idXProj = projection number to be projected % % OUTPUT: % @@ -54,7 +55,8 @@ extern "C" void projectionDD_lib(double* const pVolume, double* const pProj, - float* const pGeo){ + float* const pGeo, + const signed int idXProj){ const int unsigned nPixX = (const int)pGeo[0]; @@ -78,8 +80,8 @@ extern "C" void projectionDD_lib(double* const pVolume, const double tubeAngle = (const double)pGeo[16]; const double detAngle = (const double)pGeo[17]; - const int x_offset = (const int)pGeo[18]; - const int y_offset = (const int)pGeo[19]; + const double x_offset = (const double)pGeo[18]; + const double y_offset = (const double)pGeo[19]; double* const pTubeAngle = (double*)malloc(nProj * sizeof(double)); double* const pDetAngle = (double*)malloc(nProj * sizeof(double)); @@ -151,8 +153,20 @@ extern "C" void projectionDD_lib(double* const pVolume, for (int v = 0; v < nDetY; v++) pProjt[(p * nDetY * nDetX) + (u * nDetY) + v] = 0.0; + // Test if we will loop over all projs or not + unsigned int projIni, projEnd; + if(idXProj == -1){ + projIni = 0; + projEnd = nProj; + } + else{ + projIni = (unsigned int) idXProj; + projEnd = (unsigned int) idXProj + 1; + } + + // For each projection - for (int p = 0; p < nProj; p++) { + for (unsigned int p = projIni; p < projEnd; p++) { // Get specif tube angle for the projection double theta = pTubeAngle[p] * M_PI / 180.0; diff --git a/pydbt/sources/projectionDDb/projectionDDb.cpp b/pydbt/sources/projectionDDb/projectionDDb.cpp index d252791..abd75f3 100644 --- a/pydbt/sources/projectionDDb/projectionDDb.cpp +++ b/pydbt/sources/projectionDDb/projectionDDb.cpp @@ -5,7 +5,7 @@ % ========================================================================= %{ % ------------------------------------------------------------------------- -% projectionDDb_lib(pVolume, pProj, ppGeo) +% projectionDDb_lib(pVolume, pProj, pGeo, idXProj) % ------------------------------------------------------------------------- % DESCRIPTION: % This function calculates the volume projection based on the @@ -15,8 +15,9 @@ % % INPUT: % -% - pVolume = pointer to 3D volume for projection -% - ppGeo = Parameter of all pGeometry +% - pVolume = 3D volume for projection +% - pGeo = Parameter of all geometry +% - idXProj = projection number to be projected % % OUTPUT: % @@ -55,7 +56,8 @@ extern "C" void projectionDDb_lib(double* const pVolume, double* const pProj, - float* const pGeo){ + float* const pGeo, + const signed int idXProj){ const int unsigned nPixX = (const int)pGeo[0]; @@ -82,15 +84,15 @@ extern "C" void projectionDDb_lib(double* const pVolume, double* const pTubeAngle = (double*)malloc(nProj * sizeof(double)); double* const pDetAngle = (double*)malloc(nProj * sizeof(double)); - const int x_offset = (const int)pGeo[18]; - const int y_offset = (const int)pGeo[19]; + const double x_offset = (const double)pGeo[18]; + const double y_offset = (const double)pGeo[19]; linspace(-tubeAngle/2, tubeAngle/2, nProj, pTubeAngle); linspace(-detAngle/2, detAngle/2, nProj, pDetAngle); // printf("Nx:%d Ny:%d Nz:%d \nNu:%d Nv:%d \nDx:%.2f Dy:%.2f Dz:%.2f \nDu:%.2f Dv:%.2f \nDSD:%.2f DDR:%.2f \nTube angle:%.2f \nDet angle:%.2f", nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, tubeAngle, detAngle); - projectionDDb(pProj, pVolume, pTubeAngle, pDetAngle, x_offset, y_offset, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, DAG); + projectionDDb(pProj, pVolume, pTubeAngle, pDetAngle, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, idXProj, x_offset, y_offset, dx, dy, dz, du, dv, DSD, DDR, DAG); free(pTubeAngle); free(pDetAngle); @@ -105,14 +107,15 @@ void projectionDDb(double* const pProj, double* const pVolume, double* const pTubeAngle, double* const pDetAngle, - const int x_offset, - const int y_offset, - const int nProj, - const int nPixX, - const int nPixY, - const int nSlices, - const int nDetX, - const int nDetY, + const unsigned int nProj, + const unsigned int nPixX, + const unsigned int nPixY, + const unsigned int nSlices, + const unsigned int nDetX, + const unsigned int nDetY, + const signed int idXProj, + const double x_offset, + const double y_offset, const double dx, const double dy, const double dz, @@ -247,8 +250,20 @@ void projectionDDb(double* const pProj, pVolumet[(nz*nPixYMap *nPixXMap) + (x*nPixYMap) + y] += pVolumet[(nz*nPixYMap *nPixXMap) + ((x - 1)*nPixYMap) + y]; + // Test if we will loop over all projs or not + unsigned int projIni, projEnd; + if(idXProj == -1){ + projIni = 0; + projEnd = nProj; + } + else{ + projIni = (unsigned int) idXProj; + projEnd = (unsigned int) idXProj + 1; + } + + // For each projection - for (int p = 0; p < nProj; p++) { + for (unsigned int p = projIni; p < projEnd; p++) { // Get specif tube angle for the projection double theta = pTubeAngle[p] * M_PI / 180.0; diff --git a/pydbt/sources/projectionDDb/projectionDDb.hpp b/pydbt/sources/projectionDDb/projectionDDb.hpp index 07544a4..aad19c5 100644 --- a/pydbt/sources/projectionDDb/projectionDDb.hpp +++ b/pydbt/sources/projectionDDb/projectionDDb.hpp @@ -38,18 +38,19 @@ // Global variable int nThreads; -//extern "C" void projectionDDb_lib(double* const pVolume,double* const pProj,double* const geo); - void projectionDDb(double* const pProj, double* const pVolume, double* const pTubeAngle, double* const pDetAngle, - const int nProj, - const int nPixX, - const int nPixY, - const int nSlices, - const int nDetX, - const int nDetY, + const unsigned int nProj, + const unsigned int nPixX, + const unsigned int nPixY, + const unsigned int nSlices, + const unsigned int nDetX, + const unsigned int nDetY, + const signed int idXProj, + const double x_offset, + const double y_offset, const double dx, const double dy, const double dz, diff --git a/pydbt/sources/projectionDDb_cuda/kernel.cu b/pydbt/sources/projectionDDb_cuda/kernel.cu new file mode 100644 index 0000000..3056286 --- /dev/null +++ b/pydbt/sources/projectionDDb_cuda/kernel.cu @@ -0,0 +1,723 @@ +/* +%% Author: Rodrigo de Barros Vimieiro +% Date: Feb, 2022 +% rodrigo.vimieiro@gmail.com +% ========================================================================= +%{ +% ------------------------------------------------------------------------- +% projectionDDb_cuda_lib(pVolume, pProj, ppGeo) +% ------------------------------------------------------------------------- +% DESCRIPTION: +% This function calculates the volume projection based on the +% Branchless Distance-Driven principle. +% The geometry is for DBT with half cone-beam. All parameters are set +% in "ParameterSettings" code. +% +% INPUT: +% +% - data3d = 3D volume for projection +% - param = Parameter of all geometry +% - nProj = projection number to be projected +% +% OUTPUT: +% +% - proj = projections for each angle. +% +% Reference: +% - Branchless Distance Driven Projection and Backprojection, +% Samit Basu and Bruno De Man (2006) +% - GPU Acceleration of Branchless Distance Driven Projection and +% Backprojection, Liu et al (2016) +% - GPU-Based Branchless Distance-Driven Projection and Backprojection, +% Liu et al (2017) +% - A GPU Implementation of Distance-Driven Computed Tomography, +% Ryan D. Wagner (2017) +% --------------------------------------------------------------------- +% Copyright (C) <2019> +% +% This program is free software: you can redistribute it and/or modify +% it under the terms of the GNU General Public License as published by +% the Free Software Foundation, either version 3 of the License, or +% (at your option) any later version. +% +% This program is distributed in the hope that it will be useful, +% but WITHOUT ANY WARRANTY; without even the implied warranty of +% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +% GNU General Public License for more details. +% +% You should have received a copy of the GNU General Public License +% along with this program. If not, see . +%} +% ========================================================================= +%% 3-D Projection Branchless Distance Driven Code +*/ + +#include "projectionDDb_cuda.hpp" + +/**************************************************************************** +* CUDA Kernels * +****************************************************************************/ +__global__ void pad_volume_kernel(double* d_img, const int nPixXMap, const int nPixYMap, const int nElem, unsigned int nz) { + + const int threadGId = blockIdx.x * blockDim.x + threadIdx.x; + + // Make sure we don't try and access memory outside + // by having any threads mapped there return early + if (threadGId >= nElem) + return; + + d_img[(nz*nPixYMap *nPixXMap) + (threadGId*nPixYMap)] = 0; + + return; +} + +__global__ void map_boudaries_kernel(double* d_pBound, const int nElem, const double valueLeftBound, const double sizeElem, const double offset) { + + const int threadGId = blockIdx.x * blockDim.x + threadIdx.x; + + // Make sure we don't try and access memory outside + // by having any threads mapped there return early + if (threadGId >= nElem) + return; + + d_pBound[threadGId] = (threadGId - valueLeftBound) * sizeElem + offset; + + return; +} + +__global__ void rot_detector_kernel(double* d_pRdetY, double* d_pRdetZ, double* d_pYcoord, double* d_pZcoord, const double yOffset, const double zOffset, + const double phi, const int nElem) { + + const int threadGId = blockIdx.x * blockDim.x + threadIdx.x; + + // Make sure we don't try and access memory outside + // by having any threads mapped there return early + if (threadGId >= nElem) + return; + + // cos and sin are in measured in radians. + + d_pRdetY[threadGId] = ((d_pYcoord[threadGId] - yOffset)* cos(phi) - (d_pZcoord[threadGId] - zOffset)* sin(phi)) + yOffset; + d_pRdetZ[threadGId] = ((d_pYcoord[threadGId] - yOffset)* sin(phi) + (d_pZcoord[threadGId] - zOffset)* cos(phi)) + zOffset; + + return; + +} + +__global__ void mapDet2Slice_kernel(double* const pXmapp, double* const pYmapp, double tubeX, double tubeY, double tubeZ, double* const pXcoord, + double* const pYcoord, double* const pZcoord, double* const pZSlicecoord, const int nDetXMap, const int nDetYMap, const unsigned int nz) { + + /* + + Note: Matlab has linear indexing as a column of elements, i.e, the elements are actually stored in memory as queued columns. + So threads in X are launched in the column direction (DBT Y coord), and threads in Y are launched in the row direction (DBT X coord). + + */ + + const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y); + + const int thread_1D_pos = thread_2D_pos.y * nDetYMap + thread_2D_pos.x; + + // Make sure we don't try and access memory outside the detector + // by having any threads mapped there return early + if (thread_2D_pos.x >= nDetYMap || thread_2D_pos.y >= nDetXMap) + return; + + pXmapp[thread_1D_pos] = ((pXcoord[thread_2D_pos.y] - tubeX)*(pZSlicecoord[nz] - pZcoord[thread_2D_pos.x]) - (pXcoord[thread_2D_pos.y]*tubeZ) + (pXcoord[thread_2D_pos.y]*pZcoord[thread_2D_pos.x]))/(-tubeZ + pZcoord[thread_2D_pos.x]); + + if (thread_2D_pos.y == 0) + pYmapp[thread_2D_pos.x] = ((pYcoord[thread_2D_pos.x] - tubeY)*(pZSlicecoord[nz] - pZcoord[thread_2D_pos.x]) - (pYcoord[thread_2D_pos.x]*tubeZ) + (pYcoord[thread_2D_pos.x]*pZcoord[thread_2D_pos.x]))/(-tubeZ + pZcoord[thread_2D_pos.x]); + + return; +} + +__global__ void img_integration_kernel(double* d_img, const int nPixX, const int nPixY, bool direction, unsigned int offsetX, unsigned int offsetY, unsigned int nSlices) { + + /* + + Integration of 2D slices over the whole volume + + (S.1.Integration. - Liu et al(2017)) + + ** Perfom an inclusive scan ** + + */ + + const int3 memory_2D_pos = make_int3(blockIdx.x * blockDim.x + threadIdx.x + offsetX, + blockIdx.y * blockDim.y + threadIdx.y + offsetY, + blockIdx.z * blockDim.z + threadIdx.z); + + const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y); + + + // Make sure we don't try and access memory outside the detector + // by having any threads mapped there return early + if (memory_2D_pos.x >= nPixY || memory_2D_pos.y >= nPixX || memory_2D_pos.z >= nSlices) + return; + + + if (direction == integrateXcoord){ + + for (int s = 1; s <= blockDim.y; s *= 2) { + + int spot = thread_2D_pos.y - s; + + double val = 0; + + if (spot >= 0) { + val = d_img[(memory_2D_pos.z*nPixY*nPixX) + (offsetY + spot) * nPixY + memory_2D_pos.x]; + } + __syncthreads(); + + if (spot >= 0) { + d_img[(memory_2D_pos.z*nPixY*nPixX) + (memory_2D_pos.y * nPixY) + memory_2D_pos.x] += val; + } + __syncthreads(); + } + } + else + { + + for (int s = 1; s <= blockDim.x; s *= 2) { + + int spot = thread_2D_pos.x - s; + + double val = 0; + + if (spot >= 0) { + val = d_img[(memory_2D_pos.z*nPixY*nPixX) + memory_2D_pos.y * nPixY + spot + offsetX]; + } + __syncthreads(); + + if (spot >= 0) { + d_img[(memory_2D_pos.z*nPixY*nPixX) + (memory_2D_pos.y * nPixY) + memory_2D_pos.x] += val; + } + __syncthreads(); + } + + } + return; +} + + +__global__ void bilinear_interpolation_kernel_GPU(double* d_projI, double* d_pVolume, double* d_pDetmX, double* d_pDetmY, const int nDetXMap, const int nDetYMap, + const int nPixXMap, const int nPixYMap, const int nPixX, const int nPixY, const double pixelSize, const unsigned int nz) { + + const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y); + + // Make sure we don't try and access memory outside the detector + // by having any threads mapped there return early + if (thread_2D_pos.x >= nDetYMap || thread_2D_pos.y >= nDetXMap) + return; + + /* + + S.2. Interpolation - Liu et al (2017) + + Reference: + - https://en.wikipedia.org/wiki/Bilinear_interpolation + - https://stackoverflow.com/questions/21128731/bilinear-interpolation-in-c-c-and-cuda + + Note: *** We are using the Unit Square equation *** + + alpha = X - X1 + 1-alpha = X2 - X + + beta = Y - Y1 + 1-beta = Y2 - Y + + ----> Xcoord (thread_2D_pos.y) + | 0___________ + v |_d00_|_d10_| + Ycoord (thread_2D_pos.x) |_d01_|_d11_| + + */ + + // Adjust the mapped coordinates to cross the range of (0-nPixX).*dx + // Divide by pixelSize to get a unitary pixel size + const double xNormData = nPixX - d_pDetmX[thread_2D_pos.y * nDetYMap + thread_2D_pos.x] / pixelSize; + const signed int xData = floor(xNormData); + const double alpha = xNormData - xData; + + // Adjust the mapped coordinates to cross the range of (0-nPixY).*dy + // Divide by pixelSize to get a unitary pixel size + const double yNormData = (nPixY / 2.0) + (d_pDetmY[thread_2D_pos.x] / pixelSize); + const signed int yData = floor(yNormData); + const double beta = yNormData - yData; + + double d00, d01, d10, d11; + if (((xNormData) >= 0) && ((xNormData) <= nPixX) && ((yNormData) >= 0) && ((yNormData) <= nPixY)) d00 = d_pVolume[(nz*nPixYMap*nPixXMap) + (xData*nPixYMap + yData)]; else d00 = 0.0; + if (((xData + 1) > 0) && ((xData + 1) <= nPixX) && ((yNormData) >= 0) && ((yNormData) <= nPixY)) d10 = d_pVolume[(nz*nPixYMap*nPixXMap) + ((xData + 1)*nPixYMap + yData)]; else d10 = 0.0; + if (((xNormData) >= 0) && ((xNormData) <= nPixX) && ((yData + 1) > 0) && ((yData + 1) <= nPixY)) d01 = d_pVolume[(nz*nPixYMap*nPixXMap) + (xData*nPixYMap + yData + 1)]; else d01 = 0.0; + if (((xData + 1) > 0) && ((xData + 1) <= nPixX) && ((yData + 1) > 0) && ((yData + 1) <= nPixY)) d11 = d_pVolume[(nz*nPixYMap*nPixXMap) + ((xData + 1)*nPixYMap + yData + 1)]; else d11 = 0.0; + + double result_temp1 = alpha * d10 + (-d00 * alpha + d00); + double result_temp2 = alpha * d11 + (-d01 * alpha + d01); + + d_projI[thread_2D_pos.y * nDetYMap + thread_2D_pos.x] = beta * result_temp2 + (-result_temp1 * beta + result_temp1); + +} + +__global__ void differentiation_kernel(double* d_pProj, double* d_projI, double* const d_pDetmX, double* const d_pDetmY, double tubeX, double rtubeY, double rtubeZ, + double* const d_pDetX, double* const d_pRdetY, double* const d_pRdetZ, const int nDetX, const int nDetY, const int nDetXMap, + const int nDetYMap, const double du, const double dv, const double dx, const double dy, const double dz, const unsigned int p) { + + const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y); + + const int thread_1D_pos = (nDetX*nDetY*p) + (thread_2D_pos.y * nDetY) + thread_2D_pos.x; + + // Make sure we don't try and access memory outside the detector + // by having any threads mapped there return early + if (thread_2D_pos.x >= nDetY || thread_2D_pos.y >= nDetX) + return; + + /* + + S.3. Differentiation - Eq. 24 - Liu et al (2017) + + Detector integral projection + ___________ + |_A_|_B_|___| + |_C_|_D_|___| + |___|___|___| + + + (thread_2D_pos.x,thread_2D_pos.y) + ________________ + |_A_|__B__|_____| + |_C_|(0,0)|(0,1)| + |___|(1,0)|(1,1)| + + Threads are lauched from D up to nDetX (thread_2D_pos.y) and nDetY (thread_2D_pos.x) + i.e., they are running on the detector image. Thread (0,0) is on D. + + Coordinates on intergal projection: + + A = thread_2D_pos.y * nDetYMap + thread_2D_pos.x + B = ((thread_2D_pos.y+1) * nDetYMap) + thread_2D_pos.x + C = thread_2D_pos.y * nDetYMap + thread_2D_pos.x + 1 + D = ((thread_2D_pos.y+1) * nDetYMap) + thread_2D_pos.x + 1 + + */ + + unsigned int coordA = thread_2D_pos.y * nDetYMap + thread_2D_pos.x; + unsigned int coordB = ((thread_2D_pos.y + 1) * nDetYMap) + thread_2D_pos.x; + unsigned int coordC = coordA + 1; + unsigned int coordD = coordB + 1; + + + // Detector X coord overlap calculation + double detMapSizeX = d_pDetmX[coordA] - d_pDetmX[coordB]; + + // Detector Y coord overlap calculation + double detMapSizeY = d_pDetmY[thread_2D_pos.x+1] - d_pDetmY[thread_2D_pos.x]; + + // x - ray angle in X coord + double gamma = atan((d_pDetX[thread_2D_pos.y] + (du / 2.0) - tubeX) / rtubeZ); + + // x - ray angle in Y coord + double alpha = atan((d_pRdetY[thread_2D_pos.x] + (dv / 2.0) - rtubeY) / rtubeZ); + + + double dA, dB, dC, dD; + + dA = d_projI[coordA]; + dB = d_projI[coordB]; + dC = d_projI[coordC]; + dD = d_projI[coordD]; + + // Treat border of interpolated integral detector + if (dC == 0 && dD == 0) { + dC = dA; + dD = dB; + } + + + // S.3.Differentiation - Eq. 24 - Liu et al(2017) + d_pProj[thread_1D_pos] += ((dD - dC - dB + dA)*(dx*dy*dz / (cos(alpha)*cos(gamma)*detMapSizeX*detMapSizeY))); + + return; +} + + + +/**************************************************************************** +* function: projectionDDb() - CUDA projection Branchless Distance Driven. * +****************************************************************************/ + +void projectionDDb(double* const h_pProj, + double* const h_pVolume, + double* const h_pTubeAngle, + double* const h_pDetAngle, + const unsigned int nProj, + const unsigned int nPixX, + const unsigned int nPixY, + const unsigned int nSlices, + const unsigned int nDetX, + const unsigned int nDetY, + const signed int idXProj, + const double x_offset, + const double y_offset, + const double dx, + const double dy, + const double dz, + const double du, + const double dv, + const double DSD, + const double DDR, + const double DAG) +{ + + // Unique GPU + int devID = 0; + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, devID); + cudaCheckErrors("cudaGetDeviceProperties"); + + const unsigned int maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + + + printf("GPU Device %d: \"%s\" with compute capability %d.%d has %d Multi-Processors and %zu bytes of global memory\n\n", devID, + deviceProp.name, deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount, deviceProp.totalGlobalMem); + + + // Create timmer variables and start it + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + + //cudaStream_t stream1; + //cudaStreamCreate(&stream1); + + dim3 threadsPerBlock(1, 1, 1); + dim3 blockSize(1, 1, 1); + + + // Number of mapped detector and pixels + const int nDetXMap = nDetX + 1; + const int nDetYMap = nDetY + 1; + const int nPixXMap = nPixX + 1; + const int nPixYMap = nPixY + 1; + + + // Convention: h_ variables live on host + // Convention: d_ variables live on device (GPU global mem) + double* d_pProj; + double* d_projI; + double* d_pVolume; + double* d_pTubeAngle; + double* d_pDetAngle; + + + // Allocate global memory on the device, place result in "d_----" + cudaMalloc((void **)&d_pProj, nDetX*nDetY*nProj * sizeof(double)); + cudaMalloc((void **)&d_projI, nDetXMap*nDetYMap * sizeof(double)); + cudaMalloc((void **)&d_pVolume, nPixXMap*nPixYMap*nSlices * sizeof(double)); + cudaMalloc((void **)&d_pTubeAngle, nProj * sizeof(double)); + cudaMalloc((void **)&d_pDetAngle, nProj * sizeof(double)); + + cudaCheckErrors("cudaMalloc Initial"); + + // Copy data from host memory "h_----" to device memory "d_----" + cudaMemcpy((void *)d_pTubeAngle, (void *)h_pTubeAngle, nProj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy((void *)d_pDetAngle, (void *)h_pDetAngle, nProj * sizeof(double), cudaMemcpyHostToDevice); + + cudaCheckErrors("cudaMemcpy Angles"); + + + /* + Copy volume data from host memory "h_----" to device memory "d_----" padding with zeros for image integation + */ + + + // Initialize first column and row with zeros + double* h_pVolume_tmp; + double* d_pVolume_tmp; + + threadsPerBlock.x = maxThreadsPerBlock; + blockSize.x = (nPixXMap / maxThreadsPerBlock) + 1; + + for (int nz = 0; nz < nSlices; nz++) { + + // Pad on X coord direction + pad_volume_kernel << > > (d_pVolume, nPixXMap, nPixYMap, nPixXMap, nz); + + // Pad on Y coord direction + d_pVolume_tmp = d_pVolume + (nPixXMap*nPixYMap*nz) + 1; + cudaMemset(d_pVolume_tmp, 0, nPixY * sizeof(double)); + cudaCheckErrors("cudaMemset Padding Volume"); + } + + + // Copy volume data from host memory + for (unsigned int nz = 0; nz < nSlices; nz++) + for (unsigned int c = 0; c < nPixX; c++) { + + h_pVolume_tmp = h_pVolume + (c *nPixY) + (nPixX*nPixY*nz); + d_pVolume_tmp = d_pVolume + (((c+1) *nPixYMap) + 1) + (nPixXMap*nPixYMap*nz); + + cudaMemcpy((void *)d_pVolume_tmp, (void *)h_pVolume_tmp, nPixY * sizeof(double), cudaMemcpyHostToDevice); + cudaCheckErrors("cudaMemset Volume"); + } + + + + // Pointer for projections coordinates + double* d_pDetX; + double* d_pDetY; + double* d_pDetZ; + double* d_pObjX; + double* d_pObjY; + double* d_pObjZ; + + // Allocate global memory on the device for projections coordinates + cudaMalloc((void **)&d_pDetX, nDetXMap * sizeof(double)); + cudaMalloc((void **)&d_pDetY, nDetYMap * sizeof(double)); + cudaMalloc((void **)&d_pDetZ, nDetYMap * sizeof(double)); + cudaMalloc((void **)&d_pObjX, nPixXMap * sizeof(double)); + cudaMalloc((void **)&d_pObjY, nPixYMap * sizeof(double)); + cudaMalloc((void **)&d_pObjZ, nSlices * sizeof(double)); + + cudaCheckErrors("cudaMalloc Coordinates"); + + + + // Pointer for mapped coordinates + double* d_pDetmY; + double* d_pDetmX; + + + // Allocate global memory on the device for mapped coordinates + cudaMalloc((void **)&d_pDetmY, nDetYMap * sizeof(double)); + cudaMalloc((void **)&d_pDetmX, nDetYMap * nDetXMap * sizeof(double)); + + cudaCheckErrors("cudaMalloc Map-Coordinates"); + + + // Pointer for rotated detector coords + double* d_pRdetY; + double* d_pRdetZ; + + // Allocate global memory on the device for for rotated detector coords + cudaMalloc((void **)&d_pRdetY, nDetYMap * sizeof(double)); + cudaMalloc((void **)&d_pRdetZ, nDetYMap * sizeof(double)); + + cudaCheckErrors("cudaMalloc Rot-Coordinates"); + + // Generate detector and object boudaries + + threadsPerBlock.x = maxThreadsPerBlock; + + blockSize.x = (nDetX / maxThreadsPerBlock) + 1; + + map_boudaries_kernel << > > (d_pDetX, nDetXMap, (double)nDetX, -du, 0.0); + + blockSize.x = (nDetY / maxThreadsPerBlock) + 1; + + map_boudaries_kernel << > > (d_pDetY, nDetYMap, nDetY / 2.0, dv, 0.0); + + blockSize.x = (nPixX / maxThreadsPerBlock) + 1; + + map_boudaries_kernel << > > (d_pObjX, nPixXMap, (double)nPixX, -dx, x_offset); + + blockSize.x = (nPixY / maxThreadsPerBlock) + 1; + + map_boudaries_kernel << > > (d_pObjY, nPixYMap, nPixY / 2.0, dy, y_offset); + + blockSize.x = (nSlices / maxThreadsPerBlock) + 1; + + map_boudaries_kernel << > > (d_pObjZ, nSlices, 0.0, dz, DAG + (dz / 2.0)); + + + //printf("Map Boundaries -- Threads:%d Blocks:%d \n", threads, blocks); + + + // Initiate variables value with 0 + cudaMemset(d_pDetZ, 0, nDetYMap * sizeof(double)); + cudaMemset(d_pProj, 0, nDetX * nDetY * nProj * sizeof(double)); + + cudaCheckErrors("cudaMemset Zeros"); + + //cudaDeviceSynchronize(); + + // X - ray tube initial position + double tubeX = 0; + double tubeY = 0; + double tubeZ = DSD; + + // Iso - center position + double isoY = 0; + double isoZ = DDR; + + + // Integration of 2D slices over the whole volume + // (S.1.Integration. - Liu et al(2017)) + + // Naive integration o the X coord + threadsPerBlock.x = 10; + threadsPerBlock.y = 10; + threadsPerBlock.z = 10; + + blockSize.x = (unsigned int) ceil(((double)nPixYMap / (threadsPerBlock.x-1))); + blockSize.y = 1; + blockSize.z = (unsigned int) ceil((double)nSlices / threadsPerBlock.z); + + //printf("Divisao: %f \n", (double)nPixY / (threadsPerBlock.x - 1)); + //printf("Divisao ceil: %d \n", (unsigned int)ceil((double)nPixY / (threadsPerBlock.x - 1))); + + for (int k = 0; k < ceil((double)nPixXMap / (threadsPerBlock.x-1)); k++) { + + img_integration_kernel << > > (d_pVolume, nPixXMap, nPixYMap, integrateXcoord, 0, k * 9, nSlices); + + //printf("integration kernel -- threadsX:%d blocksX:%d threadsY:%d blocksY:%d \n", threadsPerBlock.x, blockSize.x, threadsPerBlock.y, blockSize.y); + + } + + // Naive integration o the Y coord + threadsPerBlock.x = 10; + threadsPerBlock.y = 10; + threadsPerBlock.z = 10; + + blockSize.x = 1; + blockSize.y = (unsigned int) ceil((double)nPixXMap / (threadsPerBlock.y-1)); + blockSize.z = (unsigned int) ceil((double)nSlices / threadsPerBlock.z); + + + for (int k = 0; k < ceil((double)nPixYMap / (threadsPerBlock.y-1)); k++) { + + img_integration_kernel << > > (d_pVolume, nPixXMap, nPixYMap, integrateYcoord, k * 9, 0, nSlices); + + //printf("integration kernel -- threadsX:%d blocksX:%d threadsY:%d blocksY:%d \n", threadsPerBlock.x, blockSize.x, threadsPerBlock.y, blockSize.y); + + } + + // Test if we will loop over all projs or not + unsigned int projIni, projEnd; + if(idXProj == -1){ + projIni = 0; + projEnd = nProj; + } + else{ + projIni = (unsigned int) idXProj; + projEnd = (unsigned int) idXProj + 1; + } + + // For each projection + for (unsigned int p = projIni; p < projEnd; p++) { + + // Get specif tube angle for the projection + double theta = h_pTubeAngle[p] * M_PI / 180.0; + + // Get specif detector angle for the projection + double phi = h_pDetAngle[p] * M_PI / 180.0; + + //printf("Tube angle:%f Det angle:%f\n", theta, phi); + + // Tube rotation + double rtubeY = ((tubeY - isoY)*cos(theta) - (tubeZ - isoZ)*sin(theta)) + isoY; + double rtubeZ = ((tubeY - isoY)*sin(theta) + (tubeZ - isoZ)*cos(theta)) + isoZ; + + //printf("R tube Y:%f R tube Z:%f\n", rtubeY, rtubeZ); + + // Detector rotation + threadsPerBlock.x = maxThreadsPerBlock; + threadsPerBlock.y = 1; + threadsPerBlock.z = 1; + + blockSize.x = (nDetYMap / maxThreadsPerBlock) + 1; + blockSize.y = 1; + blockSize.z = 1; + + rot_detector_kernel << > > (d_pRdetY, d_pRdetZ, d_pDetY, d_pDetZ, isoY, isoZ, phi, nDetYMap); + + //cudaDeviceSynchronize(); + + //printf("Detector rotation -- Threads:%d Blocks:%d \n", threads, blocks); + + + // For each slice + for (unsigned int nz = 0; nz < nSlices; nz++) { + + /* + + Map detector onto XY plane(Inside proj loop in case detector rotates) + + *** Note: Matlab has linear indexing as a column of elements, i.e, the elements are actually stored in memory as queued columns. + So threads in X are launched in the column direction (DBT Y coord), and threads in Y are launched in the row direction (DBT X coord). + + */ + + threadsPerBlock.x = 32; + threadsPerBlock.y = 32; + threadsPerBlock.z = 1; + + blockSize.x = (nDetYMap / threadsPerBlock.x) + 1; + blockSize.y = (nDetXMap / threadsPerBlock.y) + 1; + blockSize.z = 1; + + + mapDet2Slice_kernel << > > (d_pDetmX, d_pDetmY, tubeX, rtubeY, rtubeZ, d_pDetX, d_pRdetY, d_pRdetZ, d_pObjZ, nDetXMap, nDetYMap, nz); + //cudaDeviceSynchronize(); + + //printf("Map detector onto XY plane -- ThreadsX:%d ThreadsY:%d BlocksX:%d BlocksY:%d\n", threadsPerBlock.x, threadsPerBlock.y, blockSizeX, blockSizeY); + + /* + S.2. Interpolation - Liu et al (2017) + */ + + blockSize.x = (nDetYMap / threadsPerBlock.x) + 1; + blockSize.y = (nDetXMap / threadsPerBlock.y) + 1; + + bilinear_interpolation_kernel_GPU << > > (d_projI, d_pVolume, d_pDetmX, d_pDetmY, nDetXMap, nDetYMap, nPixXMap, nPixYMap, nPixX, nPixY, dx, nz); + + + /* + S.3. Differentiation - Eq. 24 - Liu et al (2017) + */ + + blockSize.x = (nDetY / threadsPerBlock.x) + 1; + blockSize.y = (nDetX / threadsPerBlock.y) + 1; + + differentiation_kernel << > > (d_pProj, d_projI, d_pDetmX, d_pDetmY, tubeX, rtubeY, rtubeZ, d_pDetX, d_pRdetY, d_pRdetZ, nDetX, nDetY, nDetXMap, nDetYMap, du, dv, dx, dy, dz, p); + + } // Loop end slices + + } // Loop end Projections + + // d_pProj + cudaMemcpy((void *)h_pProj, (void *)d_pProj, nProj* nDetX * nDetY * sizeof(double), cudaMemcpyDeviceToHost); + cudaCheckErrors("cudaMemcpy Final"); + + + cudaFree(d_pProj); + cudaFree(d_projI); + cudaFree(d_pVolume); + cudaFree(d_pTubeAngle); + cudaFree(d_pDetAngle); + cudaFree(d_pDetX); + cudaFree(d_pDetY); + cudaFree(d_pDetZ); + cudaFree(d_pObjX); + cudaFree(d_pObjY); + cudaFree(d_pObjZ); + cudaFree(d_pDetmY); + cudaFree(d_pDetmX); + cudaFree(d_pRdetY); + cudaFree(d_pRdetZ); + + cudaCheckErrors("cudaFree Final"); + + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + sdkDeleteTimer(&timer); + + cudaDeviceReset(); + + return; + +} \ No newline at end of file diff --git a/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.cpp b/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.cpp new file mode 100644 index 0000000..1266531 --- /dev/null +++ b/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.cpp @@ -0,0 +1,131 @@ +/* +%% Author: Rodrigo de Barros Vimieiro +% Date: Feb, 2022 +% rodrigo.vimieiro@gmail.com +% ========================================================================= +%{ +% ------------------------------------------------------------------------- +% projectionDDb_cuda_lib(pVolume, pProj, pGeo, idXProj) +% ------------------------------------------------------------------------- +% DESCRIPTION: +% This function calculates the volume projection based on the +% Branchless Distance-Driven principle. +% The geometry is for DBT with half cone-beam. All parameters are set +% in "ParameterSettings" code. +% +% INPUT: +% +% - pVolume = 3D volume for projection +% - pGeo = Parameter of all geometry +% - idXProj = projection number to be projected +% +% OUTPUT: +% +% - pProj = projections for each angle. +% +% Reference: +% - Branchless Distance Driven Projection and Backprojection, +% Samit Basu and Bruno De Man (2006) +% - GPU Acceleration of Branchless Distance Driven Projection and +% Backprojection, Liu et al (2016) +% - GPU-Based Branchless Distance-Driven Projection and Backprojection, +% Liu et al (2017) +% - A GPU Implementation of Distance-Driven Computed Tomography, +% Ryan D. Wagner (2017) +% --------------------------------------------------------------------- +% Copyright (C) <2022> +% +% This program is free software: you can redistribute it and/or modify +% it under the terms of the GNU General Public License as published by +% the Free Software Foundation, either version 3 of the License, or +% (at your option) any later version. +% +% This program is distributed in the hope that it will be useful, +% but WITHOUT ANY WARRANTY; without even the implied warranty of +% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +% GNU General Public License for more details. +% +% You should have received a copy of the GNU General Public License +% along with this program. If not, see . +%} +% ========================================================================= +%% 3-D Projection Branchless Distance Driven Code (CUDA) +*/ + +#include "projectionDDb_cuda.hpp" + + +extern "C" void projectionDDb_cuda_lib(double* const pVolume, + double* const pProj, + float* const pGeo, + const signed int idXProj){ + + + const int unsigned nPixX = (const int)pGeo[0]; + const int unsigned nPixY = (const int)pGeo[1]; + const int unsigned nSlices = (const int)pGeo[2]; + const int unsigned nDetX = (const int)pGeo[3]; + const int unsigned nDetY = (const int)pGeo[4]; + + const double dx = (const double)pGeo[5]; + const double dy = (const double)pGeo[6]; + const double dz = (const double)pGeo[7]; + const double du = (const double)pGeo[8]; + const double dv = (const double)pGeo[9]; + + const double DSD = (const double)pGeo[10]; + const double DDR = (const double)pGeo[12]; + const double DAG = (const double)pGeo[14]; + + const int unsigned nProj = (const int)pGeo[15]; + + const double tubeAngle = (const double)pGeo[16]; + const double detAngle = (const double)pGeo[17]; + + double* const pTubeAngle = (double*)malloc(nProj * sizeof(double)); + double* const pDetAngle = (double*)malloc(nProj * sizeof(double)); + + const double x_offset = (const double)pGeo[18]; + const double y_offset = (const double)pGeo[19]; + + linspace(-tubeAngle/2, tubeAngle/2, nProj, pTubeAngle); + linspace(-detAngle/2, detAngle/2, nProj, pDetAngle); + + // printf("Nx:%d Ny:%d Nz:%d \nNu:%d Nv:%d \nDx:%.2f Dy:%.2f Dz:%.2f \nDu:%.2f Dv:%.2f \nDSD:%.2f DDR:%.2f \nTube angle:%.2f \nDet angle:%.2f", nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, tubeAngle, detAngle); + + projectionDDb(pProj, pVolume, pTubeAngle, pDetAngle, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, idXProj, x_offset, y_offset, dx, dy, dz, du, dv, DSD, DDR, DAG); + + free(pTubeAngle); + free(pDetAngle); + + return; + +} + +// Linear spaced vector +// Ref: https://stackoverflow.com/a/27030598/8682939 +void linspace(double start, + double end, + int num, + double* pLinspaced){ + + double delta; + + if(num == 0) + delta = 0; + else{ + if(num == 1) + delta = 1; + else + delta = (end - start) / (num - 1); + } + + if((abs(start) < 0.00001) && (abs(end) < 0.00001)) + delta = 0; + + for (int k = 0; k < num; k++){ + pLinspaced[k] = start + k * delta; + } + + return; +} \ No newline at end of file diff --git a/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.hpp b/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.hpp new file mode 100644 index 0000000..23d041e --- /dev/null +++ b/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.hpp @@ -0,0 +1,86 @@ +/* +%% Author: Rodrigo de Barros Vimieiro +% Date: Feb, 2022 +% rodrigo.vimieiro@gmail.com +% ========================================================================= +%{ +% ------------------------------------------------------------------------- +% +% ------------------------------------------------------------------------- +% DESCRIPTION: +% This is the header function +% --------------------------------------------------------------------- +% Copyright (C) <2022> +% +% This program is free software: you can redistribute it and/or modify +% it under the terms of the GNU General Public License as published by +% the Free Software Foundation, either version 3 of the License, or +% (at your option) any later version. +% +% This program is distributed in the hope that it will be useful, +% but WITHOUT ANY WARRANTY; without even the implied warranty of +% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +% GNU General Public License for more details. +% +% You should have received a copy of the GNU General Public License +% along with this program. If not, see . +%} +% ========================================================================= +%% 3-D Projection Branchless Distance Driven header +*/ + +#include +#define _USE_MATH_DEFINES +#include + +// Includes CUDA +#include + +// Utilities and timing functions +#include // includes cuda.h and cuda_runtime_api.h + +// CUDA helper functions +#include // helper functions for CUDA error check + +#define integrateXcoord 1 +#define integrateYcoord 0 + + +/* +Reference: TIGRE - https://github.com/CERN/TIGRE +*/ +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("%s \n",msg);\ + } \ +} while (0) + +void linspace(double start, + double end, + int num, + double* pLinspaced); + +void projectionDDb(double* const h_pProj, + double* const h_pVolume, + double* const h_pTubeAngle, + double* const h_pDetAngle, + const unsigned int nProj, + const unsigned int nPixX, + const unsigned int nPixY, + const unsigned int nSlices, + const unsigned int nDetX, + const unsigned int nDetY, + const signed int idXProj, + const double x_offset, + const double y_offset, + const double dx, + const double dy, + const double dz, + const double du, + const double dv, + const double DSD, + const double DDR, + const double DAG); + diff --git a/setup.py b/setup.py index 8cf42e3..d2dc7fe 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,15 @@ import setuptools import os + from sys import platform +from pydbt.functions.utilities2setuptools import locate_cuda, get_custom_build_ext + +# Find CUDA (NVCC) +cuda_env = locate_cuda() + +# Create custom build ext +custom_build_ext = get_custom_build_ext(cuda_env) if platform == "darwin": @@ -19,34 +27,61 @@ raise ValueError('Windows is not supported yet.') + projectionDD = setuptools.Extension('projectionDD', sources = ['pydbt/sources/projectionDD/projectionDD.cpp'], - language='c++') + language='c++', + extra_compile_args = {"gcc" : [], + "nvcc" : []}) backprojectionDD = setuptools.Extension('backprojectionDD', sources = ['pydbt/sources/backprojectionDD/backprojectionDD.cpp'], - language='c++') + language='c++', + extra_compile_args = {"gcc" : [], + "nvcc" : []}) projectionDDb = setuptools.Extension('projectionDDb', sources = ['pydbt/sources/projectionDDb/projectionDDb.cpp'], language='c++', - extra_compile_args = ['-fopenmp'], + extra_compile_args = {"gcc" : ['-fopenmp'], + "nvcc" : []}, extra_link_args=['-lgomp']) backprojectionDDb = setuptools.Extension('backprojectionDDb', sources = ['pydbt/sources/backprojectionDDb/backprojectionDDb.cpp'], language='c++', - extra_compile_args = ['-fopenmp'], + extra_compile_args = {"gcc" : ['-fopenmp'], + "nvcc" : []}, extra_link_args=['-lgomp']) +projectionDDb_cuda = setuptools.Extension('projectionDDb_cuda', + sources = ['pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.cpp', 'pydbt/sources/projectionDDb_cuda/kernel.cu'], + library_dirs=[cuda_env['lib64']], + libraries=['cudart'], + language='c++', + runtime_library_dirs=[cuda_env['lib64']], + extra_compile_args = {"gcc" : [], + "nvcc" : ['-arch=sm_75', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + include_dirs = [cuda_env['include'], 'cuda-samples/Common/']) + +backprojectionDDb_cuda = setuptools.Extension('backprojectionDDb_cuda', + sources = ['pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.cpp', 'pydbt/sources/backprojectionDDb_cuda/kernel.cu'], + library_dirs=[cuda_env['lib64']], + libraries=['cudart'], + language='c++', + runtime_library_dirs=[cuda_env['lib64']], + extra_compile_args = {"gcc" : [], + "nvcc" : ['-arch=sm_75', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + include_dirs = [cuda_env['include'], 'cuda-samples/Common/']) + with open("README.md", "r") as fh: long_description = fh.read() setuptools.setup( name="pyDBT", - version="0.0.2", + version="0.0.3", author="Rodrigo Vimieiro", description="This package is a python extension of the DBT toolbox from LAVI-USP", long_description=long_description, @@ -60,5 +95,8 @@ ], python_requires='>=3.6', install_requires=["numpy", "matplotlib", "pydicom"], - ext_modules = [projectionDD, backprojectionDD, projectionDDb, backprojectionDDb] + ext_modules = [projectionDD, backprojectionDD, projectionDDb, backprojectionDDb, projectionDDb_cuda, backprojectionDDb_cuda], + + # Inject our custom trigger + cmdclass={'build_ext': custom_build_ext}, ) \ No newline at end of file