From 60eadc4b414bfd048ad3d0982099998c221a22fe Mon Sep 17 00:00:00 2001
From: rodrigovimieiro <rodrigo.vimieiro@gmail.com>
Date: Wed, 16 Feb 2022 13:23:46 -0500
Subject: [PATCH] CUDA Branchless Distance Driven codes

---
 README.md                                     |   6 +
 pydbt/functions/FBP.py                        |   5 +-
 pydbt/functions/SIRT.py                       |  13 +-
 pydbt/functions/projection_operators.py       | 162 ++++
 pydbt/functions/utilities2setuptools.py       | 103 +++
 .../backprojectionDD/backprojectionDD.cpp     |  29 +-
 .../backprojectionDDb/backprojectionDDb.cpp   |  47 +-
 .../backprojectionDDb/backprojectionDDb.hpp   |  15 +-
 .../backprojectionDDb_cuda.cpp                | 132 +++
 .../backprojectionDDb_cuda.hpp                |  85 ++
 .../sources/backprojectionDDb_cuda/kernel.cu  | 751 ++++++++++++++++++
 pydbt/sources/projectionDD/projectionDD.cpp   |  28 +-
 pydbt/sources/projectionDDb/projectionDDb.cpp |  47 +-
 pydbt/sources/projectionDDb/projectionDDb.hpp |  17 +-
 pydbt/sources/projectionDDb_cuda/kernel.cu    | 723 +++++++++++++++++
 .../projectionDDb_cuda/projectionDDb_cuda.cpp | 131 +++
 .../projectionDDb_cuda/projectionDDb_cuda.hpp |  86 ++
 setup.py                                      |  50 +-
 18 files changed, 2357 insertions(+), 73 deletions(-)
 create mode 100644 pydbt/functions/projection_operators.py
 create mode 100644 pydbt/functions/utilities2setuptools.py
 create mode 100644 pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.cpp
 create mode 100644 pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.hpp
 create mode 100644 pydbt/sources/backprojectionDDb_cuda/kernel.cu
 create mode 100644 pydbt/sources/projectionDDb_cuda/kernel.cu
 create mode 100644 pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.cpp
 create mode 100644 pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.hpp

diff --git a/README.md b/README.md
index 66db6a7..54badd0 100644
--- a/README.md
+++ b/README.md
@@ -15,10 +15,16 @@ This repository is a python extension of the [DBT toolbox](https://github.com/LA
 
     * ```cd pyDBT```
 
+ 3. Clone NVIDIA cuda-samples directory inside pyDBT:
+    
+    * ```git clone https://github.com/NVIDIA/cuda-samples```
+
  3. Install the package:
 
     * ```python3 setup.py install```
 
+ 3. If you have problems with `arch=sm_XX`, modify it in the `setup.py` accordingly to your NVIDIA-GPU architecture. This [link](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) has some references.
+
  4. run the example:
 
     * ```cd pydbt && python3 example.py```
diff --git a/pydbt/functions/FBP.py b/pydbt/functions/FBP.py
index bf33115..31c33ed 100755
--- a/pydbt/functions/FBP.py
+++ b/pydbt/functions/FBP.py
@@ -7,8 +7,7 @@
 """
 
 import numpy as np
-from .backprojectionDDb import backprojectionDDb
-from .backprojectionDD import backprojectionDD
+from .projection_operators import backprojectionDD
 
 def FDK(proj, geo, filterType, cutoff, libFiles):
     
@@ -20,7 +19,7 @@ def FDK(proj, geo, filterType, cutoff, libFiles):
     else:
         raise ValueError('Unknown filter type.')
         
-    vol = backprojectionDD(proj, geo, libFiles)
+    vol = backprojectionDD(proj, geo, -1, libFiles)
     
     return vol
     
diff --git a/pydbt/functions/SIRT.py b/pydbt/functions/SIRT.py
index 324559d..d543248 100755
--- a/pydbt/functions/SIRT.py
+++ b/pydbt/functions/SIRT.py
@@ -9,8 +9,7 @@
 import numpy as np
 import time
 
-from .backprojectionDDb import backprojectionDDb
-from .projectionDDb import projectionDDb
+from .projection_operators import backprojectionDDb, projectionDDb
 
 def SIRT(proj, geo, nIter, libFiles):
        
@@ -20,10 +19,12 @@ def SIRT(proj, geo, nIter, libFiles):
     reconData3d = np.zeros([geo.ny, geo.nx, geo.nz]) 
     
     # Pre calculation of Projection normalization
-    proj_norm = projectionDDb(np.ones([geo.ny, geo.nx, geo.nz]), geo, libFiles) 
+    proj_norm = projectionDDb(np.ones([geo.ny, geo.nx, geo.nz]), geo, -1, libFiles) 
+    proj_norm[proj_norm == 0] = 1
     
     # Pre calculation of Backprojection normalization
-    vol_norm = backprojectionDDb(np.ones([geo.nv, geo.nu, geo.nProj]), geo, libFiles) 
+    vol_norm = backprojectionDDb(np.ones([geo.nv, geo.nu, geo.nProj]), geo, -1, libFiles) 
+    vol_norm[vol_norm == 0] = 1
     
     print('----------------\nStarting SIRT Iterations... \n\n')
     
@@ -33,14 +34,14 @@ def SIRT(proj, geo, nIter, libFiles):
         startTime = time.time()
         
         # Error between raw data and projection of estimated data  
-        proj_diff = proj - projectionDDb(reconData3d, geo, libFiles)  
+        proj_diff = proj - projectionDDb(reconData3d, geo, -1, libFiles)  
         
         # Projection normalization
         proj_diff = proj_diff / proj_norm  
         proj_diff[np.isnan(proj_diff)] = 0 
         proj_diff[np.isinf(proj_diff)] = 0 
         
-        upt_term = backprojectionDDb(proj_diff, geo, libFiles) 
+        upt_term = backprojectionDDb(proj_diff, geo, -1, libFiles) 
         upt_term = upt_term / vol_norm  # Volume normalization
         upt_term[np.isnan(upt_term)] = 0 
         upt_term[np.isinf(upt_term)] = 0 
diff --git a/pydbt/functions/projection_operators.py b/pydbt/functions/projection_operators.py
new file mode 100644
index 0000000..491f787
--- /dev/null
+++ b/pydbt/functions/projection_operators.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Feb 16 10:29:14 2022
+
+@author: rodrigo
+"""
+
+import numpy as np
+import numpy.ctypeslib as ctl
+import ctypes
+import warnings
+
+from .utilities import findAndLoadLibray
+from .utilities import geoAsNp
+
+##############################################################################
+#                         BackProjection                                     #
+##############################################################################
+
+def backprojectionDD(proj, geo, proj_num, libFiles):
+    
+    operator_type = 'backprojectionDD'
+    
+    vol = _backprojection(proj, geo, proj_num, libFiles, operator_type)
+    
+    return vol
+
+def backprojectionDDb(proj, geo, proj_num, libFiles):
+    
+    operator_type = 'backprojectionDDb'
+    
+    vol = _backprojection(proj, geo, proj_num, libFiles, operator_type)
+        
+    return vol
+
+def backprojectionDDb_cuda(proj, geo, proj_num, libFiles):
+    
+    operator_type = 'backprojectionDDb_cuda'
+    
+    vol = _backprojection(proj, geo, proj_num, libFiles, operator_type)
+    
+    return vol
+    
+def _backprojection(proj, geo, proj_num, libFiles, operator_type):
+    
+    # Check if the input is in proper size
+    if not (proj.shape[0] == geo.nv and proj.shape[1] == geo.nu and proj.shape[2] == geo.nProj):
+        raise ValueError('First argument needs to have the same number of rows, cols and slices as in the configuration file.')
+    
+    check_parameters(operator_type, proj_num, geo)
+    
+    # Find and library    
+    lib = findAndLoadLibray(libFiles, operator_type)
+    
+    
+    backprojection = getattr(lib, operator_type + '_lib')
+    
+    backprojection.argtypes = [ctl.ndpointer(np.float64, flags='aligned, c_contiguous'),
+                               ctl.ndpointer(np.float64, flags='aligned, c_contiguous'),
+                               ctl.ndpointer(np.float32, flags='aligned, c_contiguous'),
+                               ctypes.c_long]
+    
+    
+    # Transform geo class in numpy array
+    geoNp = geoAsNp(geo) 
+    
+    
+    vol_transp = np.empty([geo.nz, geo.nx, geo.ny], dtype=np.float64)
+    
+    proj_transp = np.transpose(proj, (2, 1, 0)).copy()
+    
+    backprojection(proj_transp, vol_transp, geoNp, proj_num)
+    
+    vol = np.transpose(vol_transp, (2, 1, 0)).copy()
+    
+    
+    return vol
+    
+
+##############################################################################
+#                           Projection                                       #
+##############################################################################
+
+def projectionDD(vol, geo, proj_num, libFiles):
+    
+    operator_type = 'projectionDD'
+    
+    proj = _projection(vol, geo, proj_num, libFiles, operator_type)
+    
+    return proj
+
+def projectionDDb(vol, geo, proj_num, libFiles):
+    
+    operator_type = 'projectionDDb'
+    
+    proj = _projection(vol, geo, proj_num, libFiles, operator_type)
+    
+    return proj
+
+def projectionDDb_cuda(vol, geo, proj_num, libFiles):
+    
+    operator_type = 'projectionDDb_cuda'
+    
+    proj = _projection(vol, geo, proj_num, libFiles, operator_type)
+    
+    return proj
+
+def _projection(vol, geo, proj_num, libFiles, operator_type):
+    
+    # Check if the input is in proper size
+    if not (vol.shape[0] == geo.ny and vol.shape[1] == geo.nx and vol.shape[2] == geo.nz):
+        raise ValueError('First argument needs to have the same number of rows, cols and slices as in the configuration file.')
+       
+    check_parameters(operator_type, proj_num, geo)
+        
+    # Find and library    
+    lib = findAndLoadLibray(libFiles, operator_type)
+    
+    projection = getattr(lib, operator_type + '_lib')
+    
+    projection.argtypes = [ctl.ndpointer(np.float64, flags='aligned, c_contiguous'),
+                           ctl.ndpointer(np.float64, flags='aligned, c_contiguous'),
+                           ctl.ndpointer(np.float32, flags='aligned, c_contiguous'),
+                           ctypes.c_long]
+    
+    
+    # Transform geo class in numpy array
+    geoNp = geoAsNp(geo)
+    
+    
+    proj_transp = np.empty([geo.nProj, geo.nu, geo.nv], dtype=np.float64)
+    
+    vol_transp = np.transpose(vol, (2, 1, 0)).copy()
+        
+    projection(vol_transp, proj_transp, geoNp, proj_num)
+    
+    proj = np.transpose(proj_transp, (2, 1, 0)).copy()
+    
+    return proj
+
+##############################################################################
+#                           Auxiliar functions                               #
+##############################################################################
+
+
+def check_parameters(operator_type, proj_num, geo):
+    
+    if proj_num < -1 and proj_num >= geo.nProj:
+        raise ValueError('Projection number needs to be between 0-(geo.nProj-1). If you want to operate on all projections, set it to -1')
+        
+    if geo.detAngle != 0 and 'DDb' in operator_type:
+        warnings.warn('Your detector rotates, its not recomended to used branchless projectors. Use DD only')
+        
+    if (geo.x_offset != 0 or geo.y_offset != 0) and ('DDb' in operator_type):
+        raise ValueError('Branchless operators dont work with volume offsets')
+        
+    return 
+    
+    
+    
+    
diff --git a/pydbt/functions/utilities2setuptools.py b/pydbt/functions/utilities2setuptools.py
new file mode 100644
index 0000000..6487049
--- /dev/null
+++ b/pydbt/functions/utilities2setuptools.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 15 16:41:20 2022
+
+@author: Rodrigo
+
+Reference: https://stackoverflow.com/a/13300714/8682939
+
+"""
+import os 
+
+from os.path import join as pjoin
+from distutils.command.build_ext import build_ext
+
+
+def find_in_path(name, path):
+    "Find a file in a search path"
+    #adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
+    for dir in path.split(os.pathsep):
+        binpath = pjoin(dir, name)
+        if os.path.exists(binpath):
+            return os.path.abspath(binpath)
+    return None
+
+def locate_cuda():
+    """Locate the CUDA environment on the system
+
+    Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64'
+    and values giving the absolute path to each directory.
+
+    Starts by looking for the CUDAHOME env variable. If not found, everything
+    is based on finding 'nvcc' in the PATH.
+    """
+
+    # first check if the CUDAHOME env variable is in use
+    if 'CUDAHOME' in os.environ:
+        home = os.environ['CUDAHOME']
+        nvcc = pjoin(home, 'bin', 'nvcc')
+    else:
+        # otherwise, search the PATH for NVCC
+        nvcc = find_in_path('nvcc', os.environ['PATH'])
+        if nvcc is None:
+            raise EnvironmentError('The nvcc binary could not be '
+                'located in your $PATH. Either add it to your path, or set $CUDAHOME')
+        home = os.path.dirname(os.path.dirname(nvcc))
+
+    cudaconfig = {'home':home, 'nvcc':nvcc,
+                  'include': pjoin(home, 'include'),
+                  'lib64': pjoin(home, 'lib64')}
+    
+    for k, v in cudaconfig.items():
+        if not os.path.exists(v):
+            raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v))
+
+    return cudaconfig
+
+def get_custom_build_ext(cuda_env):
+    
+    def customize_compiler_for_nvcc(self):
+        """inject deep into distutils to customize how the dispatch
+        to gcc/nvcc works.
+
+        If you subclass UnixCCompiler, it's not trivial to get your subclass
+        injected in, and still have the right customizations (i.e.
+        distutils.sysconfig.customize_compiler) run on it. So instead of going
+        the OO route, I have this. Note, it's kindof like a wierd functional
+        subclassing going on."""
+        
+        # tell the compiler it can processes .cu
+        self.src_extensions.append('.cu')
+
+        # save references to the default compiler_so and _comple methods
+        default_compiler_so = self.compiler_so
+        super = self._compile
+
+        # now redefine the _compile method. This gets executed for each
+        # object but distutils doesn't have the ability to change compilers
+        # based on source extension: we add it.
+        def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
+            if os.path.splitext(src)[1] == '.cu':
+                # use the cuda for .cu files
+                self.set_executable('compiler_so', cuda_env['nvcc'])
+                # use only a subset of the extra_postargs, which are 1-1 translated
+                # from the extra_compile_args in the Extension class
+                postargs = extra_postargs['nvcc']
+            else:
+                postargs = extra_postargs['gcc']
+
+            super(obj, src, ext, cc_args, postargs, pp_opts)
+            # reset the default compiler_so, which we might have changed for cuda
+            self.compiler_so = default_compiler_so
+
+        # inject our redefined _compile method into the class
+        self._compile = _compile
+
+    # run the customize_compiler
+    class custom_build_ext(build_ext):
+        def build_extensions(self):
+            customize_compiler_for_nvcc(self.compiler)
+            build_ext.build_extensions(self)
+            
+    return custom_build_ext
\ No newline at end of file
diff --git a/pydbt/sources/backprojectionDD/backprojectionDD.cpp b/pydbt/sources/backprojectionDD/backprojectionDD.cpp
index 88e699b..6ccd5e1 100644
--- a/pydbt/sources/backprojectionDD/backprojectionDD.cpp
+++ b/pydbt/sources/backprojectionDD/backprojectionDD.cpp
@@ -5,7 +5,7 @@
 % =========================================================================
 %{
 % -------------------------------------------------------------------------
-%                 backprojectionDD_lib(pProj, pVolume, pGeo)
+%                 backprojectionDD_lib(pProj, pVolume, pGeo, idXProj)
 % -------------------------------------------------------------------------
 %     DESCRIPTION:
 %     This function reconstruct the 3D volume from projections, based on
@@ -18,6 +18,7 @@
 %
 %     - pProj = Pointer to 2D projections for each angle
 %     - pGeo = Pointer to parameters of all pGeometry
+%	  - idXProj = projection number to be projected
 %
 %     OUTPUT:
 %
@@ -54,7 +55,8 @@
 
 extern "C" void backprojectionDD_lib(double* const pProj, 
 								  	 double* const pVolume, 
-								  	 float* const pGeo){
+								  	 float* const pGeo,
+									 const signed int idXProj){
 
 
 	const int unsigned nPixX = (const int)pGeo[0]; 
@@ -81,8 +83,8 @@ extern "C" void backprojectionDD_lib(double* const pProj,
 	double* const pTubeAngle = (double*)malloc(nProj * sizeof(double));
 	double* const pDetAngle = (double*)malloc(nProj * sizeof(double));
 
-	const int x_offset = (const int)pGeo[18];
-	const int y_offset = (const int)pGeo[19];
+	const double x_offset = (const double)pGeo[18];
+	const double y_offset = (const double)pGeo[19];
 
 	linspace(-tubeAngle/2, tubeAngle/2, nProj, pTubeAngle);
 	linspace(-detAngle/2, detAngle/2, nProj, pDetAngle);
@@ -158,8 +160,23 @@ extern "C" void backprojectionDD_lib(double* const pProj,
 				pProjf[(p*nDetX*nDetY) + (x*nDetY) + y] = pProj[(p*nDetX*nDetY) + (x_inv*nDetY) + y];
 
 
+
+	// Test if we will loop over all projs or not
+	unsigned int projIni, projEnd, nProj2Run;
+	if(idXProj == -1){
+		projIni = 0;
+		projEnd = nProj;
+		nProj2Run = nProj;
+	}
+	else{
+		nProj2Run = 1;
+		projIni = (unsigned int) idXProj;
+		projEnd = (unsigned int) idXProj + 1;
+	}
+	
+	
 	// For each projection
-	for (int p = 0; p < nProj; p++) {
+	for (unsigned int p = projIni; p < projEnd; p++) {
 
 		// Get specif tube angle for the projection
 		double theta = pTubeAngle[p] * M_PI / 180.0;
@@ -378,7 +395,7 @@ extern "C" void backprojectionDD_lib(double* const pProj,
 	for (int z = 0; z < nSlices; z++)
 		for (int x = 0, x_inv = nPixX - 1; x < nPixX; x++, x_inv--)
 			for (int y = 0; y < nPixY; y++)
-				pVolume[(z*nPixX*nPixY) + (x*nPixY) + y] = pVolumet[(z*nPixX*nPixY) + (x_inv*nPixY) + y] / (double) nProj;
+				pVolume[(z*nPixX*nPixY) + (x*nPixY) + y] = pVolumet[(z*nPixX*nPixY) + (x_inv*nPixY) + y] / (double) nProj2Run;
 
 	free(pVolumet);
 
diff --git a/pydbt/sources/backprojectionDDb/backprojectionDDb.cpp b/pydbt/sources/backprojectionDDb/backprojectionDDb.cpp
index 1c83b6e..0110fcc 100644
--- a/pydbt/sources/backprojectionDDb/backprojectionDDb.cpp
+++ b/pydbt/sources/backprojectionDDb/backprojectionDDb.cpp
@@ -5,7 +5,7 @@
 % =========================================================================
 %{
 % -------------------------------------------------------------------------
-%                 backprojectionDDb_lib(pProj, pVolume, pGeo)
+%                 backprojectionDDb_lib(pProj, pVolume, pGeo, idXProj)
 % -------------------------------------------------------------------------
 %     DESCRIPTION:
 %     This function reconstruct the 3D volume from projections, based on
@@ -18,6 +18,7 @@
 %
 %     - pProj = Pointer to 2D projections for each angle
 %     - pGeo = Pointer to parameters of all pGeometry
+%	  - idXProj = projection number to be projected
 %
 %     OUTPUT:
 %
@@ -56,7 +57,8 @@
 
 extern "C" void backprojectionDDb_lib(double* const pProj, 
 								  	  double* const pVolume, 
-								  	  float* const pGeo){
+								  	  float* const pGeo,
+									  const signed int idXProj){
 
 
 	const int unsigned nPixX = (const int)pGeo[0]; 
@@ -83,15 +85,15 @@ extern "C" void backprojectionDDb_lib(double* const pProj,
 	double* const pTubeAngle = (double*)malloc(nProj * sizeof(double));
 	double* const pDetAngle = (double*)malloc(nProj * sizeof(double));
 
-	const int x_offset = (const int)pGeo[18];
-	const int y_offset = (const int)pGeo[19];
+	const double x_offset = (const double)pGeo[18];
+	const double y_offset = (const double)pGeo[19];
 
 	linspace(-tubeAngle/2, tubeAngle/2, nProj, pTubeAngle);
 	linspace(-detAngle/2, detAngle/2, nProj, pDetAngle);
 
 	// printf("Nx:%d Ny:%d Nz:%d \nNu:%d Nv:%d \nDx:%.2f Dy:%.2f Dz:%.2f \nDu:%.2f Dv:%.2f \nDSD:%.2f DDR:%.2f \nTube angle:%.2f \nDet angle:%.2f", nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, tubeAngle, detAngle);
 		
-	backprojectionDDb(pVolume, pProj, pTubeAngle, pDetAngle, x_offset, y_offset, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, DAG);
+	backprojectionDDb(pVolume, pProj, pTubeAngle, pDetAngle, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, idXProj, x_offset, y_offset, dx, dy, dz, du, dv, DSD, DDR, DAG);
 
 	free(pTubeAngle);
 	free(pDetAngle);
@@ -106,14 +108,15 @@ void backprojectionDDb(double* const pVolume,
 	double* const pProj,
 	double* const pTubeAngle,
 	double* const pDetAngle,
-	const  int x_offset,
-	const  int y_offset,
-	const  int nProj,
-	const  int nPixX,
-	const  int nPixY,
-	const  int nSlices,
-	const  int nDetX,
-	const  int nDetY,
+	const unsigned int nProj,
+	const unsigned int nPixX,
+	const unsigned int nPixY,
+	const unsigned int nSlices,
+	const unsigned int nDetX,
+	const unsigned int nDetY,
+	const signed int idXProj,
+	const double x_offset,
+	const double y_offset,
 	const double dx,
 	const double dy,
 	const double dz,
@@ -250,8 +253,22 @@ void backprojectionDDb(double* const pVolume,
 	
 	double* pDetmX_tmp = pDetmX + (nDetYMap * (nDetXMap - 2));
 
+	// Test if we will loop over all projs or not
+	unsigned int projIni, projEnd, nProj2Run;
+	if(idXProj == -1){
+		projIni = 0;
+		projEnd = nProj;
+		nProj2Run = nProj;
+	}
+	else{
+		nProj2Run = 1;
+		projIni = (unsigned int) idXProj;
+		projEnd = (unsigned int) idXProj + 1;
+	}
+	
+	
 	// For each projection
-	for (int p = 0; p < nProj; p++) {
+	for (unsigned int p = projIni; p < projEnd; p++) {
 		
 		// Get specif tube angle for the projection
 		double theta = pTubeAngle[p] * M_PI / 180.0;
@@ -309,7 +326,7 @@ void backprojectionDDb(double* const pVolume,
 	for (int nz = 0; nz < nSlices; nz++)
 		for (int x = 0; x < nPixX; x++)
 			for (int y = 0; y < nPixY; y++)
-				pVolume[(nz*nPixX*nPixY) + (x*nPixY) + y] = pVolumet[(nz*nPixX*nPixY) + (x*nPixY) + y] / (double) nProj;
+				pVolume[(nz*nPixX*nPixY) + (x*nPixY) + y] = pVolumet[(nz*nPixX*nPixY) + (x*nPixY) + y] / (double) nProj2Run;
 	
 	
 	free(pProjt);
diff --git a/pydbt/sources/backprojectionDDb/backprojectionDDb.hpp b/pydbt/sources/backprojectionDDb/backprojectionDDb.hpp
index 8539a9a..c3d477c 100644
--- a/pydbt/sources/backprojectionDDb/backprojectionDDb.hpp
+++ b/pydbt/sources/backprojectionDDb/backprojectionDDb.hpp
@@ -47,12 +47,15 @@ void backprojectionDDb(double* const pVolume,
 	double* const pProj,
 	double* const pTubeAngle,
 	double* const pDetAngle,
-	const  int nProj,
-	const  int nPixX,
-	const  int nPixY,
-	const  int nSlices,
-	const  int nDetX,
-	const  int nDetY,
+	const unsigned int nProj,
+	const unsigned int nPixX,
+	const unsigned int nPixY,
+	const unsigned int nSlices,
+	const unsigned int nDetX,
+	const unsigned int nDetY,
+	const signed int idXProj,
+	const double x_offset,
+	const double y_offset,
 	const double dx,
 	const double dy,
 	const double dz,
diff --git a/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.cpp b/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.cpp
new file mode 100644
index 0000000..0f1b693
--- /dev/null
+++ b/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.cpp
@@ -0,0 +1,132 @@
+/*
+%% Author: Rodrigo de Barros Vimieiro
+% Date: Feb, 2022
+% rodrigo.vimieiro@gmail.com
+% =========================================================================
+%{
+% -------------------------------------------------------------------------
+%                 backprojectionDDb_cuda_lib(pProj, pVolume, pGeo, idXProj)
+% -------------------------------------------------------------------------
+%     DESCRIPTION:
+%     This function reconstruct the 3D volume from projections, based on
+%     the Distance-Driven principle. It works by calculating the overlap
+%     in X and Y axis of the volume and the detector boundaries.
+%     The geometry is for DBT with half cone-beam. All parameters are set
+%     in "ParameterSettings" code.
+%
+%     INPUT:
+%
+%     - pProj = Pointer to 2D projections for each angle
+%     - pGeo = Pointer to parameters of all pGeometry
+%	  - idXProj = projection number to be projected
+%
+%     OUTPUT:
+%
+%     - pVolume = Pointer to reconstructed volume.
+%
+%     Reference:
+%     - Branchless Distance Driven Projection and Backprojection,
+%     Samit Basu and Bruno De Man (2006)
+%     - GPU Acceleration of Branchless Distance Driven Projection and
+%     Backprojection, Liu et al (2016)
+%     - GPU-Based Branchless Distance-Driven Projection and Backprojection,
+%     Liu et al (2017)
+%     - A GPU Implementation of Distance-Driven Computed Tomography,
+%     Ryan D. Wagner (2017)
+%     ---------------------------------------------------------------------
+%     Copyright (C) <2022>  <Rodrigo de Barros Vimieiro>
+%
+%     This program is free software: you can redistribute it and/or modify
+%     it under the terms of the GNU General Public License as published by
+%     the Free Software Foundation, either version 3 of the License, or
+%     (at your option) any later version.
+%
+%     This program is distributed in the hope that it will be useful,
+%     but WITHOUT ANY WARRANTY; without even the implied warranty of
+%     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+%     GNU General Public License for more details.
+%
+%     You should have received a copy of the GNU General Public License
+%     along with this program.  If not, see <http://www.gnu.org/licenses/>.
+%}
+% =========================================================================
+%% 3-D Distance Driven Back-projection Code
+*/
+
+#include "backprojectionDDb_cuda.hpp"
+
+extern "C" void backprojectionDDb_cuda_lib(double* const pProj, 
+										   double* const pVolume, 
+										   float* const pGeo,
+										   const signed int idXProj){
+
+
+	const int unsigned nPixX = (const int)pGeo[0]; 
+	const int unsigned nPixY = (const int)pGeo[1]; 
+	const int unsigned nSlices = (const int)pGeo[2]; 
+	const int unsigned nDetX = (const int)pGeo[3]; 
+	const int unsigned nDetY = (const int)pGeo[4]; 
+
+	const double dx = (const double)pGeo[5]; 
+	const double dy = (const double)pGeo[6]; 
+	const double dz = (const double)pGeo[7]; 
+	const double du = (const double)pGeo[8]; 
+	const double dv = (const double)pGeo[9]; 
+
+	const double DSD = (const double)pGeo[10]; 
+	const double DDR = (const double)pGeo[12]; 
+	const double DAG = (const double)pGeo[14]; 
+
+	const int unsigned nProj = (const int)pGeo[15];
+
+	const double tubeAngle = (const double)pGeo[16];
+	const double detAngle = (const double)pGeo[17];
+
+	double* const pTubeAngle = (double*)malloc(nProj * sizeof(double));
+	double* const pDetAngle = (double*)malloc(nProj * sizeof(double));
+
+	const double x_offset = (const double)pGeo[18];
+	const double y_offset = (const double)pGeo[19];
+
+	linspace(-tubeAngle/2, tubeAngle/2, nProj, pTubeAngle);
+	linspace(-detAngle/2, detAngle/2, nProj, pDetAngle);
+
+	// printf("Nx:%d Ny:%d Nz:%d \nNu:%d Nv:%d \nDx:%.2f Dy:%.2f Dz:%.2f \nDu:%.2f Dv:%.2f \nDSD:%.2f DDR:%.2f \nTube angle:%.2f \nDet angle:%.2f", nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, tubeAngle, detAngle);
+		
+	backprojectionDDb(pVolume, pProj, pTubeAngle, pDetAngle, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, idXProj, x_offset, y_offset, dx, dy, dz, du, dv, DSD, DDR, DAG);
+
+	free(pTubeAngle);
+	free(pDetAngle);
+
+	return;
+
+}
+
+// Linear spaced vector
+// Ref: https://stackoverflow.com/a/27030598/8682939
+void linspace(double start, 
+			  double end, 
+			  int num,
+			  double* pLinspaced){
+
+	double delta;
+
+	if(num == 0)
+		delta = 0;
+	else{
+		if(num == 1)
+			delta = 1;
+		else
+			delta = (end - start) / (num - 1);
+	}
+
+	if((abs(start) < 0.00001) && (abs(end) < 0.00001))
+		delta = 0;
+
+
+	for (int k = 0; k < num; k++){
+			pLinspaced[k] = start + k * delta;
+	}
+
+  return;
+}
\ No newline at end of file
diff --git a/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.hpp b/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.hpp
new file mode 100644
index 0000000..1d0dc26
--- /dev/null
+++ b/pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.hpp
@@ -0,0 +1,85 @@
+/*
+%% Author: Rodrigo de Barros Vimieiro
+% Date: Feb, 2022
+% rodrigo.vimieiro@gmail.com
+% =========================================================================
+%{
+% -------------------------------------------------------------------------
+%
+% -------------------------------------------------------------------------
+%     DESCRIPTION:
+%     This is the header function
+%     ---------------------------------------------------------------------
+%     Copyright (C) <2022>  <Rodrigo de Barros Vimieiro>
+%
+%     This program is free software: you can redistribute it and/or modify
+%     it under the terms of the GNU General Public License as published by
+%     the Free Software Foundation, either version 3 of the License, or
+%     (at your option) any later version.
+%
+%     This program is distributed in the hope that it will be useful,
+%     but WITHOUT ANY WARRANTY; without even the implied warranty of
+%     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+%     GNU General Public License for more details.
+%
+%     You should have received a copy of the GNU General Public License
+%     along with this program.  If not, see <http://www.gnu.org/licenses/>.
+%}
+% =========================================================================
+%% 3-D Distance Driven Back-projection header
+*/
+
+#include <stdio.h>
+#define _USE_MATH_DEFINES
+#include <math.h>
+
+// Includes CUDA
+#include <cuda_runtime.h>
+
+// Utilities and timing functions
+#include <helper_functions.h>    // includes cuda.h and cuda_runtime_api.h
+
+// CUDA helper functions
+#include <helper_cuda.h>         // helper functions for CUDA error check
+
+#define integrateXcoord 1
+#define integrateYcoord 0
+
+
+/*
+Reference: TIGRE - https://github.com/CERN/TIGRE
+*/
+#define cudaCheckErrors(msg) \
+do { \
+        cudaError_t __err = cudaGetLastError(); \
+        if (__err != cudaSuccess) { \
+                printf("%s \n",msg);\
+        } \
+} while (0)
+
+void linspace(double start, 
+                double end, 
+                int num,
+                double* pLinspaced);
+
+void backprojectionDDb(double* const h_pVolume,
+	double* const h_pProj,
+	double* const h_pTubeAngle,
+	double* const h_pDetAngle,
+	const unsigned int nProj,
+	const unsigned int nPixX,
+	const unsigned int nPixY,
+	const unsigned int nSlices,
+	const unsigned int nDetX,
+	const unsigned int nDetY,
+	const signed int idXProj,
+	const double x_offset,
+	const double y_offset,
+	const double dx,
+	const double dy,
+	const double dz,
+	const double du,
+	const double dv,
+	const double DSD,
+	const double DDR,
+	const double DAG);
diff --git a/pydbt/sources/backprojectionDDb_cuda/kernel.cu b/pydbt/sources/backprojectionDDb_cuda/kernel.cu
new file mode 100644
index 0000000..d54f463
--- /dev/null
+++ b/pydbt/sources/backprojectionDDb_cuda/kernel.cu
@@ -0,0 +1,751 @@
+/*
+%% Author: Rodrigo de Barros Vimieiro
+% Date: Feb, 2022
+% rodrigo.vimieiro@gmail.com
+% =========================================================================
+%{
+% -------------------------------------------------------------------------
+%                 backprojectionDDb_cuda_lib(pProj, pVolume, pGeo, idXProj)
+% -------------------------------------------------------------------------
+%     DESCRIPTION:
+%     This function reconstruct the 3D volume from projections, based on
+%     the Distance-Driven principle. It works by calculating the overlap
+%     in X and Y axis of the volume and the detector boundaries.
+%     The geometry is for DBT with half cone-beam. All parameters are set
+%     in "ParameterSettings" code.
+%
+%     INPUT:
+%
+%     - pProj = Pointer to 2D projections for each angle
+%     - pGeo = Pointer to parameters of all pGeometry
+%	  - idXProj = projection number to be projected
+%
+%     OUTPUT:
+%
+%     - pVolume = Pointer to reconstructed volume.
+%
+%     Reference:
+%     - Branchless Distance Driven Projection and Backprojection,
+%     Samit Basu and Bruno De Man (2006)
+%     - GPU Acceleration of Branchless Distance Driven Projection and
+%     Backprojection, Liu et al (2016)
+%     - GPU-Based Branchless Distance-Driven Projection and Backprojection,
+%     Liu et al (2017)
+%     - A GPU Implementation of Distance-Driven Computed Tomography,
+%     Ryan D. Wagner (2017)
+%     ---------------------------------------------------------------------
+%     Copyright (C) <2022>  <Rodrigo de Barros Vimieiro>
+%
+%     This program is free software: you can redistribute it and/or modify
+%     it under the terms of the GNU General Public License as published by
+%     the Free Software Foundation, either version 3 of the License, or
+%     (at your option) any later version.
+%
+%     This program is distributed in the hope that it will be useful,
+%     but WITHOUT ANY WARRANTY; without even the implied warranty of
+%     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+%     GNU General Public License for more details.
+%
+%     You should have received a copy of the GNU General Public License
+%     along with this program.  If not, see <http://www.gnu.org/licenses/>.
+%}
+% =========================================================================
+%% 3-D Distance Driven Back-projection Code
+*/
+
+#include "backprojectionDDb_cuda.hpp"
+
+
+/****************************************************************************
+*							CUDA Kernels									*
+****************************************************************************/
+__global__ void pad_projections_kernel(double* d_img, const int nDetXMap, const int nDetYMap, const int nElem, unsigned int np) {
+
+	const int threadGId = blockIdx.x * blockDim.x + threadIdx.x;
+
+	// Make sure we don't try and access memory outside
+	// by having any threads mapped there return early
+	if (threadGId >= nElem)
+		return;
+
+	d_img[(np*nDetYMap *nDetXMap) + (threadGId*nDetYMap)] = 0;
+
+	return;
+}
+
+__global__ void map_boudaries_kernel(double* d_pBound, const int nElem, const double valueLeftBound, const double sizeElem, const double offset) {
+
+	const int threadGId = blockIdx.x * blockDim.x + threadIdx.x;
+
+	// Make sure we don't try and access memory outside
+	// by having any threads mapped there return early
+	if (threadGId >= nElem)
+		return;
+
+	d_pBound[threadGId] = (threadGId - valueLeftBound) * sizeElem + offset;
+
+	return;
+}
+
+__global__ void rot_detector_kernel(double* d_pRdetY, double* d_pRdetZ, double* d_pYcoord, double* d_pZcoord, const double yOffset, const double zOffset,
+	const double phi, const int nElem) {
+
+	const int threadGId = blockIdx.x * blockDim.x + threadIdx.x;
+
+	// Make sure we don't try and access memory outside
+	// by having any threads mapped there return early
+	if (threadGId >= nElem)
+		return;
+
+	// cos and sin are in measured in radians.
+
+	d_pRdetY[threadGId] = ((d_pYcoord[threadGId] - yOffset)* cos(phi) - (d_pZcoord[threadGId] - zOffset)* sin(phi)) + yOffset;
+	d_pRdetZ[threadGId] = ((d_pYcoord[threadGId] - yOffset)* sin(phi) + (d_pZcoord[threadGId] - zOffset)* cos(phi)) + zOffset;
+
+	return;
+
+}
+
+__global__ void mapDet2Slice_kernel(double* const pXmapp, double* const pYmapp, double tubeX, double tubeY, double tubeZ, double* const pXcoord,
+	double* const pYcoord, double* const pZcoord, double* const pZSlicecoord, const int nDetXMap, const int nDetYMap, const unsigned int nz) {
+
+	/*
+
+	Note: Matlab has linear indexing as a column of elements, i.e, the elements are actually stored in memory as queued columns.
+	So threads in X are launched in the column direction (DBT Y coord), and threads in Y are launched in the row direction (DBT X coord).
+
+	*/
+
+	const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
+		blockIdx.y * blockDim.y + threadIdx.y);
+
+	const int thread_1D_pos = thread_2D_pos.y * nDetYMap + thread_2D_pos.x;
+
+	// Make sure we don't try and access memory outside the detector
+	// by having any threads mapped there return early
+	if (thread_2D_pos.x >= nDetYMap || thread_2D_pos.y >= nDetXMap)
+		return;
+
+	pXmapp[thread_1D_pos] = ((pXcoord[thread_2D_pos.y] - tubeX)*(pZSlicecoord[nz] - pZcoord[thread_2D_pos.x]) - (pXcoord[thread_2D_pos.y] * tubeZ) + (pXcoord[thread_2D_pos.y] * pZcoord[thread_2D_pos.x])) / (-tubeZ + pZcoord[thread_2D_pos.x]);
+
+	if (thread_2D_pos.y == 0)
+		pYmapp[thread_2D_pos.x] = ((pYcoord[thread_2D_pos.x] - tubeY)*(pZSlicecoord[nz] - pZcoord[thread_2D_pos.x]) - (pYcoord[thread_2D_pos.x] * tubeZ) + (pYcoord[thread_2D_pos.x] * pZcoord[thread_2D_pos.x])) / (-tubeZ + pZcoord[thread_2D_pos.x]);
+
+	return;
+}
+
+__global__ void img_integration_kernel(double* d_img, const int nPixX, const int nPixY, bool direction, unsigned int offsetX, unsigned int offsetY, unsigned int nSlices) {
+
+	/*
+
+	Integration of 2D slices over the whole volume
+
+	(S.1.Integration. - Liu et al(2017))
+
+	** Perfom an inclusive scan **
+
+	*/
+
+	const int3 memory_2D_pos = make_int3(blockIdx.x * blockDim.x + threadIdx.x + offsetX,
+		blockIdx.y * blockDim.y + threadIdx.y + offsetY,
+		blockIdx.z * blockDim.z + threadIdx.z);
+
+	const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
+		blockIdx.y * blockDim.y + threadIdx.y);
+
+
+	// Make sure we don't try and access memory outside the detector
+	// by having any threads mapped there return early
+	if (memory_2D_pos.x >= nPixY || memory_2D_pos.y >= nPixX || memory_2D_pos.z >= nSlices)
+		return;
+
+
+	if (direction == integrateXcoord) {
+
+		for (int s = 1; s <= blockDim.y; s *= 2) {
+
+			int spot = thread_2D_pos.y - s;
+
+			double val = 0;
+
+			if (spot >= 0) {
+				val = d_img[(memory_2D_pos.z*nPixY*nPixX) + (offsetY + spot) * nPixY + memory_2D_pos.x];
+			}
+			__syncthreads();
+
+			if (spot >= 0) {
+				d_img[(memory_2D_pos.z*nPixY*nPixX) + (memory_2D_pos.y * nPixY) + memory_2D_pos.x] += val;
+			}
+			__syncthreads();
+		}
+	}
+	else
+	{
+
+		for (int s = 1; s <= blockDim.x; s *= 2) {
+
+			int spot = thread_2D_pos.x - s;
+
+			double val = 0;
+
+			if (spot >= 0) {
+				val = d_img[(memory_2D_pos.z*nPixY*nPixX) + memory_2D_pos.y * nPixY + spot + offsetX];
+			}
+			__syncthreads();
+
+			if (spot >= 0) {
+				d_img[(memory_2D_pos.z*nPixY*nPixX) + (memory_2D_pos.y * nPixY) + memory_2D_pos.x] += val;
+			}
+			__syncthreads();
+		}
+
+	}
+	return;
+}
+
+__global__ void bilinear_interpolation_kernel_GPU(double* d_sliceI, double* d_pProj, double* d_pObjX, double* d_pObjY, double* d_pDetmX, double* d_pDetmY, const int nPixXMap, const int nPixYMap,
+	const int nDetXMap, const int nDetYMap, const int nDetX, const int nDetY, const unsigned int np) {
+
+	const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
+										 blockIdx.y * blockDim.y + threadIdx.y);
+
+	// Make sure we don't try and access memory outside the detector
+	// by having any threads mapped there return early
+	if (thread_2D_pos.x >= nPixYMap || thread_2D_pos.y >= nPixXMap)
+		return;
+
+	/*
+
+	S.2. Interpolation - Liu et al (2017)
+
+	Reference:
+	- https://en.wikipedia.org/wiki/Bilinear_interpolation
+	- https://stackoverflow.com/questions/21128731/bilinear-interpolation-in-c-c-and-cuda
+
+	Note: *** We are using the Unit Square equation ***
+
+	alpha = X - X1
+	1-alpha = X2 - X
+
+	beta = Y - Y1
+	1-beta = Y2 - Y
+
+	----> Xcoord (thread_2D_pos.y)
+	|							0___________
+	v							|_d00_|_d10_|
+	Ycoord (thread_2D_pos.x)	|_d01_|_d11_|
+
+	Matlab code --
+	objX = nDetX - (objX ./ detmX(1, end - 1));
+	objY = (objY ./ detmX(1, end - 1)) - (detmY(1) / detmX(1, end - 1));
+
+	*/
+
+	// Adjust the mapped coordinates to cross the range of (0-nDetX).*duMap 
+	//  Divide by pixelSize to get a unitary pixel size
+	const double xNormData = nDetX - d_pObjX[thread_2D_pos.y] / d_pDetmX[0];
+	const signed int    xData = floor(xNormData);
+	const double  alpha = xNormData - xData;
+
+	// Adjust the mapped coordinates to cross the range of (0-nDetY).*dyMap  
+	//  Divide by pixelSize to get a unitary pixel size
+	const double yNormData = (d_pObjY[thread_2D_pos.x] / d_pDetmX[0]) - (d_pDetmY[0] / d_pDetmX[0]);
+	const signed int    yData = floor(yNormData);
+	const double  beta = yNormData - yData;
+
+	double d00, d01, d10, d11;
+	if (((xNormData) >= 0) && ((xNormData) <= nDetX) && ((yNormData) >= 0) && ((yNormData) <= nDetY))	d00 = d_pProj[(np*nDetYMap*nDetXMap) + (xData*nDetYMap + yData)];						else    d00 = 0.0;
+	if (((xData + 1) > 0) && ((xData + 1) <= nDetX) && ((yNormData) >= 0) && ((yNormData) <= nDetY))	d10 = d_pProj[(np*nDetYMap*nDetXMap) + ((xData + 1)*nDetYMap + yData)]; 				else    d10 = 0.0;
+	if (((xNormData) >= 0) && ((xNormData) <= nDetX) && ((yData + 1) > 0) && ((yData + 1) <= nDetY))	d01 = d_pProj[(np*nDetYMap*nDetXMap) + (xData*nDetYMap + yData + 1)]; 				else    d01 = 0.0;
+	if (((xData + 1) > 0) && ((xData + 1) <= nDetX) && ((yData + 1) > 0) && ((yData + 1) <= nDetY))		d11 = d_pProj[(np*nDetYMap*nDetXMap) + ((xData + 1)*nDetYMap + yData + 1)];			else    d11 = 0.0;
+
+	double result_temp1 = alpha * d10 + (-d00 * alpha + d00);
+	double result_temp2 = alpha * d11 + (-d01 * alpha + d01);
+
+	d_sliceI[thread_2D_pos.y * nPixYMap + thread_2D_pos.x] = beta * result_temp2 + (-result_temp1 * beta + result_temp1);
+
+}
+
+__global__ void differentiation_kernel(double* d_pVolume, double* d_sliceI, double tubeX, double rtubeY, double rtubeZ,
+	double* const d_pObjX, double* const d_pObjY, double* const d_pObjZ, const int nPixX, const int nPixY, const int nPixXMap,
+	const int nPixYMap, const double du, const double dv, const double dx, const double dy, const double dz, const unsigned int nz) {
+
+	const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
+		blockIdx.y * blockDim.y + threadIdx.y);
+
+	const int thread_1D_pos = (nPixX*nPixY*nz) + (thread_2D_pos.y * nPixY) + thread_2D_pos.x;
+
+	// Make sure we don't try and access memory outside the detector
+	// by having any threads mapped there return early
+	if (thread_2D_pos.x >= nPixY || thread_2D_pos.y >= nPixX)
+		return;
+
+	/*
+
+	S.3. Differentiation - Eq. 24 - Liu et al (2017)
+
+	Detector integral projection
+	___________
+	|_A_|_B_|___|
+	|_C_|_D_|___|
+	|___|___|___|
+
+
+	(thread_2D_pos.x,thread_2D_pos.y)
+	________________
+	|_A_|__B__|_____|
+	|_C_|(0,0)|(0,1)|
+	|___|(1,0)|(1,1)|
+
+	Threads are lauched from D up to nPixX (thread_2D_pos.y) and nPixY (thread_2D_pos.x)
+	i.e., they are running on the detector image. Thread (0,0) is on D.
+
+	Coordinates on intergal projection:
+
+	A = thread_2D_pos.y * nPixYMap + thread_2D_pos.x
+	B = ((thread_2D_pos.y+1) * nPixYMap) + thread_2D_pos.x
+	C = thread_2D_pos.y * nPixYMap + thread_2D_pos.x + 1
+	D = ((thread_2D_pos.y+1) * nPixYMap) + thread_2D_pos.x + 1
+
+	*/
+
+	unsigned int coordA = thread_2D_pos.y * nPixYMap + thread_2D_pos.x;
+	unsigned int coordB = ((thread_2D_pos.y + 1) * nPixYMap) + thread_2D_pos.x;
+	unsigned int coordC = coordA + 1;
+	unsigned int coordD = coordB + 1;
+
+	// x - ray angle in X coord
+	double gamma = atan((d_pObjX[thread_2D_pos.y] + (dx / 2.0) - tubeX) / (rtubeZ - d_pObjZ[nz]));
+
+	// x - ray angle in Y coord
+	double alpha = atan((d_pObjY[thread_2D_pos.x] + (dy / 2.0) - rtubeY) / (rtubeZ - d_pObjZ[nz]));
+
+
+	double dA, dB, dC, dD;
+
+	dA = d_sliceI[coordA];
+	dB = d_sliceI[coordB];
+	dC = d_sliceI[coordC];
+	dD = d_sliceI[coordD];
+
+	// Treat border of interpolated integral detector
+	if (dC == 0 && dD == 0) {
+		dC = dA;
+		dD = dB;
+	}
+
+
+	// S.3.Differentiation - Eq. 24 - Liu et al(2017)
+	d_pVolume[thread_1D_pos] += ((dD - dC - dB + dA)*(du*dv*dz / (cos(alpha)*cos(gamma)*dx*dy)));
+
+	return;
+}
+
+__global__ void division_kernel(double* d_img, const int nPixX, const int nPixY, unsigned int nSlices, unsigned int nProj){
+
+	const int3 thread_2D_pos = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
+		blockIdx.y * blockDim.y + threadIdx.y,
+		blockIdx.z * blockDim.z + threadIdx.z);
+
+	const int thread_1D_pos = (nPixX*nPixY*thread_2D_pos.z) + (thread_2D_pos.y * nPixY) + thread_2D_pos.x;
+
+	// Make sure we don't try and access memory outside the detector
+	// by having any threads mapped there return early
+	if (thread_2D_pos.x >= nPixY || thread_2D_pos.y >= nPixX || thread_2D_pos.z >= nSlices)
+		return;
+
+	d_img[thread_1D_pos] /= (double) nProj;
+
+}
+
+
+/****************************************************************************
+* function: backprojectionDDb() - CUDA backprojection Branchless Distance Driven.	*
+****************************************************************************/
+
+void backprojectionDDb(double* const h_pVolume,
+	double* const h_pProj,
+	double* const h_pTubeAngle,
+	double* const h_pDetAngle,
+	const unsigned int nProj,
+	const unsigned int nPixX,
+	const unsigned int nPixY,
+	const unsigned int nSlices,
+	const unsigned int nDetX,
+	const unsigned int nDetY,
+	const signed int idXProj,
+	const double x_offset,
+	const double y_offset,
+	const double dx,
+	const double dy,
+	const double dz,
+	const double du,
+	const double dv,
+	const double DSD,
+	const double DDR,
+	const double DAG)
+{
+
+	// Unique GPU
+	int devID = 0;
+
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, devID);
+	cudaCheckErrors("cudaGetDeviceProperties");
+
+	const unsigned int maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
+
+
+	printf("GPU Device %d: \"%s\" with compute capability %d.%d has %d Multi-Processors and %zu bytes of global memory\n\n", devID,
+		deviceProp.name, deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount, deviceProp.totalGlobalMem);
+
+	
+	// Create timmer variables and start it
+	StopWatchInterface *timer = NULL;
+	sdkCreateTimer(&timer);
+	sdkStartTimer(&timer);
+	
+	//cudaStream_t stream1;
+	//cudaStreamCreate(&stream1);
+
+	dim3 threadsPerBlock(1, 1, 1);
+	dim3 blockSize(1, 1, 1);
+
+
+	// Number of mapped detector and pixels
+	const int nDetXMap = nDetX + 1;
+	const int nDetYMap = nDetY + 1;
+	const int nPixXMap = nPixX + 1;
+	const int nPixYMap = nPixY + 1;
+
+
+	// Convention: h_ variables live on host
+	// Convention: d_ variables live on device (GPU global mem)
+	double* d_pProj;
+	double* d_sliceI;
+	double* d_pVolume;
+	double* d_pTubeAngle;
+	double* d_pDetAngle;
+	
+	
+	// Allocate global memory on the device, place result in "d_----"
+	cudaMalloc((void **)&d_pProj, nDetXMap*nDetYMap*nProj * sizeof(double)); 
+	cudaMalloc((void **)&d_sliceI, nPixXMap*nPixYMap * sizeof(double));
+	cudaMalloc((void **)&d_pVolume, nPixX*nPixY*nSlices * sizeof(double));
+	cudaMalloc((void **)&d_pTubeAngle, nProj * sizeof(double));
+	cudaMalloc((void **)&d_pDetAngle, nProj * sizeof(double));
+	
+	cudaCheckErrors("cudaMalloc Initial");
+	
+	// Copy data from host memory "h_----" to device memory "d_----"
+	cudaMemcpy((void *)d_pTubeAngle, (void *)h_pTubeAngle, nProj * sizeof(double), cudaMemcpyHostToDevice);
+	cudaMemcpy((void *)d_pDetAngle, (void *)h_pDetAngle, nProj * sizeof(double), cudaMemcpyHostToDevice);
+
+	cudaCheckErrors("cudaMemcpy Angles");
+	
+
+	/*
+	Copy projection data from host memory "h_----" to device memory "d_----" padding with zeros for image integation
+	*/
+	
+
+	// Initialize first column and row with zeros
+	double* h_pProj_tmp;
+	double* d_pProj_tmp;
+
+	threadsPerBlock.x = maxThreadsPerBlock;
+	blockSize.x = (nDetXMap / maxThreadsPerBlock) + 1;
+
+	for (unsigned int np = 0; np < nProj; np++) {
+
+		// Pad on X coord direction
+		pad_projections_kernel << <blockSize, threadsPerBlock >> > (d_pProj, nDetXMap, nDetYMap, nDetXMap, np);
+
+		// Pad on Y coord direction
+		d_pProj_tmp = d_pProj + (nDetXMap*nDetYMap*np) + 1;
+		cudaMemset(d_pProj_tmp, 0, nPixY * sizeof(double));
+		cudaCheckErrors("cudaMemset Padding Projections");
+	}
+	
+	
+	// Copy projections data from host memory
+	for (unsigned int np = 0; np < nProj; np++)
+		for (unsigned int c = 0; c < nDetX; c++) {
+
+			h_pProj_tmp = h_pProj + (c *nDetY) + (nDetX*nDetY*np);
+			d_pProj_tmp = d_pProj + (((c + 1) *nDetYMap) + 1) + (nDetXMap*nDetYMap*np);
+
+			cudaMemcpy((void *)d_pProj_tmp, (void *)h_pProj_tmp, nDetY * sizeof(double), cudaMemcpyHostToDevice);
+			cudaCheckErrors("cudaMemcpy Projections");
+		}
+
+	
+	
+	// Pointer for projections coordinates
+	double* d_pDetX;
+	double* d_pDetY;
+	double* d_pDetZ;
+	double* d_pObjX;
+	double* d_pObjY;
+	double* d_pObjZ;
+
+	// Allocate global memory on the device for projections coordinates
+	cudaMalloc((void **)&d_pDetX, nDetXMap * sizeof(double));
+	cudaMalloc((void **)&d_pDetY, nDetYMap * sizeof(double));
+	cudaMalloc((void **)&d_pDetZ, nDetYMap * sizeof(double));
+	cudaMalloc((void **)&d_pObjX, nPixXMap * sizeof(double));
+	cudaMalloc((void **)&d_pObjY, nPixYMap * sizeof(double));
+	cudaMalloc((void **)&d_pObjZ, nSlices * sizeof(double));
+
+	cudaCheckErrors("cudaMalloc Coordinates");
+
+
+
+	// Pointer for mapped coordinates
+	double* d_pDetmY;
+	double* d_pDetmX;
+
+
+	// Allocate global memory on the device for mapped coordinates
+	cudaMalloc((void **)&d_pDetmY, nDetYMap * sizeof(double));
+	cudaMalloc((void **)&d_pDetmX, nDetYMap * nDetXMap * sizeof(double));
+
+	cudaCheckErrors("cudaMalloc Map-Coordinates");
+
+
+	// Pointer for rotated detector coords
+	double* d_pRdetY;
+	double* d_pRdetZ;
+
+	// Allocate global memory on the device for for rotated detector coords
+	cudaMalloc((void **)&d_pRdetY, nDetYMap * sizeof(double));
+	cudaMalloc((void **)&d_pRdetZ, nDetYMap * sizeof(double));
+
+	cudaCheckErrors("cudaMalloc Rot-Coordinates");
+	
+	// Generate detector and object boudaries
+	
+	threadsPerBlock.x = maxThreadsPerBlock;
+
+	blockSize.x = (nDetX / maxThreadsPerBlock) + 1;
+
+	map_boudaries_kernel << <blockSize, threadsPerBlock >> > (d_pDetX, nDetXMap, (double)nDetX, -du, 0.0);
+
+	blockSize.x = (nDetY / maxThreadsPerBlock) + 1;
+
+	map_boudaries_kernel << <blockSize, threadsPerBlock >> > (d_pDetY, nDetYMap, nDetY / 2.0, dv, 0.0);
+
+	blockSize.x = (nPixX / maxThreadsPerBlock) + 1;
+
+	map_boudaries_kernel << <blockSize, threadsPerBlock >> > (d_pObjX, nPixXMap, (double)nPixX, -dx, 0.0);
+
+	blockSize.x = (nPixY / maxThreadsPerBlock) + 1;
+
+	map_boudaries_kernel << <blockSize, threadsPerBlock >> > (d_pObjY, nPixYMap, nPixY / 2.0, dy, 0.0);
+
+	blockSize.x = (nSlices / maxThreadsPerBlock) + 1;
+
+	map_boudaries_kernel << <blockSize, threadsPerBlock >> > (d_pObjZ, nSlices, 0.0, dz, DAG + (dz / 2.0));
+
+	
+	//printf("Map Boundaries -- Threads:%d Blocks:%d \n", threads, blocks);
+
+	
+	// Initiate variables value with 0
+	cudaMemset(d_pDetZ, 0, nDetYMap * sizeof(double));
+	cudaMemset(d_pVolume, 0, nPixX * nPixY * nSlices * sizeof(double));
+
+	cudaCheckErrors("cudaMemset Zeros");
+	
+	//cudaDeviceSynchronize();
+
+	// X - ray tube initial position
+	double tubeX = 0;
+	double tubeY = 0;
+	double tubeZ = DSD;
+
+	// Iso - center position
+	double isoY = 0;
+	double isoZ = DDR;
+	
+
+	// Integration of 2D projection over the whole projections
+	// (S.1.Integration. - Liu et al(2017))
+
+	// Naive integration o the X coord
+	threadsPerBlock.x = 10;
+	threadsPerBlock.y = 10;
+	threadsPerBlock.z = 10;
+
+	blockSize.x = (unsigned int)ceil(((double)nDetYMap / (threadsPerBlock.x - 1)));
+	blockSize.y = 1;
+	blockSize.z = (unsigned int)ceil((double)nProj / threadsPerBlock.z);
+
+	for (int k = 0; k < ceil((double)nDetXMap / (threadsPerBlock.x - 1)); k++) {
+
+		img_integration_kernel << <blockSize, threadsPerBlock >> > (d_pProj, nDetXMap, nDetYMap, integrateXcoord, 0, k * 9, nProj);
+
+		//printf("integration kernel -- threadsX:%d blocksX:%d threadsY:%d blocksY:%d \n", threadsPerBlock.x, blockSize.x, threadsPerBlock.y, blockSize.y);
+
+	}
+
+	// Naive integration o the Y coord
+	threadsPerBlock.x = 10;
+	threadsPerBlock.y = 10;
+	threadsPerBlock.z = 10;
+
+	blockSize.x = 1;
+	blockSize.y = (unsigned int)ceil((double)nDetXMap / (threadsPerBlock.y - 1));
+	blockSize.z = (unsigned int)ceil((double)nProj / threadsPerBlock.z);
+
+
+	for (int k = 0; k < ceil((double)nDetYMap / (threadsPerBlock.y - 1)); k++) {
+
+		img_integration_kernel << <blockSize, threadsPerBlock >> > (d_pProj, nDetXMap, nDetYMap, integrateYcoord, k * 9, 0, nProj);
+
+		//printf("integration kernel -- threadsX:%d blocksX:%d threadsY:%d blocksY:%d \n", threadsPerBlock.x, blockSize.x, threadsPerBlock.y, blockSize.y);
+
+	}
+	
+	double* d_pDetmX_tmp = d_pDetmX + (nDetYMap * (nDetXMap-2));
+
+	// Test if we will loop over all projs or not
+	unsigned int projIni, projEnd, nProj2Run;
+	if(idXProj == -1){
+		projIni = 0;
+		projEnd = nProj;
+		nProj2Run = nProj;
+	}
+	else{
+		nProj2Run = 1;
+		projIni = (unsigned int) idXProj;
+		projEnd = (unsigned int) idXProj + 1;
+	}
+	
+	
+	// For each projection
+	for (unsigned int p = projIni; p < projEnd; p++) {
+	
+		// Get specif tube angle for the projection
+		double theta = h_pTubeAngle[p] * M_PI / 180.0;
+
+		// Get specif detector angle for the projection
+		double phi = h_pDetAngle[p] * M_PI / 180.0;
+
+		//printf("Tube angle:%f Det angle:%f\n", theta, phi);
+
+		// Tube rotation
+		double rtubeY = ((tubeY - isoY)*cos(theta) - (tubeZ - isoZ)*sin(theta)) + isoY;
+		double rtubeZ = ((tubeY - isoY)*sin(theta) + (tubeZ - isoZ)*cos(theta)) + isoZ;
+
+		//printf("R tube Y:%f R tube Z:%f\n", rtubeY, rtubeZ);
+
+		// Detector rotation
+		threadsPerBlock.x = maxThreadsPerBlock;
+		threadsPerBlock.y = 1;
+		threadsPerBlock.z = 1;
+
+		blockSize.x = (nDetYMap / maxThreadsPerBlock) + 1;
+		blockSize.y = 1;
+		blockSize.z = 1;
+
+		rot_detector_kernel << <blockSize, threadsPerBlock >> > (d_pRdetY, d_pRdetZ, d_pDetY, d_pDetZ, isoY, isoZ, phi, nDetYMap);
+
+		//cudaDeviceSynchronize();
+
+		//printf("Detector rotation -- Threads:%d Blocks:%d \n", threads, blocks);
+
+
+		// For each slice
+		for (unsigned int nz = 0; nz < nSlices; nz++) {
+			
+			/*
+
+			Map detector onto XY plane(Inside proj loop in case detector rotates)
+
+			*** Note: Matlab has linear indexing as a column of elements, i.e, the elements are actually stored in memory as queued columns.
+			So threads in X are launched in the column direction (DBT Y coord), and threads in Y are launched in the row direction (DBT X coord).
+
+			*/
+		
+			threadsPerBlock.x = 32;
+			threadsPerBlock.y = 32;
+			threadsPerBlock.z = 1;
+
+			blockSize.x = (nDetYMap / threadsPerBlock.x) + 1;
+			blockSize.y = (nDetXMap / threadsPerBlock.y) + 1;
+			blockSize.z = 1;
+
+
+			mapDet2Slice_kernel << <blockSize, threadsPerBlock >> > (d_pDetmX, d_pDetmY, tubeX, rtubeY, rtubeZ, d_pDetX, d_pRdetY, d_pRdetZ, d_pObjZ, nDetXMap, nDetYMap, nz);
+			//cudaDeviceSynchronize();
+
+			//printf("Map detector onto XY plane -- ThreadsX:%d ThreadsY:%d BlocksX:%d BlocksY:%d\n", threadsPerBlock.x, threadsPerBlock.y, blockSizeX, blockSizeY);
+
+			/*
+			S.2. Interpolation - Liu et al (2017)
+			*/
+
+			blockSize.x = (nPixYMap / threadsPerBlock.x) + 1;
+			blockSize.y = (nPixXMap / threadsPerBlock.y) + 1;
+			
+			bilinear_interpolation_kernel_GPU << <blockSize, threadsPerBlock >> > (d_sliceI, d_pProj, d_pObjX, d_pObjY, d_pDetmX_tmp, d_pDetmY, nPixXMap, nPixYMap, nDetXMap, nDetYMap, nDetX, nDetY, p);
+
+
+			/*
+			S.3. Differentiation - Eq. 24 - Liu et al (2017)
+			*/
+
+			blockSize.x = (nPixY / threadsPerBlock.x) + 1;
+			blockSize.y = (nPixX / threadsPerBlock.y) + 1;
+
+			differentiation_kernel << <blockSize, threadsPerBlock >> > (d_pVolume, d_sliceI, tubeX, rtubeY, rtubeZ, d_pObjX, d_pObjY, d_pObjZ, nPixX, nPixY, nPixXMap, nPixYMap, du, dv, dx, dy, dz, nz);
+
+		} // Loop end slices
+
+	} // Loop end Projections
+
+
+	// Normalize volume dividing by the number of projs
+	threadsPerBlock.x = 10;
+	threadsPerBlock.y = 10;
+	threadsPerBlock.z = 10;
+
+	blockSize.x = (nPixY / threadsPerBlock.x) + 1;
+	blockSize.y = (nPixX / threadsPerBlock.y) + 1;
+	blockSize.z = (nSlices / threadsPerBlock.z) + 1;
+
+	division_kernel << <blockSize, threadsPerBlock >> > (d_pVolume, nPixX, nPixY, nSlices, nProj2Run);
+
+	// d_pVolume
+	cudaMemcpy((void *)h_pVolume, (void *)d_pVolume, nSlices* nPixX * nPixY * sizeof(double), cudaMemcpyDeviceToHost);
+	cudaCheckErrors("cudaMemcpy Final");
+	
+
+	cudaFree(d_pProj);
+	cudaFree(d_sliceI);
+	cudaFree(d_pVolume);
+	cudaFree(d_pTubeAngle);
+	cudaFree(d_pDetAngle);
+	cudaFree(d_pDetX);
+	cudaFree(d_pDetY);
+	cudaFree(d_pDetZ);
+	cudaFree(d_pObjX);
+	cudaFree(d_pObjY);
+	cudaFree(d_pObjZ);
+	cudaFree(d_pDetmY);
+	cudaFree(d_pDetmX);
+	cudaFree(d_pRdetY);
+	cudaFree(d_pRdetZ);
+	
+	cudaCheckErrors("cudaFree Final");
+	
+	sdkStopTimer(&timer);
+	printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
+	sdkDeleteTimer(&timer);
+
+	cudaDeviceReset();
+
+	return;
+
+}
\ No newline at end of file
diff --git a/pydbt/sources/projectionDD/projectionDD.cpp b/pydbt/sources/projectionDD/projectionDD.cpp
index e5d6c34..477aebc 100644
--- a/pydbt/sources/projectionDD/projectionDD.cpp
+++ b/pydbt/sources/projectionDD/projectionDD.cpp
@@ -5,7 +5,7 @@
 % =========================================================================
 %{
 % -------------------------------------------------------------------------
-%                 projectionDD_lib(pVolume, pProj, ppGeo
+%                 projectionDD_lib(pVolume, pProj, pGeo, idXProj)
 % -------------------------------------------------------------------------
 %     DESCRIPTION:
 %     This function calculates the volume projection based on the
@@ -16,8 +16,9 @@
 %
 %     INPUT:
 %
-%     - pVolume = pointer to 3D volume for projection
-%     - ppGeo = Parameter of all pGeometry
+%     - pVolume = 3D volume for projection
+%     - pGeo = Parameter of all geometry
+%	  - idXProj = projection number to be projected
 %
 %     OUTPUT:
 %
@@ -54,7 +55,8 @@
 
 extern "C" void projectionDD_lib(double* const pVolume, 
 								 double* const pProj, 
-								 float* const pGeo){
+								 float* const pGeo,
+								 const signed int idXProj){
 
 
 	const int unsigned nPixX = (const int)pGeo[0]; 
@@ -78,8 +80,8 @@ extern "C" void projectionDD_lib(double* const pVolume,
 	const double tubeAngle = (const double)pGeo[16];
 	const double detAngle = (const double)pGeo[17];
 
-	const int x_offset = (const int)pGeo[18];
-	const int y_offset = (const int)pGeo[19];
+	const double x_offset = (const double)pGeo[18];
+	const double y_offset = (const double)pGeo[19];
 
 	double* const pTubeAngle = (double*)malloc(nProj * sizeof(double));
 	double* const pDetAngle = (double*)malloc(nProj * sizeof(double));
@@ -151,8 +153,20 @@ extern "C" void projectionDD_lib(double* const pVolume,
 			for (int v = 0; v < nDetY; v++)
 				pProjt[(p * nDetY * nDetX) + (u * nDetY) + v] = 0.0;
 
+	// Test if we will loop over all projs or not
+	unsigned int projIni, projEnd;
+	if(idXProj == -1){
+		projIni = 0;
+		projEnd = nProj;
+	}
+	else{
+		projIni = (unsigned int) idXProj;
+		projEnd = (unsigned int) idXProj + 1;
+	}
+	
+	
 	// For each projection
-	for (int p = 0; p < nProj; p++) {
+	for (unsigned int p = projIni; p < projEnd; p++) {
 
 		// Get specif tube angle for the projection
 		double theta = pTubeAngle[p] * M_PI / 180.0;
diff --git a/pydbt/sources/projectionDDb/projectionDDb.cpp b/pydbt/sources/projectionDDb/projectionDDb.cpp
index d252791..abd75f3 100644
--- a/pydbt/sources/projectionDDb/projectionDDb.cpp
+++ b/pydbt/sources/projectionDDb/projectionDDb.cpp
@@ -5,7 +5,7 @@
 % =========================================================================
 %{
 % -------------------------------------------------------------------------
-%                 projectionDDb_lib(pVolume, pProj, ppGeo)
+%                 projectionDDb_lib(pVolume, pProj, pGeo, idXProj)
 % -------------------------------------------------------------------------
 %     DESCRIPTION:
 %     This function calculates the volume projection based on the
@@ -15,8 +15,9 @@
 %
 %     INPUT:
 %
-%     - pVolume = pointer to 3D volume for projection
-%     - ppGeo = Parameter of all pGeometry
+%     - pVolume = 3D volume for projection
+%     - pGeo = Parameter of all geometry
+%	  - idXProj = projection number to be projected
 %
 %     OUTPUT:
 %
@@ -55,7 +56,8 @@
 
 extern "C" void projectionDDb_lib(double* const pVolume, 
 								  double* const pProj, 
-								  float* const pGeo){
+								  float* const pGeo,
+								  const signed int idXProj){
 
 
 	const int unsigned nPixX = (const int)pGeo[0]; 
@@ -82,15 +84,15 @@ extern "C" void projectionDDb_lib(double* const pVolume,
 	double* const pTubeAngle = (double*)malloc(nProj * sizeof(double));
 	double* const pDetAngle = (double*)malloc(nProj * sizeof(double));
 
-	const int x_offset = (const int)pGeo[18];
-	const int y_offset = (const int)pGeo[19];
+	const double x_offset = (const double)pGeo[18];
+	const double y_offset = (const double)pGeo[19];
 
 	linspace(-tubeAngle/2, tubeAngle/2, nProj, pTubeAngle);
 	linspace(-detAngle/2, detAngle/2, nProj, pDetAngle);
 
 	// printf("Nx:%d Ny:%d Nz:%d \nNu:%d Nv:%d \nDx:%.2f Dy:%.2f Dz:%.2f \nDu:%.2f Dv:%.2f \nDSD:%.2f DDR:%.2f \nTube angle:%.2f \nDet angle:%.2f", nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, tubeAngle, detAngle);
 
-	projectionDDb(pProj, pVolume, pTubeAngle, pDetAngle, x_offset, y_offset, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, DAG);
+	projectionDDb(pProj, pVolume, pTubeAngle, pDetAngle, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, idXProj, x_offset, y_offset, dx, dy, dz, du, dv, DSD, DDR, DAG);
 
 	free(pTubeAngle);
 	free(pDetAngle);
@@ -105,14 +107,15 @@ void projectionDDb(double* const pProj,
 	double* const pVolume,
 	double* const pTubeAngle,
 	double* const pDetAngle,
-	const  int x_offset,
-	const  int y_offset,
-	const  int nProj,
-	const  int nPixX,
-	const  int nPixY,
-	const  int nSlices,
-	const  int nDetX,
-	const  int nDetY,
+	const unsigned int nProj,
+	const unsigned int nPixX,
+	const unsigned int nPixY,
+	const unsigned int nSlices,
+	const unsigned int nDetX,
+	const unsigned int nDetY,
+	const signed int idXProj,
+	const double x_offset,
+	const double y_offset,
 	const double dx,
 	const double dy,
 	const double dz,
@@ -247,8 +250,20 @@ void projectionDDb(double* const pProj,
 				pVolumet[(nz*nPixYMap *nPixXMap) + (x*nPixYMap) + y] += pVolumet[(nz*nPixYMap *nPixXMap) + ((x - 1)*nPixYMap) + y];	
 
 
+	// Test if we will loop over all projs or not
+	unsigned int projIni, projEnd;
+	if(idXProj == -1){
+		projIni = 0;
+		projEnd = nProj;
+	}
+	else{
+		projIni = (unsigned int) idXProj;
+		projEnd = (unsigned int) idXProj + 1;
+	}
+	
+	
 	// For each projection
-	for (int p = 0; p < nProj; p++) {
+	for (unsigned int p = projIni; p < projEnd; p++) {
 		
 		// Get specif tube angle for the projection
 		double theta = pTubeAngle[p] * M_PI / 180.0;
diff --git a/pydbt/sources/projectionDDb/projectionDDb.hpp b/pydbt/sources/projectionDDb/projectionDDb.hpp
index 07544a4..aad19c5 100644
--- a/pydbt/sources/projectionDDb/projectionDDb.hpp
+++ b/pydbt/sources/projectionDDb/projectionDDb.hpp
@@ -38,18 +38,19 @@
 // Global variable
 int nThreads;
 
-//extern "C" void projectionDDb_lib(double* const pVolume,double* const pProj,double* const geo);
-
 void projectionDDb(double* const pProj,
 	double* const pVolume,
 	double* const pTubeAngle,
 	double* const pDetAngle,
-	const  int nProj,
-	const  int nPixX,
-	const  int nPixY,
-	const  int nSlices,
-	const  int nDetX,
-	const  int nDetY,
+	const unsigned int nProj,
+	const unsigned int nPixX,
+	const unsigned int nPixY,
+	const unsigned int nSlices,
+	const unsigned int nDetX,
+	const unsigned int nDetY,
+	const signed int idXProj,
+	const double x_offset,
+	const double y_offset,
 	const double dx,
 	const double dy,
 	const double dz,
diff --git a/pydbt/sources/projectionDDb_cuda/kernel.cu b/pydbt/sources/projectionDDb_cuda/kernel.cu
new file mode 100644
index 0000000..3056286
--- /dev/null
+++ b/pydbt/sources/projectionDDb_cuda/kernel.cu
@@ -0,0 +1,723 @@
+/*
+%% Author: Rodrigo de Barros Vimieiro
+% Date: Feb, 2022
+% rodrigo.vimieiro@gmail.com
+% =========================================================================
+%{
+% -------------------------------------------------------------------------
+%                 projectionDDb_cuda_lib(pVolume, pProj, ppGeo)
+% -------------------------------------------------------------------------
+%     DESCRIPTION:
+%     This function calculates the volume projection based on the
+%     Branchless Distance-Driven principle.
+%     The geometry is for DBT with half cone-beam. All parameters are set
+%     in "ParameterSettings" code.
+%
+%     INPUT:
+%
+%     - data3d = 3D volume for projection
+%     - param = Parameter of all geometry
+%	  - nProj = projection number to be projected
+%
+%     OUTPUT:
+%
+%     - proj = projections for each angle.
+%
+%     Reference:
+%     - Branchless Distance Driven Projection and Backprojection,
+%     Samit Basu and Bruno De Man (2006)
+%     - GPU Acceleration of Branchless Distance Driven Projection and
+%     Backprojection, Liu et al (2016)
+%     - GPU-Based Branchless Distance-Driven Projection and Backprojection,
+%     Liu et al (2017)
+%     - A GPU Implementation of Distance-Driven Computed Tomography,
+%     Ryan D. Wagner (2017)
+%     ---------------------------------------------------------------------
+%     Copyright (C) <2019>  <Rodrigo de Barros Vimieiro>
+%
+%     This program is free software: you can redistribute it and/or modify
+%     it under the terms of the GNU General Public License as published by
+%     the Free Software Foundation, either version 3 of the License, or
+%     (at your option) any later version.
+%
+%     This program is distributed in the hope that it will be useful,
+%     but WITHOUT ANY WARRANTY; without even the implied warranty of
+%     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+%     GNU General Public License for more details.
+%
+%     You should have received a copy of the GNU General Public License
+%     along with this program.  If not, see <http://www.gnu.org/licenses/>.
+%}
+% =========================================================================
+%% 3-D Projection Branchless Distance Driven Code
+*/
+
+#include "projectionDDb_cuda.hpp"
+
+/****************************************************************************
+*							CUDA Kernels									*
+****************************************************************************/
+__global__ void pad_volume_kernel(double* d_img, const int nPixXMap, const int nPixYMap, const int nElem, unsigned int nz) {
+
+	const int threadGId = blockIdx.x * blockDim.x + threadIdx.x;
+
+	// Make sure we don't try and access memory outside
+	// by having any threads mapped there return early
+	if (threadGId >= nElem)
+		return;
+	
+	d_img[(nz*nPixYMap *nPixXMap) + (threadGId*nPixYMap)] = 0;
+
+	return;
+}
+
+__global__ void map_boudaries_kernel(double* d_pBound, const int nElem, const double valueLeftBound, const double sizeElem, const double offset) {
+
+	const int threadGId = blockIdx.x * blockDim.x + threadIdx.x;
+
+	// Make sure we don't try and access memory outside
+	// by having any threads mapped there return early
+	if (threadGId >= nElem)
+		return;
+
+	d_pBound[threadGId] = (threadGId - valueLeftBound) * sizeElem + offset;
+
+	return;
+}
+
+__global__ void rot_detector_kernel(double* d_pRdetY, double* d_pRdetZ, double* d_pYcoord, double* d_pZcoord, const double yOffset, const double zOffset, 
+									const double phi, const int nElem) {
+
+	const int threadGId = blockIdx.x * blockDim.x + threadIdx.x;
+
+	// Make sure we don't try and access memory outside
+	// by having any threads mapped there return early
+	if (threadGId >= nElem)
+		return;
+
+	// cos and sin are in measured in radians.
+
+	d_pRdetY[threadGId] = ((d_pYcoord[threadGId] - yOffset)* cos(phi) - (d_pZcoord[threadGId] - zOffset)* sin(phi)) + yOffset;
+	d_pRdetZ[threadGId] = ((d_pYcoord[threadGId] - yOffset)* sin(phi) + (d_pZcoord[threadGId] - zOffset)* cos(phi)) + zOffset;
+
+	return;
+
+}
+
+__global__ void mapDet2Slice_kernel(double* const pXmapp, double* const pYmapp, double tubeX, double tubeY, double tubeZ, double* const pXcoord, 
+									double* const pYcoord, double* const pZcoord, double* const pZSlicecoord, const int nDetXMap, const int nDetYMap, const unsigned int nz) {
+
+	/*
+
+	Note: Matlab has linear indexing as a column of elements, i.e, the elements are actually stored in memory as queued columns. 
+	So threads in X are launched in the column direction (DBT Y coord), and threads in Y are launched in the row direction (DBT X coord).
+
+	*/
+
+	const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
+									     blockIdx.y * blockDim.y + threadIdx.y);
+
+	const int thread_1D_pos = thread_2D_pos.y * nDetYMap + thread_2D_pos.x;
+
+	// Make sure we don't try and access memory outside the detector
+	// by having any threads mapped there return early
+	if (thread_2D_pos.x >= nDetYMap || thread_2D_pos.y >= nDetXMap)
+		return;
+
+	pXmapp[thread_1D_pos] = ((pXcoord[thread_2D_pos.y] - tubeX)*(pZSlicecoord[nz] - pZcoord[thread_2D_pos.x]) - (pXcoord[thread_2D_pos.y]*tubeZ) + (pXcoord[thread_2D_pos.y]*pZcoord[thread_2D_pos.x]))/(-tubeZ + pZcoord[thread_2D_pos.x]);
+
+	if (thread_2D_pos.y == 0)
+		pYmapp[thread_2D_pos.x] = ((pYcoord[thread_2D_pos.x] - tubeY)*(pZSlicecoord[nz] - pZcoord[thread_2D_pos.x]) - (pYcoord[thread_2D_pos.x]*tubeZ) + (pYcoord[thread_2D_pos.x]*pZcoord[thread_2D_pos.x]))/(-tubeZ + pZcoord[thread_2D_pos.x]);
+
+	return;
+}
+
+__global__ void img_integration_kernel(double* d_img, const int nPixX, const int nPixY, bool direction, unsigned int offsetX, unsigned int offsetY, unsigned int nSlices) {
+
+	/*
+
+	Integration of 2D slices over the whole volume
+	
+	(S.1.Integration. - Liu et al(2017))
+
+	** Perfom an inclusive scan **
+
+	*/
+
+	const int3 memory_2D_pos = make_int3(blockIdx.x * blockDim.x + threadIdx.x + offsetX,
+										 blockIdx.y * blockDim.y + threadIdx.y + offsetY,
+										 blockIdx.z * blockDim.z + threadIdx.z);
+
+	const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
+										 blockIdx.y * blockDim.y + threadIdx.y);
+
+
+	// Make sure we don't try and access memory outside the detector
+	// by having any threads mapped there return early
+	if (memory_2D_pos.x >= nPixY || memory_2D_pos.y >= nPixX || memory_2D_pos.z >= nSlices)
+		return;
+
+
+	if (direction == integrateXcoord){
+
+		for (int s = 1; s <= blockDim.y; s *= 2) {
+
+			int spot = thread_2D_pos.y - s;
+
+			double val = 0;
+
+			if (spot >= 0) {
+				val = d_img[(memory_2D_pos.z*nPixY*nPixX) + (offsetY + spot) * nPixY + memory_2D_pos.x];
+			}
+			__syncthreads();
+
+			if (spot >= 0) {
+				d_img[(memory_2D_pos.z*nPixY*nPixX) + (memory_2D_pos.y * nPixY) + memory_2D_pos.x] += val;
+			}
+			__syncthreads();
+		}
+	}
+	else
+	{
+
+		for (int s = 1; s <= blockDim.x; s *= 2) {
+
+			int spot = thread_2D_pos.x - s;
+
+			double val = 0;
+
+			if (spot >= 0) {
+				val = d_img[(memory_2D_pos.z*nPixY*nPixX) + memory_2D_pos.y * nPixY + spot + offsetX];
+			}
+			__syncthreads();
+
+			if (spot >= 0) {
+				d_img[(memory_2D_pos.z*nPixY*nPixX) + (memory_2D_pos.y * nPixY) + memory_2D_pos.x] += val;
+			}
+			__syncthreads();
+		}
+
+	}
+	return;
+}
+
+
+__global__ void bilinear_interpolation_kernel_GPU(double* d_projI, double* d_pVolume, double* d_pDetmX, double* d_pDetmY, const int nDetXMap, const int nDetYMap,
+												  const int nPixXMap, const int nPixYMap, const int nPixX, const int nPixY, const double pixelSize, const unsigned int nz) {
+
+	const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
+										 blockIdx.y * blockDim.y + threadIdx.y);
+
+	// Make sure we don't try and access memory outside the detector
+	// by having any threads mapped there return early
+	if (thread_2D_pos.x >= nDetYMap || thread_2D_pos.y >= nDetXMap)
+		return;
+
+	/*
+
+	S.2. Interpolation - Liu et al (2017)
+
+	Reference:
+	- https://en.wikipedia.org/wiki/Bilinear_interpolation
+	- https://stackoverflow.com/questions/21128731/bilinear-interpolation-in-c-c-and-cuda
+
+	Note: *** We are using the Unit Square equation ***
+
+	alpha = X - X1
+	1-alpha = X2 - X
+
+	beta = Y - Y1
+	1-beta = Y2 - Y
+
+	----> Xcoord (thread_2D_pos.y)
+	|							0___________
+	v							|_d00_|_d10_|
+	Ycoord (thread_2D_pos.x)	|_d01_|_d11_|
+
+	*/
+
+	// Adjust the mapped coordinates to cross the range of (0-nPixX).*dx 
+	//  Divide by pixelSize to get a unitary pixel size
+	const double xNormData = nPixX - d_pDetmX[thread_2D_pos.y * nDetYMap + thread_2D_pos.x] / pixelSize;
+	const signed int    xData = floor(xNormData);
+	const double  alpha = xNormData - xData;
+
+	// Adjust the mapped coordinates to cross the range of (0-nPixY).*dy  
+	//  Divide by pixelSize to get a unitary pixel size
+	const double yNormData = (nPixY / 2.0) + (d_pDetmY[thread_2D_pos.x] / pixelSize);
+	const signed int    yData = floor(yNormData);
+	const double  beta = yNormData - yData;
+
+	double d00, d01, d10, d11;
+	if (((xNormData) >= 0) && ((xNormData) <= nPixX) && ((yNormData) >= 0) && ((yNormData) <= nPixY))	d00 = d_pVolume[(nz*nPixYMap*nPixXMap) + (xData*nPixYMap + yData)];						else    d00 = 0.0;
+	if (((xData + 1) > 0) && ((xData + 1) <= nPixX) && ((yNormData) >= 0) && ((yNormData) <= nPixY))	d10 = d_pVolume[(nz*nPixYMap*nPixXMap) + ((xData + 1)*nPixYMap + yData)]; 				else    d10 = 0.0;
+	if (((xNormData) >= 0) && ((xNormData) <= nPixX) && ((yData + 1) > 0) && ((yData + 1) <= nPixY))	d01 = d_pVolume[(nz*nPixYMap*nPixXMap) + (xData*nPixYMap + yData + 1)]; 				else    d01 = 0.0;
+	if (((xData + 1) > 0) && ((xData + 1) <= nPixX) && ((yData + 1) > 0) && ((yData + 1) <= nPixY))		d11 = d_pVolume[(nz*nPixYMap*nPixXMap) + ((xData + 1)*nPixYMap + yData + 1)];			else    d11 = 0.0;
+
+	double result_temp1 = alpha * d10 + (-d00 * alpha + d00);
+	double result_temp2 = alpha * d11 + (-d01 * alpha + d01);
+
+	d_projI[thread_2D_pos.y * nDetYMap + thread_2D_pos.x] = beta * result_temp2 + (-result_temp1 * beta + result_temp1);
+
+}
+
+__global__ void differentiation_kernel(double* d_pProj, double* d_projI, double* const d_pDetmX, double* const d_pDetmY, double tubeX, double rtubeY, double rtubeZ, 
+									   double* const d_pDetX, double* const d_pRdetY, double* const d_pRdetZ, const int nDetX, const int nDetY, const int nDetXMap, 
+									   const int nDetYMap, const double du, const double dv, const double dx, const double dy,  const double dz, const unsigned int p) {
+
+	const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
+										 blockIdx.y * blockDim.y + threadIdx.y);
+
+	const int thread_1D_pos = (nDetX*nDetY*p) + (thread_2D_pos.y * nDetY) + thread_2D_pos.x;
+
+	// Make sure we don't try and access memory outside the detector
+	// by having any threads mapped there return early
+	if (thread_2D_pos.x >= nDetY || thread_2D_pos.y >= nDetX)
+		return;
+
+	/*
+
+	S.3. Differentiation - Eq. 24 - Liu et al (2017)
+	
+	Detector integral projection
+	 ___________
+	|_A_|_B_|___|
+	|_C_|_D_|___|
+	|___|___|___| 
+
+
+	(thread_2D_pos.x,thread_2D_pos.y)
+	________________
+	|_A_|__B__|_____|	
+	|_C_|(0,0)|(0,1)|
+	|___|(1,0)|(1,1)|
+
+	Threads are lauched from D up to nDetX (thread_2D_pos.y) and nDetY (thread_2D_pos.x)
+	i.e., they are running on the detector image. Thread (0,0) is on D.
+
+	Coordinates on intergal projection:
+
+	A = thread_2D_pos.y * nDetYMap + thread_2D_pos.x
+	B = ((thread_2D_pos.y+1) * nDetYMap) + thread_2D_pos.x
+	C = thread_2D_pos.y * nDetYMap + thread_2D_pos.x + 1
+	D = ((thread_2D_pos.y+1) * nDetYMap) + thread_2D_pos.x + 1
+	
+	*/
+
+	unsigned int coordA = thread_2D_pos.y * nDetYMap + thread_2D_pos.x;
+	unsigned int coordB = ((thread_2D_pos.y + 1) * nDetYMap) + thread_2D_pos.x;
+	unsigned int coordC = coordA + 1;
+	unsigned int coordD = coordB + 1;
+
+
+	// Detector X coord overlap calculation
+	double detMapSizeX = d_pDetmX[coordA] - d_pDetmX[coordB];
+
+	// Detector Y coord overlap calculation
+	double detMapSizeY = d_pDetmY[thread_2D_pos.x+1] - d_pDetmY[thread_2D_pos.x];
+
+	// x - ray angle in X coord
+	double gamma = atan((d_pDetX[thread_2D_pos.y] + (du / 2.0) - tubeX) / rtubeZ);
+
+	// x - ray angle in Y coord
+	double alpha = atan((d_pRdetY[thread_2D_pos.x] + (dv / 2.0) - rtubeY) / rtubeZ);
+
+
+	double dA, dB, dC, dD;
+
+	dA = d_projI[coordA];
+	dB = d_projI[coordB];
+	dC = d_projI[coordC];
+	dD = d_projI[coordD];
+
+	// Treat border of interpolated integral detector
+	if (dC == 0 && dD == 0) {
+		dC = dA;
+		dD = dB;
+	}
+
+
+	// S.3.Differentiation - Eq. 24 - Liu et al(2017)
+	d_pProj[thread_1D_pos] += ((dD - dC - dB + dA)*(dx*dy*dz / (cos(alpha)*cos(gamma)*detMapSizeX*detMapSizeY)));
+
+	return;
+}
+
+
+
+/****************************************************************************
+* function: projectionDDb() - CUDA projection Branchless Distance Driven.	*
+****************************************************************************/
+
+void projectionDDb(double* const h_pProj,
+	double* const h_pVolume,
+	double* const h_pTubeAngle,
+	double* const h_pDetAngle,
+	const unsigned int nProj,
+	const unsigned int nPixX,
+	const unsigned int nPixY,
+	const unsigned int nSlices,
+	const unsigned int nDetX,
+	const unsigned int nDetY,
+	const signed int idXProj,
+	const double x_offset,
+	const double y_offset,
+	const double dx,
+	const double dy,
+	const double dz,
+	const double du,
+	const double dv,
+	const double DSD,
+	const double DDR,
+	const double DAG)
+{
+
+	// Unique GPU
+	int devID = 0;
+
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, devID);
+	cudaCheckErrors("cudaGetDeviceProperties");
+
+	const unsigned int maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
+
+
+	printf("GPU Device %d: \"%s\" with compute capability %d.%d has %d Multi-Processors and %zu bytes of global memory\n\n", devID,
+		deviceProp.name, deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount, deviceProp.totalGlobalMem);
+
+
+	// Create timmer variables and start it
+	StopWatchInterface *timer = NULL;
+	sdkCreateTimer(&timer);
+	sdkStartTimer(&timer);
+
+	//cudaStream_t stream1;
+	//cudaStreamCreate(&stream1);
+
+	dim3 threadsPerBlock(1, 1, 1);
+	dim3 blockSize(1, 1, 1);
+
+
+	// Number of mapped detector and pixels
+	const int nDetXMap = nDetX + 1;
+	const int nDetYMap = nDetY + 1;
+	const int nPixXMap = nPixX + 1;
+	const int nPixYMap = nPixY + 1;
+
+	
+	// Convention: h_ variables live on host
+	// Convention: d_ variables live on device (GPU global mem)
+	double* d_pProj;
+	double* d_projI;
+	double* d_pVolume;
+	double* d_pTubeAngle;
+	double* d_pDetAngle;
+
+
+	// Allocate global memory on the device, place result in "d_----"
+	cudaMalloc((void **)&d_pProj, nDetX*nDetY*nProj * sizeof(double));
+	cudaMalloc((void **)&d_projI, nDetXMap*nDetYMap * sizeof(double));
+	cudaMalloc((void **)&d_pVolume, nPixXMap*nPixYMap*nSlices * sizeof(double));
+	cudaMalloc((void **)&d_pTubeAngle, nProj * sizeof(double));
+	cudaMalloc((void **)&d_pDetAngle, nProj * sizeof(double));
+
+	cudaCheckErrors("cudaMalloc Initial");
+
+	// Copy data from host memory "h_----" to device memory "d_----"
+	cudaMemcpy((void *)d_pTubeAngle, (void *)h_pTubeAngle, nProj * sizeof(double), cudaMemcpyHostToDevice);
+	cudaMemcpy((void *)d_pDetAngle, (void *)h_pDetAngle, nProj * sizeof(double), cudaMemcpyHostToDevice);
+
+	cudaCheckErrors("cudaMemcpy Angles");
+
+
+	/*
+		Copy volume data from host memory "h_----" to device memory "d_----" padding with zeros for image integation
+	*/
+
+
+	// Initialize first column and row with zeros
+	double* h_pVolume_tmp;
+	double* d_pVolume_tmp;
+
+	threadsPerBlock.x = maxThreadsPerBlock;
+	blockSize.x = (nPixXMap / maxThreadsPerBlock) + 1;
+
+	for (int nz = 0; nz < nSlices; nz++) {
+
+		// Pad on X coord direction
+		pad_volume_kernel << <blockSize, threadsPerBlock >> > (d_pVolume, nPixXMap, nPixYMap, nPixXMap, nz);
+
+		// Pad on Y coord direction
+		d_pVolume_tmp = d_pVolume + (nPixXMap*nPixYMap*nz) + 1;
+		cudaMemset(d_pVolume_tmp, 0, nPixY * sizeof(double));
+		cudaCheckErrors("cudaMemset Padding Volume");
+	}
+
+
+	// Copy volume data from host memory
+	for (unsigned int nz = 0; nz < nSlices; nz++)
+		for (unsigned int c = 0; c < nPixX; c++) {
+
+			h_pVolume_tmp = h_pVolume + (c *nPixY) + (nPixX*nPixY*nz);
+			d_pVolume_tmp = d_pVolume + (((c+1) *nPixYMap) + 1) + (nPixXMap*nPixYMap*nz);
+
+			cudaMemcpy((void *)d_pVolume_tmp, (void *)h_pVolume_tmp, nPixY * sizeof(double), cudaMemcpyHostToDevice);
+			cudaCheckErrors("cudaMemset Volume");
+		}
+
+	
+
+	// Pointer for projections coordinates
+	double* d_pDetX;
+	double* d_pDetY;
+	double* d_pDetZ;
+	double* d_pObjX;
+	double* d_pObjY;
+	double* d_pObjZ;
+
+	// Allocate global memory on the device for projections coordinates
+	cudaMalloc((void **)&d_pDetX, nDetXMap * sizeof(double));
+	cudaMalloc((void **)&d_pDetY, nDetYMap * sizeof(double));
+	cudaMalloc((void **)&d_pDetZ, nDetYMap * sizeof(double));
+	cudaMalloc((void **)&d_pObjX, nPixXMap * sizeof(double));
+	cudaMalloc((void **)&d_pObjY, nPixYMap * sizeof(double));
+	cudaMalloc((void **)&d_pObjZ, nSlices * sizeof(double));
+
+	cudaCheckErrors("cudaMalloc Coordinates");
+
+
+
+	// Pointer for mapped coordinates
+	double* d_pDetmY;
+	double* d_pDetmX;
+
+
+	// Allocate global memory on the device for mapped coordinates
+	cudaMalloc((void **)&d_pDetmY, nDetYMap * sizeof(double));
+	cudaMalloc((void **)&d_pDetmX, nDetYMap * nDetXMap * sizeof(double));
+
+	cudaCheckErrors("cudaMalloc Map-Coordinates");
+
+
+	// Pointer for rotated detector coords
+	double* d_pRdetY;
+	double* d_pRdetZ;
+
+	// Allocate global memory on the device for for rotated detector coords
+	cudaMalloc((void **)&d_pRdetY, nDetYMap * sizeof(double));
+	cudaMalloc((void **)&d_pRdetZ, nDetYMap * sizeof(double));
+
+	cudaCheckErrors("cudaMalloc Rot-Coordinates");
+
+	// Generate detector and object boudaries
+
+	threadsPerBlock.x = maxThreadsPerBlock;
+
+	blockSize.x = (nDetX / maxThreadsPerBlock) + 1;
+
+	map_boudaries_kernel << <blockSize, threadsPerBlock >> > (d_pDetX, nDetXMap, (double)nDetX, -du, 0.0);
+
+	blockSize.x = (nDetY / maxThreadsPerBlock) + 1;
+
+	map_boudaries_kernel << <blockSize, threadsPerBlock >> > (d_pDetY, nDetYMap, nDetY / 2.0, dv, 0.0);
+
+	blockSize.x = (nPixX / maxThreadsPerBlock) + 1;
+
+	map_boudaries_kernel << <blockSize, threadsPerBlock >> > (d_pObjX, nPixXMap, (double)nPixX, -dx, x_offset);
+
+	blockSize.x = (nPixY / maxThreadsPerBlock) + 1;
+
+	map_boudaries_kernel << <blockSize, threadsPerBlock >> > (d_pObjY, nPixYMap, nPixY / 2.0, dy, y_offset);
+
+	blockSize.x = (nSlices / maxThreadsPerBlock) + 1;
+
+	map_boudaries_kernel << <blockSize, threadsPerBlock >> > (d_pObjZ, nSlices, 0.0, dz, DAG + (dz / 2.0));
+
+
+	//printf("Map Boundaries -- Threads:%d Blocks:%d \n", threads, blocks);
+
+
+	// Initiate variables value with 0
+	cudaMemset(d_pDetZ, 0, nDetYMap * sizeof(double));
+	cudaMemset(d_pProj, 0, nDetX * nDetY * nProj * sizeof(double));
+
+	cudaCheckErrors("cudaMemset Zeros");
+
+	//cudaDeviceSynchronize();
+
+	// X - ray tube initial position
+	double tubeX = 0;
+	double tubeY = 0;
+	double tubeZ = DSD;
+
+	// Iso - center position
+	double isoY = 0;
+	double isoZ = DDR;
+
+
+	// Integration of 2D slices over the whole volume
+	// (S.1.Integration. - Liu et al(2017))
+		
+	// Naive integration o the X coord
+	threadsPerBlock.x = 10;
+	threadsPerBlock.y = 10;
+	threadsPerBlock.z = 10;
+
+	blockSize.x = (unsigned int) ceil(((double)nPixYMap / (threadsPerBlock.x-1)));
+	blockSize.y = 1;
+	blockSize.z = (unsigned int) ceil((double)nSlices / threadsPerBlock.z);
+
+	//printf("Divisao: %f \n", (double)nPixY / (threadsPerBlock.x - 1));
+	//printf("Divisao ceil: %d \n", (unsigned int)ceil((double)nPixY / (threadsPerBlock.x - 1)));
+
+	for (int k = 0; k < ceil((double)nPixXMap / (threadsPerBlock.x-1)); k++) {
+
+		img_integration_kernel << <blockSize, threadsPerBlock >> > (d_pVolume, nPixXMap, nPixYMap, integrateXcoord, 0, k * 9, nSlices);
+
+		//printf("integration kernel -- threadsX:%d blocksX:%d threadsY:%d blocksY:%d \n", threadsPerBlock.x, blockSize.x, threadsPerBlock.y, blockSize.y);
+
+	}
+
+	// Naive integration o the Y coord
+	threadsPerBlock.x = 10;
+	threadsPerBlock.y = 10;
+	threadsPerBlock.z = 10;
+
+	blockSize.x = 1;
+	blockSize.y = (unsigned int) ceil((double)nPixXMap / (threadsPerBlock.y-1));
+	blockSize.z = (unsigned int) ceil((double)nSlices / threadsPerBlock.z);
+
+
+	for (int k = 0; k < ceil((double)nPixYMap / (threadsPerBlock.y-1)); k++) {
+
+		img_integration_kernel << <blockSize, threadsPerBlock >> > (d_pVolume, nPixXMap, nPixYMap, integrateYcoord, k * 9, 0, nSlices);
+
+		//printf("integration kernel -- threadsX:%d blocksX:%d threadsY:%d blocksY:%d \n", threadsPerBlock.x, blockSize.x, threadsPerBlock.y, blockSize.y);
+
+	}
+
+	// Test if we will loop over all projs or not
+	unsigned int projIni, projEnd;
+	if(idXProj == -1){
+		projIni = 0;
+		projEnd = nProj;
+	}
+	else{
+		projIni = (unsigned int) idXProj;
+		projEnd = (unsigned int) idXProj + 1;
+	}
+
+	// For each projection
+	for (unsigned int p = projIni; p < projEnd; p++) {
+		
+		// Get specif tube angle for the projection
+		double theta = h_pTubeAngle[p] * M_PI / 180.0;
+
+		// Get specif detector angle for the projection
+		double phi = h_pDetAngle[p] * M_PI / 180.0;
+
+		//printf("Tube angle:%f Det angle:%f\n", theta, phi);
+
+		// Tube rotation
+		double rtubeY = ((tubeY - isoY)*cos(theta) - (tubeZ - isoZ)*sin(theta)) + isoY;
+		double rtubeZ = ((tubeY - isoY)*sin(theta) + (tubeZ - isoZ)*cos(theta)) + isoZ;
+
+		//printf("R tube Y:%f R tube Z:%f\n", rtubeY, rtubeZ);
+
+		// Detector rotation
+		threadsPerBlock.x = maxThreadsPerBlock;
+		threadsPerBlock.y = 1;
+		threadsPerBlock.z = 1;
+
+		blockSize.x = (nDetYMap / maxThreadsPerBlock) + 1;
+		blockSize.y = 1;
+		blockSize.z = 1;
+
+		rot_detector_kernel << <blockSize, threadsPerBlock >> > (d_pRdetY, d_pRdetZ, d_pDetY, d_pDetZ, isoY, isoZ, phi, nDetYMap);
+
+		//cudaDeviceSynchronize();
+
+		//printf("Detector rotation -- Threads:%d Blocks:%d \n", threads, blocks);
+
+
+		// For each slice
+		for (unsigned int nz = 0; nz < nSlices; nz++) {
+
+			/*
+
+			Map detector onto XY plane(Inside proj loop in case detector rotates)
+
+			*** Note: Matlab has linear indexing as a column of elements, i.e, the elements are actually stored in memory as queued columns.
+			So threads in X are launched in the column direction (DBT Y coord), and threads in Y are launched in the row direction (DBT X coord).
+
+			*/
+
+			threadsPerBlock.x = 32;
+			threadsPerBlock.y = 32;
+			threadsPerBlock.z = 1;
+
+			blockSize.x = (nDetYMap / threadsPerBlock.x) + 1;
+			blockSize.y = (nDetXMap / threadsPerBlock.y) + 1;
+			blockSize.z = 1;
+
+
+			mapDet2Slice_kernel << <blockSize, threadsPerBlock >> > (d_pDetmX, d_pDetmY, tubeX, rtubeY, rtubeZ, d_pDetX, d_pRdetY, d_pRdetZ, d_pObjZ, nDetXMap, nDetYMap, nz);
+			//cudaDeviceSynchronize();
+
+			//printf("Map detector onto XY plane -- ThreadsX:%d ThreadsY:%d BlocksX:%d BlocksY:%d\n", threadsPerBlock.x, threadsPerBlock.y, blockSizeX, blockSizeY);
+
+			/*
+				S.2. Interpolation - Liu et al (2017)
+			*/
+
+			blockSize.x = (nDetYMap / threadsPerBlock.x) + 1;
+			blockSize.y = (nDetXMap / threadsPerBlock.y) + 1;
+
+			bilinear_interpolation_kernel_GPU << <blockSize, threadsPerBlock >> > (d_projI, d_pVolume, d_pDetmX, d_pDetmY, nDetXMap, nDetYMap, nPixXMap, nPixYMap, nPixX, nPixY, dx, nz);
+
+
+			/*
+				S.3. Differentiation - Eq. 24 - Liu et al (2017)
+			*/
+
+			blockSize.x = (nDetY / threadsPerBlock.x) + 1;
+			blockSize.y = (nDetX / threadsPerBlock.y) + 1;
+
+			differentiation_kernel << <blockSize, threadsPerBlock >> > (d_pProj, d_projI, d_pDetmX, d_pDetmY, tubeX, rtubeY, rtubeZ, d_pDetX, d_pRdetY, d_pRdetZ, nDetX, nDetY, nDetXMap, nDetYMap, du, dv, dx, dy, dz, p);
+
+		} // Loop end slices
+
+	} // Loop end Projections
+
+	// d_pProj
+	cudaMemcpy((void *)h_pProj, (void *)d_pProj, nProj*	nDetX * nDetY * sizeof(double), cudaMemcpyDeviceToHost);
+	cudaCheckErrors("cudaMemcpy Final");
+
+
+	cudaFree(d_pProj);
+	cudaFree(d_projI);
+	cudaFree(d_pVolume);
+	cudaFree(d_pTubeAngle);
+	cudaFree(d_pDetAngle);
+	cudaFree(d_pDetX);
+	cudaFree(d_pDetY);
+	cudaFree(d_pDetZ);
+	cudaFree(d_pObjX);
+	cudaFree(d_pObjY);
+	cudaFree(d_pObjZ);
+	cudaFree(d_pDetmY);
+	cudaFree(d_pDetmX);
+	cudaFree(d_pRdetY);
+	cudaFree(d_pRdetZ);
+
+	cudaCheckErrors("cudaFree Final");
+
+	sdkStopTimer(&timer);
+	printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
+	sdkDeleteTimer(&timer);
+
+	cudaDeviceReset();
+
+	return;
+
+}
\ No newline at end of file
diff --git a/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.cpp b/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.cpp
new file mode 100644
index 0000000..1266531
--- /dev/null
+++ b/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.cpp
@@ -0,0 +1,131 @@
+/*
+%% Author: Rodrigo de Barros Vimieiro
+% Date: Feb, 2022
+% rodrigo.vimieiro@gmail.com
+% =========================================================================
+%{
+% -------------------------------------------------------------------------
+%                 projectionDDb_cuda_lib(pVolume, pProj, pGeo, idXProj)
+% -------------------------------------------------------------------------
+%     DESCRIPTION:
+%     This function calculates the volume projection based on the
+%     Branchless Distance-Driven principle.
+%     The geometry is for DBT with half cone-beam. All parameters are set
+%     in "ParameterSettings" code.
+%
+%     INPUT:
+%
+%     - pVolume = 3D volume for projection
+%     - pGeo = Parameter of all geometry
+%	  - idXProj = projection number to be projected
+%
+%     OUTPUT:
+%
+%     - pProj = projections for each angle.
+%
+%     Reference:
+%     - Branchless Distance Driven Projection and Backprojection,
+%     Samit Basu and Bruno De Man (2006)
+%     - GPU Acceleration of Branchless Distance Driven Projection and
+%     Backprojection, Liu et al (2016)
+%     - GPU-Based Branchless Distance-Driven Projection and Backprojection,
+%     Liu et al (2017)
+%     - A GPU Implementation of Distance-Driven Computed Tomography,
+%     Ryan D. Wagner (2017)
+%     ---------------------------------------------------------------------
+%     Copyright (C) <2022>  <Rodrigo de Barros Vimieiro>
+%
+%     This program is free software: you can redistribute it and/or modify
+%     it under the terms of the GNU General Public License as published by
+%     the Free Software Foundation, either version 3 of the License, or
+%     (at your option) any later version.
+%
+%     This program is distributed in the hope that it will be useful,
+%     but WITHOUT ANY WARRANTY; without even the implied warranty of
+%     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+%     GNU General Public License for more details.
+%
+%     You should have received a copy of the GNU General Public License
+%     along with this program.  If not, see <http://www.gnu.org/licenses/>.
+%}
+% =========================================================================
+%% 3-D Projection Branchless Distance Driven Code (CUDA)
+*/
+
+#include "projectionDDb_cuda.hpp"
+
+
+extern "C" void projectionDDb_cuda_lib(double* const pVolume, 
+									   double* const pProj, 
+									   float* const pGeo,
+									   const signed int idXProj){
+
+
+	const int unsigned nPixX = (const int)pGeo[0]; 
+	const int unsigned nPixY = (const int)pGeo[1]; 
+	const int unsigned nSlices = (const int)pGeo[2]; 
+	const int unsigned nDetX = (const int)pGeo[3]; 
+	const int unsigned nDetY = (const int)pGeo[4]; 
+
+	const double dx = (const double)pGeo[5]; 
+	const double dy = (const double)pGeo[6]; 
+	const double dz = (const double)pGeo[7]; 
+	const double du = (const double)pGeo[8]; 
+	const double dv = (const double)pGeo[9]; 
+
+	const double DSD = (const double)pGeo[10]; 
+	const double DDR = (const double)pGeo[12]; 
+	const double DAG = (const double)pGeo[14]; 
+
+	const int unsigned nProj = (const int)pGeo[15];
+
+	const double tubeAngle = (const double)pGeo[16];
+	const double detAngle = (const double)pGeo[17];
+
+	double* const pTubeAngle = (double*)malloc(nProj * sizeof(double));
+	double* const pDetAngle = (double*)malloc(nProj * sizeof(double));
+
+	const double x_offset = (const double)pGeo[18];
+	const double y_offset = (const double)pGeo[19];
+
+	linspace(-tubeAngle/2, tubeAngle/2, nProj, pTubeAngle);
+	linspace(-detAngle/2, detAngle/2, nProj, pDetAngle);
+
+	// printf("Nx:%d Ny:%d Nz:%d \nNu:%d Nv:%d \nDx:%.2f Dy:%.2f Dz:%.2f \nDu:%.2f Dv:%.2f \nDSD:%.2f DDR:%.2f \nTube angle:%.2f \nDet angle:%.2f", nPixX, nPixY, nSlices, nDetX, nDetY, dx, dy, dz, du, dv, DSD, DDR, tubeAngle, detAngle);
+
+	projectionDDb(pProj, pVolume, pTubeAngle, pDetAngle, nProj, nPixX, nPixY, nSlices, nDetX, nDetY, idXProj, x_offset, y_offset, dx, dy, dz, du, dv, DSD, DDR, DAG);
+
+	free(pTubeAngle);
+	free(pDetAngle);
+
+	return;
+
+}
+
+// Linear spaced vector
+// Ref: https://stackoverflow.com/a/27030598/8682939
+void linspace(double start, 
+			  double end, 
+			  int num,
+			  double* pLinspaced){
+
+	double delta;
+
+	if(num == 0)
+		delta = 0;
+	else{
+		if(num == 1)
+			delta = 1;
+		else
+			delta = (end - start) / (num - 1);
+	}
+
+	if((abs(start) < 0.00001) && (abs(end) < 0.00001))
+		delta = 0;
+
+	for (int k = 0; k < num; k++){
+			pLinspaced[k] = start + k * delta;
+	}
+
+  return;
+}
\ No newline at end of file
diff --git a/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.hpp b/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.hpp
new file mode 100644
index 0000000..23d041e
--- /dev/null
+++ b/pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.hpp
@@ -0,0 +1,86 @@
+/*
+%% Author: Rodrigo de Barros Vimieiro
+% Date: Feb, 2022
+% rodrigo.vimieiro@gmail.com
+% =========================================================================
+%{
+% -------------------------------------------------------------------------
+%
+% -------------------------------------------------------------------------
+%     DESCRIPTION:
+%     This is the header function
+%     ---------------------------------------------------------------------
+%     Copyright (C) <2022>  <Rodrigo de Barros Vimieiro>
+%
+%     This program is free software: you can redistribute it and/or modify
+%     it under the terms of the GNU General Public License as published by
+%     the Free Software Foundation, either version 3 of the License, or
+%     (at your option) any later version.
+%
+%     This program is distributed in the hope that it will be useful,
+%     but WITHOUT ANY WARRANTY; without even the implied warranty of
+%     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+%     GNU General Public License for more details.
+%
+%     You should have received a copy of the GNU General Public License
+%     along with this program.  If not, see <http://www.gnu.org/licenses/>.
+%}
+% =========================================================================
+%% 3-D Projection Branchless Distance Driven header
+*/
+
+#include <stdio.h>
+#define _USE_MATH_DEFINES
+#include <math.h>
+
+// Includes CUDA
+#include <cuda_runtime.h>
+
+// Utilities and timing functions
+#include <helper_functions.h>    // includes cuda.h and cuda_runtime_api.h
+
+// CUDA helper functions
+#include <helper_cuda.h>         // helper functions for CUDA error check
+
+#define integrateXcoord 1
+#define integrateYcoord 0
+
+
+/*
+Reference: TIGRE - https://github.com/CERN/TIGRE
+*/
+#define cudaCheckErrors(msg) \
+do { \
+        cudaError_t __err = cudaGetLastError(); \
+        if (__err != cudaSuccess) { \
+                printf("%s \n",msg);\
+        } \
+} while (0)
+
+void linspace(double start, 
+              double end, 
+              int num,
+              double* pLinspaced);
+              
+void projectionDDb(double* const h_pProj,
+                double* const h_pVolume,
+                double* const h_pTubeAngle,
+                double* const h_pDetAngle,
+                const unsigned int nProj,
+                const unsigned int nPixX,
+                const unsigned int nPixY,
+                const unsigned int nSlices,
+                const unsigned int nDetX,
+                const unsigned int nDetY,
+                const signed int idXProj,
+                const double x_offset,
+                const double y_offset,
+                const double dx,
+                const double dy,
+                const double dz,
+                const double du,
+                const double dv,
+                const double DSD,
+                const double DDR,
+                const double DAG);
+
diff --git a/setup.py b/setup.py
index 8cf42e3..d2dc7fe 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,15 @@
 
 import setuptools
 import os
+
 from sys import platform
+from pydbt.functions.utilities2setuptools import locate_cuda, get_custom_build_ext
+
+# Find CUDA (NVCC)
+cuda_env = locate_cuda()
+
+# Create custom build ext
+custom_build_ext = get_custom_build_ext(cuda_env)
 
 
 if platform == "darwin":
@@ -19,34 +27,61 @@
     raise ValueError('Windows is not supported yet.')
 
 
+
 projectionDD = setuptools.Extension('projectionDD',
                                      sources = ['pydbt/sources/projectionDD/projectionDD.cpp'],
-                                     language='c++')
+                                     language='c++',
+                                     extra_compile_args = {"gcc"  : [],
+                                                           "nvcc" : []})
 
 backprojectionDD = setuptools.Extension('backprojectionDD',
                                      sources = ['pydbt/sources/backprojectionDD/backprojectionDD.cpp'],
-                                     language='c++')                                     
+                                     language='c++',
+                                     extra_compile_args = {"gcc"  : [],
+                                                           "nvcc" : []})                                     
 
 
 projectionDDb = setuptools.Extension('projectionDDb',
                                       sources = ['pydbt/sources/projectionDDb/projectionDDb.cpp'],
                                       language='c++',
-                                      extra_compile_args = ['-fopenmp'],
+                                      extra_compile_args = {"gcc"  : ['-fopenmp'],
+                                                            "nvcc" : []},
                                       extra_link_args=['-lgomp'])
 
 backprojectionDDb = setuptools.Extension('backprojectionDDb',
                                       sources = ['pydbt/sources/backprojectionDDb/backprojectionDDb.cpp'],
                                       language='c++',
-                                      extra_compile_args = ['-fopenmp'],
+                                      extra_compile_args = {"gcc"  : ['-fopenmp'],
+                                                            "nvcc" : []},
                                       extra_link_args=['-lgomp'])
 
+projectionDDb_cuda = setuptools.Extension('projectionDDb_cuda',
+                                      sources = ['pydbt/sources/projectionDDb_cuda/projectionDDb_cuda.cpp', 'pydbt/sources/projectionDDb_cuda/kernel.cu'],
+                                      library_dirs=[cuda_env['lib64']],
+                                      libraries=['cudart'],
+                                      language='c++',
+                                      runtime_library_dirs=[cuda_env['lib64']],
+                                      extra_compile_args = {"gcc"  : [],
+                                                            "nvcc" : ['-arch=sm_75', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]},
+                                      include_dirs = [cuda_env['include'], 'cuda-samples/Common/'])
+
+backprojectionDDb_cuda = setuptools.Extension('backprojectionDDb_cuda',
+                                      sources = ['pydbt/sources/backprojectionDDb_cuda/backprojectionDDb_cuda.cpp', 'pydbt/sources/backprojectionDDb_cuda/kernel.cu'],
+                                      library_dirs=[cuda_env['lib64']],
+                                      libraries=['cudart'],
+                                      language='c++',
+                                      runtime_library_dirs=[cuda_env['lib64']],
+                                      extra_compile_args = {"gcc"  : [],
+                                                            "nvcc" : ['-arch=sm_75', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]},
+                                      include_dirs = [cuda_env['include'], 'cuda-samples/Common/'])
+
 
 with open("README.md", "r") as fh:
     long_description = fh.read()
 
 setuptools.setup(
     name="pyDBT",
-    version="0.0.2",
+    version="0.0.3",
     author="Rodrigo Vimieiro",
     description="This package is a python extension of the DBT toolbox from LAVI-USP",
     long_description=long_description,
@@ -60,5 +95,8 @@
     ],
     python_requires='>=3.6',
     install_requires=["numpy", "matplotlib", "pydicom"],
-    ext_modules = [projectionDD, backprojectionDD, projectionDDb, backprojectionDDb]
+    ext_modules = [projectionDD, backprojectionDD, projectionDDb, backprojectionDDb, projectionDDb_cuda, backprojectionDDb_cuda],
+    
+    # Inject our custom trigger
+    cmdclass={'build_ext': custom_build_ext},
 )
\ No newline at end of file