From eed838624e7815fe34c1aa13e769e75cb5988745 Mon Sep 17 00:00:00 2001
From: nilspin <gooney28@gmail.com>
Date: Fri, 29 Mar 2019 17:35:05 +0530
Subject: [PATCH] Remove thrust everywhere. Only use it inside .cu files. Still
 no build.

---
 CameraTracking.cpp     | 32 +++++++++++----------
 CameraTracking.h       | 19 ++++++------
 CameraTrackingUtils.cu | 21 +++++++-------
 Solver.cpp             | 65 ++++++++++++++++++++++++++++--------------
 Solver.cu              | 17 ++++++-----
 Solver.h               | 26 ++++++++++-------
 6 files changed, 106 insertions(+), 74 deletions(-)
diff --git a/CameraTracking.cpp b/CameraTracking.cpp
index 59add19..28d310e 100644
--- a/CameraTracking.cpp
+++ b/CameraTracking.cpp
@@ -7,15 +7,12 @@
 #include <iostream>
 #include <fstream>
 #include <vector>
-#include <thrust/fill.h>
+//#include <thrust/fill.h>
 #include "CameraTracking.h"
 #include "termcolor.hpp"
 #include "DebugHelper.hpp"
 
-extern "C" float computeCorrespondences(const float4* d_input, const float4* d_target,
-    const float4* d_targetNormals, thrust::device_vector<float4>& corres,
-    thrust::device_vector<float4>& corresNormals,thrust::device_vector<float>& residual,
-    const float4x4 deltaTransform, const int width, const int height);
+extern "C" float computeCorrespondences(const float4* d_input, const float4* d_target, const float4* d_targetNormals, float4* corres, float4* corresNormals, float* residual, const float4x4 deltaTransform, const int width, const int height);
 
 extern "C" bool SetCameraIntrinsic(const float* intrinsic, const float* invIntrinsic);
 //Takes device pointers, calculates correct position and normals
@@ -44,9 +41,10 @@ void CameraTracking::Align(float4* d_input, float4* d_inputNormals, float4* d_ta
 	  float4x4 deltaT = float4x4(deltaTransform.data());
 
     //Clear previous data
-    thrust::fill(d_correspondences.begin(), d_correspondences.end(), make_float4(0));
-    thrust::fill(d_correspondenceNormals.begin(), d_correspondenceNormals.end(), make_float4(0));
-    thrust::fill(d_residuals.begin(), d_residuals.end(), (float)0.0f);
+    //TODO All move all this to .cu file
+    //thrust::fill(d_correspondences.begin(), d_correspondences.end(), make_float4(0));
+    //thrust::fill(d_correspondenceNormals.begin(), d_correspondenceNormals.end(), make_float4(0));
+    //thrust::fill(d_residuals.begin(), d_residuals.end(), (float)0.0f);
 
 
 	  //We now have all data we need. find correspondence.
@@ -114,11 +112,14 @@ Eigen::Matrix4f CameraTracking::rigidAlignment(const float4* d_input, const floa
 
 CameraTracking::CameraTracking(int w, int h):width(w),height(h)
 {
-  //const int ARRAY_SIZE = width*height*sizeof(CoordPair);
-  //checkCudaErrors(cudaMalloc((void**)&d_correspondence, ARRAY_SIZE));
-  //checkCudaErrors(cudaMemset(d_correspondence, 0, ARRAY_SIZE));
-  //checkCudaErrors(cudaMalloc((void**)&d_correspondenceNormals, ARRAY_SIZE));
-  //checkCudaErrors(cudaMemset(d_correspondenceNormals, 0, ARRAY_SIZE));
+  const int F4_ARRAY_SIZE = width*height*sizeof(float4);
+  const int ARRAY_SIZE = width*height*sizeof(float);
+  checkCudaErrors(cudaMalloc((void**)&d_correspondences, F4_ARRAY_SIZE));
+  checkCudaErrors(cudaMemset(d_correspondences, 0, F4_ARRAY_SIZE));
+  checkCudaErrors(cudaMalloc((void**)&d_correspondenceNormals, F4_ARRAY_SIZE));
+  checkCudaErrors(cudaMemset(d_correspondenceNormals, 0, F4_ARRAY_SIZE));
+  checkCudaErrors(cudaMalloc((void**)&d_residuals, ARRAY_SIZE));
+  checkCudaErrors(cudaMemset(d_residuals, 0, ARRAY_SIZE));
   float arr[16] = { 1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1 };
   deltaTransform = Matrix4x4f(arr);
   const float intrinsics[] = {525.0, 0, 319.5, 0, 525.0, 239.5, 0, 0, 1}; //TODO: read from file
@@ -132,8 +133,9 @@ CameraTracking::CameraTracking(int w, int h):width(w),height(h)
 
 CameraTracking::~CameraTracking()
 {
-  //checkCudaErrors(cudaFree(d_correspondence));
-  //checkCudaErrors(cudaFree(d_correspondenceNormals));
+  checkCudaErrors(cudaFree(d_correspondences));
+  checkCudaErrors(cudaFree(d_correspondenceNormals));
+  checkCudaErrors(cudaFree(d_residuals));
 }
 
 
diff --git a/CameraTracking.h b/CameraTracking.h
index fbc0146..2c3a220 100644
--- a/CameraTracking.h
+++ b/CameraTracking.h
@@ -10,8 +10,8 @@
 #endif
 
 #include <cuda_runtime_api.h>
-#include <thrust/device_vector.h>
-#include <thrust/device_ptr.h>
+//#include <thrust/device_vector.h>
+//#include <thrust/device_ptr.h>
 
 //This is a simple vector library. Use this with CUDA instead of GLM.
 #include "cuda_helper/cuda_SimpleMatrixUtil.h"
@@ -28,8 +28,8 @@
 //  int dummy = -2; //padding
 //};
 
-using thrust::device_vector;
-using thrust::device_ptr;
+//using thrust::device_vector;
+//using thrust::device_ptr;
 
 class CameraTracking  {
 
@@ -38,12 +38,13 @@ class CameraTracking  {
   //LinearSystem linearSystem;
   Solver solver;
   int maxIters = 20;
-  //float4* d_correspondenceNormals;
-  //float4* d_correspondence;
+  float4* d_correspondenceNormals;
+  float4* d_correspondences;
+  float* d_residuals;
   //thrust::device_vector<CorrPair> d_coordPair;
-  device_vector<float4> d_correspondences;
-  device_vector<float4> d_correspondenceNormals;
-  device_vector<float> d_residuals;
+  //device_vector<float4> d_correspondences;
+  //device_vector<float4> d_correspondenceNormals;
+  //device_vector<float> d_residuals;
   Matrix4x4f deltaTransform;
   float globalCorrespondenceError = 0.0f;
   Matrix4x4f delinearizeTransformation(const Vector6f & sol);
diff --git a/CameraTrackingUtils.cu b/CameraTrackingUtils.cu
index be2926d..1407a73 100644
--- a/CameraTrackingUtils.cu
+++ b/CameraTrackingUtils.cu
@@ -9,8 +9,8 @@
 #include <cuda_runtime_api.h>
 #include "cuda_helper/helper_cuda.h"
 #include "cuda_helper/helper_math.h"
-#include <thrust/device_vector.h>
-#include <thrust/device_ptr.h>
+//#include <thrust/device_vector.h>
+//#include <thrust/device_ptr.h>
 #include <thrust/fill.h>
 #include "common.h"
 
@@ -34,8 +34,8 @@
 dim3 blocks = dim3(20, 15, 1);
 dim3 threads = dim3(32, 32, 1);
 
-using thrust::device_vector;
-using thrust::device_ptr;
+//using thrust::device_vector;
+//using thrust::device_ptr;
 
 __device__ __constant__ float3x3 K;  //Camera intrinsic matrix
 __device__ __constant__ float3x3 K_inv;
@@ -175,19 +175,20 @@ void FindCorrespondences(const float4* input,	const float4* target,
 }
 
 extern "C" float computeCorrespondences(const float4* d_input, const float4* d_target,
-    const float4* d_targetNormals, device_vector<float4>& corres,
-    device_vector<float4>& corresNormals, device_vector<float>& residuals,
+    const float4* d_targetNormals, float4* corres,
+    float4* corresNormals, float* residuals,
     const float4x4 deltaTransform, const int width, const int height)
 {
 	//First clear the previous correspondence calculation
   checkCudaErrors(cudaMemcpyToSymbol(globalError, &idealError, sizeof(float)));
 
-  float4* d_correspondences = thrust::raw_pointer_cast(&corres[0]);
-  float4* d_corresNormals = thrust::raw_pointer_cast(&corresNormals[0]);
-  float* d_residuals = thrust::raw_pointer_cast(&residuals[0]);
+  //TODO All move all this to .cu file
+  thrust::fill(corres, corres + (width*height), float4{0});
+  thrust::fill(corresNormals, corresNormals+ (width*height), float4{0});
+  thrust::fill(residuals, residuals+ (width*height), (float)0.0f);
 
 	FindCorrespondences <<<blocks, threads>>>(d_input, d_target, d_targetNormals,
-      d_correspondences, d_corresNormals, d_residuals,	deltaTransform, distThres, normalThres, width, height);
+      corres, corresNormals, residuals,	deltaTransform, distThres, normalThres, width, height);
 
   float globalErrorReadback = 0.0;
   checkCudaErrors(cudaMemcpyFromSymbol(&globalErrorReadback, globalError, sizeof(float)));
diff --git a/Solver.cpp b/Solver.cpp
index 01185b0..322cd32 100755
--- a/Solver.cpp
+++ b/Solver.cpp
@@ -1,14 +1,14 @@
 #include "Solver.h"
 #include "termcolor.hpp"
 #include <string>
-#include <thrust/fill.h>
-#include <thrust/copy.h>
+//#include <thrust/fill.h>
+//#include <thrust/copy.h>
 
 float alpha = 1.0;
 float beta = 0.0f;
 
 extern "C"
-void CalculateJacobiansAndResiduals(const float4* d_input, const device_vector<float4>& corres, const device_vector<float4>& d_corresNormals, device_vector<float>& d_Jac);
+void CalculateJacobiansAndResiduals(const float4* d_input, const float4* corres, const float4* d_corresNormals, const float* d_Jac, const float* d_res);
 //inline
 //float calculate_B(const vec3& n, const vec3& d, const vec3& s)  {
 //  glm::vec3 p = vec3(d - s);
@@ -46,16 +46,16 @@ void Solver::PrintSystem()  {
 //  JacMat.row(index) << n.x, n.y, n.z, T.x, T.y, T.z ;
 //}
 
-void Solver::BuildLinearSystem(const float4* d_input, const device_vector<float4>& d_correspondences, const device_vector<float4>& d_correspondenceNormals, const device_vector<float>& d_residuals, int width, int height) {
+void Solver::BuildLinearSystem(const float4* d_input, const float4* d_correspondences, const float4* d_correspondenceNormals, const float* d_residuals, int width, int height) {
 
-  d_residual_ptr = thrust::raw_pointer_cast(&d_residuals[0]);
+  //d_residual = thrust::raw_pointer_cast(&d_residuals[0]);
   numCorrPairs = width*height;  //corrImageCoords.size();
   //d_Jac.resize(numCorrPairs*6); //Do we need to resize at runtime?
   //d_residual.resize(numCorrPairs);
-  thrust::fill(d_Jac.begin(), d_Jac.end(), 0);
+  //thrust::fill(d_Jac.begin(), d_Jac.end(), 0);
   //thrust::fill(d_residual.begin(), d_residual.end(), 0);
-  thrust::fill(d_JTJ.begin(), d_JTJ.end(), 0);
-  thrust::fill(d_JTr.begin(), d_JTr.end(), 0);
+  //thrust::fill(d_JTJ.begin(), d_JTJ.end(), 0);
+  //thrust::fill(d_JTr.begin(), d_JTr.end(), 0);
   //Jac = MatrixXf(numCorrPairs,6);
   //residual = VectorXf(numCorrPairs);
   //PrintMatrixDims(Jac, std::string("Jac"));
@@ -68,22 +68,26 @@ void Solver::BuildLinearSystem(const float4* d_input, const device_vector<float4
   //JTJ.setZero();
   //JTr.setZero();
   //residual.setZero();
-  uint idx = 0;
+  //uint idx = 0;
 
   //Invoke kernel here
   std::cout<<"Calculating Jacobians and residuals\n";
-  CalculateJacobiansAndResiduals(d_input, d_correspondences, d_correspondenceNormals, d_Jac);
+  CalculateJacobiansAndResiduals(d_input, d_correspondences, d_correspondenceNormals, d_Jac, d_residuals);
+  checkCudaErrors(cudaDeviceSynchronize());
   //Now Jac and res are populated. Invoke cublas functions to calculate JTJ and JTr
   //JTr
   std::cout<<"Calculating JTr\n";
   //TODO : Set this d_residual_ptr correctly
-  stat = cublasSgemv(handle, CUBLAS_OP_N, 6, numCols*numRows, &alpha, d_Jac_ptr/*d_a*/, 6, d_residual_ptr/*d_x*/, 1, &beta, d_JTr_ptr/*d_y*/, 1);
-  cudaMemcpy(JTr.data(), d_JTr_ptr, 6*sizeof(float), cudaMemcpyDeviceToHost); // copy back
-  std::cout<<JTr<<"\n";
+  stat = cublasSgemv(handle, CUBLAS_OP_N, 6, numCols*numRows, &alpha, d_Jac/*d_a*/, 6, d_residuals/*d_x*/, 1, &beta, d_JTr/*d_y*/, 1);
   //JTJ
   std::cout<<"Calculating JTJ\n";
-  stat = cublasSsyrk(handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, 6, numCols*numRows, &alpha, d_Jac_ptr/*d_a*/, 6, &beta, d_JTJ_ptr/*d_c*/, 6); //compute JJT in column-maj order
-  cudaMemcpy(JTJ.data(), d_JTJ_ptr, 6*6*sizeof(float), cudaMemcpyDeviceToHost); // copy back
+  stat = cublasSsyrk(handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, 6, numCols*numRows, &alpha, d_Jac/*d_a*/, 6, &beta, d_JTJ/*d_c*/, 6); //compute JJT in column-maj order
+
+  //copy back
+  cudaMemcpy(JTr.data(), d_JTr, JTr_SIZE, cudaMemcpyDeviceToHost);
+  std::cout<<JTr<<"\n";
+
+  cudaMemcpy(JTJ.data(), d_JTJ, JTJ_SIZE, cudaMemcpyDeviceToHost);
   std::cout<<JTJ<<"\n";
   checkCudaErrors(cudaDeviceSynchronize());
 
@@ -149,19 +153,32 @@ void Solver::SolveJacobianSystem(const Matrix6x6f& JTJ, const Vector6f& JTr)  {
 //}
 
 Solver::Solver() {
-  d_Jac.resize(6*numCols*numRows);
+  JAC_SIZE = 6*numCols*numRows*sizeof(float);
+  RES_SIZE = numCols*numRows*sizeof(float);
+  JTJ_SIZE = 6*6*sizeof(float);
+  JTr_SIZE = 6*sizeof(float);
+
+  checkCudaErrors(cudaMalloc((void**)&d_Jac, JAC_SIZE));
+  checkCudaErrors(cudaMemset(d_Jac, 0, JAC_SIZE));
+  //checkCudaErrors(cudaMalloc((void**)&d_residual, RES_SIZE));
+  //checkCudaErrors(cudaMemset(d_residual, 0, RES_SIZE));
+  checkCudaErrors(cudaMalloc((void**)&d_JTJ, JTJ_SIZE));
+  checkCudaErrors(cudaMemset(d_JTJ, 0, JTJ_SIZE));
+  checkCudaErrors(cudaMalloc((void**)&d_JTr, JTr_SIZE));
+  checkCudaErrors(cudaMemset(d_JTr, 0, JTr_SIZE));
+  //d_Jac.resize(6*numCols*numRows);
   //d_residual.resize(numCols*numRows);
-  d_JTr.resize(6);
-  d_JTJ.resize(6*6);
+  //d_JTr.resize(6);
+  //d_JTJ.resize(6*6);
 
-  thrust::fill(d_Jac.begin(), d_Jac.end(), (float)0.0f);
+  //thrust::fill(d_Jac.begin(), d_Jac.end(), (float)0.0f);
   //thrust::fill(d_residual.begin(), d_residual.end(), 0);
   stat = cublasCreate(&handle);
 
-  d_Jac_ptr = thrust::raw_pointer_cast(&d_Jac[0]);
+  //d_Jac_ptr = thrust::raw_pointer_cast(&d_Jac[0]);
   //d_residual_ptr = thrust::raw_pointer_cast(&d_residual[0]);
-  d_JTr_ptr = thrust::raw_pointer_cast(&d_JTr[0]);
-  d_JTJ_ptr = thrust::raw_pointer_cast(&d_JTJ[0]);
+  //d_JTr_ptr = thrust::raw_pointer_cast(&d_JTr[0]);
+  //d_JTJ_ptr = thrust::raw_pointer_cast(&d_JTJ[0]);
 
   JTJ.setZero();
   JTr.setZero();
@@ -170,5 +187,9 @@ Solver::Solver() {
 }
 
 Solver::~Solver() {
+  checkCudaErrors(cudaFree(d_Jac));
+  //checkCudaErrors(cudaFree(d_residual));
+  checkCudaErrors(cudaFree(d_JTJ));
+  checkCudaErrors(cudaFree(d_JTr));
   cublasDestroy(handle);
 }
diff --git a/Solver.cu b/Solver.cu
index a46e40d..3a50b95 100644
--- a/Solver.cu
+++ b/Solver.cu
@@ -2,6 +2,7 @@
 #include <cublas_v2.h>
 #include "Solver.h"
 #include "cuda_helper/helper_math.h"
+#include <thrust/fill.h>
 
 #define numCols 640
 #define numRows 480
@@ -9,8 +10,8 @@
 dim3 blocks = dim3(20, 15, 1);
 dim3 threads = dim3(32, 32, 1);
 
-using FloatVec = thrust::device_vector<float>;
-using Float4Vec = thrust::device_vector<float4>;
+//using FloatVec = thrust::device_vector<float>;
+//using Float4Vec = thrust::device_vector<float4>;
 
 //using CorrPairVec = thrust::device_vector<CorrPair>;
 
@@ -51,16 +52,18 @@ void CalculateJacAndResKernel(const float4* d_src, const float4* d_dest, const f
 //__device__ inline
 //void CalculateJTJ
 
-extern "C" void CalculateJacobiansAndResidual(const float4* d_src, Float4Vec targ, Float4Vec targNormals, FloatVec Jac) {
+extern "C" void CalculateJacobiansAndResidual(const float4* d_src, const float4* d_targ, const float4* d_targNormals,
+    float* d_Jac) {
 
   //First calculate Jacobian and Residual matrices
-  float4* d_targ = thrust::raw_pointer_cast(&targ[0]);
-  float4* d_targNormals = thrust::raw_pointer_cast(&targNormals[0]);
-  float* d_jacobianMatrix = thrust::raw_pointer_cast(&Jac[0]);
+  //float4* d_targ = thrust::raw_pointer_cast(&targ[0]);
+  //float4* d_targNormals = thrust::raw_pointer_cast(&targNormals[0]);
+  //float* d_jacobianMatrix = thrust::raw_pointer_cast(&Jac[0]);
   //float* d_resVector = thrust::raw_pointer_cast(&residual[0]);
   //float* d_jtj = thrust::raw_pointer_cast(&JTJ[0]);
   //float* d_jtr = thrust::raw_pointer_cast(&JTr[0]);
-  CalculateJacAndResKernel<<<blocks, threads>>>(d_src, d_targ, d_targNormals, d_jacobianMatrix);
+  thrust::fill(d_Jac, d_Jac+(numCols*numRows*6), 0);  //TODO - is this redundant?
+  CalculateJacAndResKernel<<<blocks, threads>>>(d_src, d_targ, d_targNormals, d_Jac);
 
   //Then calculate Matrix-vector JTr and Matrix-matrix JTJ products
 }
diff --git a/Solver.h b/Solver.h
index e0f39d3..4642054 100755
--- a/Solver.h
+++ b/Solver.h
@@ -7,17 +7,17 @@
 #include "cuda_helper/helper_cuda.h"
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
-#include <thrust/device_vector.h>
-#include <thrust/device_ptr.h>
+//#include <thrust/device_vector.h>
+//#include <thrust/device_ptr.h>
 
 using namespace Eigen;
-using thrust::device_vector;
+//using thrust::device_vector;
 
 class Solver {
   public:
     uint numIters = 10;
 
-    void BuildLinearSystem(const float4* , const device_vector<float4>& , const device_vector<float4>& , const device_vector<float>& , int , int );
+    void BuildLinearSystem(const float4* , const float4* , const float4* , const float* , int , int );
 
     void PrintSystem();
 
@@ -29,6 +29,11 @@ class Solver {
     Matrix4x4f getTransform() {return SE3Exp(estimate);};
     double getError() {return TotalError;};
   private:
+    int JAC_SIZE;
+    int RES_SIZE;
+    int JTJ_SIZE;
+    int JTr_SIZE;
+    const int num_vars_in_jac = 6;
     Vector6f update, estimate; //Will hold solution
     bool solution_exists = false;
     //Matrix4x4f deltaT;  //intermediate estimated transform
@@ -47,14 +52,13 @@ class Solver {
     cudaError_t cudaStat;
     cublasStatus_t  stat;
     cublasHandle_t handle;
-    thrust::device_vector<float> d_Jac;  //Computed on device
+    //thrust::device_vector<float> d_Jac;  //Computed on device
     //thrust::device_vector<float> d_residual; //Computed on device
-    thrust::device_vector<float> d_JTr;  //then multiplied on device
-    thrust::device_vector<float> d_JTJ;  //finally this is computed
-    float* d_Jac_ptr = nullptr;
-    float* d_residual_ptr = nullptr;
-    float* d_JTr_ptr = nullptr;
-    float* d_JTJ_ptr = nullptr;
+    //thrust::device_vector<float> d_JTr;  //then multiplied on device
+    //thrust::device_vector<float> d_JTJ;  //finally this is computed
+    float* d_Jac = nullptr;
+    float* d_JTr = nullptr;
+    float* d_JTJ = nullptr;
 
 
 };