Skip to content

Commit

Permalink
Remove thrust everywhere. Only use it inside .cu files. Still no build.
Browse files Browse the repository at this point in the history
  • Loading branch information
nilspin committed Mar 29, 2019
1 parent 5d52dc0 commit eed8386
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 74 deletions.
32 changes: 17 additions & 15 deletions CameraTracking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,12 @@
#include <iostream>
#include <fstream>
#include <vector>
#include <thrust/fill.h>
//#include <thrust/fill.h>
#include "CameraTracking.h"
#include "termcolor.hpp"
#include "DebugHelper.hpp"

extern "C" float computeCorrespondences(const float4* d_input, const float4* d_target,
const float4* d_targetNormals, thrust::device_vector<float4>& corres,
thrust::device_vector<float4>& corresNormals,thrust::device_vector<float>& residual,
const float4x4 deltaTransform, const int width, const int height);
extern "C" float computeCorrespondences(const float4* d_input, const float4* d_target, const float4* d_targetNormals, float4* corres, float4* corresNormals, float* residual, const float4x4 deltaTransform, const int width, const int height);

extern "C" bool SetCameraIntrinsic(const float* intrinsic, const float* invIntrinsic);
//Takes device pointers, calculates correct position and normals
Expand Down Expand Up @@ -44,9 +41,10 @@ void CameraTracking::Align(float4* d_input, float4* d_inputNormals, float4* d_ta
float4x4 deltaT = float4x4(deltaTransform.data());

//Clear previous data
thrust::fill(d_correspondences.begin(), d_correspondences.end(), make_float4(0));
thrust::fill(d_correspondenceNormals.begin(), d_correspondenceNormals.end(), make_float4(0));
thrust::fill(d_residuals.begin(), d_residuals.end(), (float)0.0f);
//TODO All move all this to .cu file
//thrust::fill(d_correspondences.begin(), d_correspondences.end(), make_float4(0));
//thrust::fill(d_correspondenceNormals.begin(), d_correspondenceNormals.end(), make_float4(0));
//thrust::fill(d_residuals.begin(), d_residuals.end(), (float)0.0f);


//We now have all data we need. find correspondence.
Expand Down Expand Up @@ -114,11 +112,14 @@ Eigen::Matrix4f CameraTracking::rigidAlignment(const float4* d_input, const floa

CameraTracking::CameraTracking(int w, int h):width(w),height(h)
{
//const int ARRAY_SIZE = width*height*sizeof(CoordPair);
//checkCudaErrors(cudaMalloc((void**)&d_correspondence, ARRAY_SIZE));
//checkCudaErrors(cudaMemset(d_correspondence, 0, ARRAY_SIZE));
//checkCudaErrors(cudaMalloc((void**)&d_correspondenceNormals, ARRAY_SIZE));
//checkCudaErrors(cudaMemset(d_correspondenceNormals, 0, ARRAY_SIZE));
const int F4_ARRAY_SIZE = width*height*sizeof(float4);
const int ARRAY_SIZE = width*height*sizeof(float);
checkCudaErrors(cudaMalloc((void**)&d_correspondences, F4_ARRAY_SIZE));
checkCudaErrors(cudaMemset(d_correspondences, 0, F4_ARRAY_SIZE));
checkCudaErrors(cudaMalloc((void**)&d_correspondenceNormals, F4_ARRAY_SIZE));
checkCudaErrors(cudaMemset(d_correspondenceNormals, 0, F4_ARRAY_SIZE));
checkCudaErrors(cudaMalloc((void**)&d_residuals, ARRAY_SIZE));
checkCudaErrors(cudaMemset(d_residuals, 0, ARRAY_SIZE));
float arr[16] = { 1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1 };
deltaTransform = Matrix4x4f(arr);
const float intrinsics[] = {525.0, 0, 319.5, 0, 525.0, 239.5, 0, 0, 1}; //TODO: read from file
Expand All @@ -132,8 +133,9 @@ CameraTracking::CameraTracking(int w, int h):width(w),height(h)

CameraTracking::~CameraTracking()
{
//checkCudaErrors(cudaFree(d_correspondence));
//checkCudaErrors(cudaFree(d_correspondenceNormals));
checkCudaErrors(cudaFree(d_correspondences));
checkCudaErrors(cudaFree(d_correspondenceNormals));
checkCudaErrors(cudaFree(d_residuals));
}


19 changes: 10 additions & 9 deletions CameraTracking.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
#endif

#include <cuda_runtime_api.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
//#include <thrust/device_vector.h>
//#include <thrust/device_ptr.h>

//This is a simple vector library. Use this with CUDA instead of GLM.
#include "cuda_helper/cuda_SimpleMatrixUtil.h"
Expand All @@ -28,8 +28,8 @@
// int dummy = -2; //padding
//};

using thrust::device_vector;
using thrust::device_ptr;
//using thrust::device_vector;
//using thrust::device_ptr;

class CameraTracking {

Expand All @@ -38,12 +38,13 @@ class CameraTracking {
//LinearSystem linearSystem;
Solver solver;
int maxIters = 20;
//float4* d_correspondenceNormals;
//float4* d_correspondence;
float4* d_correspondenceNormals;
float4* d_correspondences;
float* d_residuals;
//thrust::device_vector<CorrPair> d_coordPair;
device_vector<float4> d_correspondences;
device_vector<float4> d_correspondenceNormals;
device_vector<float> d_residuals;
//device_vector<float4> d_correspondences;
//device_vector<float4> d_correspondenceNormals;
//device_vector<float> d_residuals;
Matrix4x4f deltaTransform;
float globalCorrespondenceError = 0.0f;
Matrix4x4f delinearizeTransformation(const Vector6f & sol);
Expand Down
21 changes: 11 additions & 10 deletions CameraTrackingUtils.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
#include <cuda_runtime_api.h>
#include "cuda_helper/helper_cuda.h"
#include "cuda_helper/helper_math.h"
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
//#include <thrust/device_vector.h>
//#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include "common.h"

Expand All @@ -34,8 +34,8 @@
dim3 blocks = dim3(20, 15, 1);
dim3 threads = dim3(32, 32, 1);

using thrust::device_vector;
using thrust::device_ptr;
//using thrust::device_vector;
//using thrust::device_ptr;

__device__ __constant__ float3x3 K; //Camera intrinsic matrix
__device__ __constant__ float3x3 K_inv;
Expand Down Expand Up @@ -175,19 +175,20 @@ void FindCorrespondences(const float4* input, const float4* target,
}

extern "C" float computeCorrespondences(const float4* d_input, const float4* d_target,
const float4* d_targetNormals, device_vector<float4>& corres,
device_vector<float4>& corresNormals, device_vector<float>& residuals,
const float4* d_targetNormals, float4* corres,
float4* corresNormals, float* residuals,
const float4x4 deltaTransform, const int width, const int height)
{
//First clear the previous correspondence calculation
checkCudaErrors(cudaMemcpyToSymbol(globalError, &idealError, sizeof(float)));

float4* d_correspondences = thrust::raw_pointer_cast(&corres[0]);
float4* d_corresNormals = thrust::raw_pointer_cast(&corresNormals[0]);
float* d_residuals = thrust::raw_pointer_cast(&residuals[0]);
//TODO All move all this to .cu file
thrust::fill(corres, corres + (width*height), float4{0});
thrust::fill(corresNormals, corresNormals+ (width*height), float4{0});
thrust::fill(residuals, residuals+ (width*height), (float)0.0f);

FindCorrespondences <<<blocks, threads>>>(d_input, d_target, d_targetNormals,
d_correspondences, d_corresNormals, d_residuals, deltaTransform, distThres, normalThres, width, height);
corres, corresNormals, residuals, deltaTransform, distThres, normalThres, width, height);

float globalErrorReadback = 0.0;
checkCudaErrors(cudaMemcpyFromSymbol(&globalErrorReadback, globalError, sizeof(float)));
Expand Down
65 changes: 43 additions & 22 deletions Solver.cpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#include "Solver.h"
#include "termcolor.hpp"
#include <string>
#include <thrust/fill.h>
#include <thrust/copy.h>
//#include <thrust/fill.h>
//#include <thrust/copy.h>

float alpha = 1.0;
float beta = 0.0f;

extern "C"
void CalculateJacobiansAndResiduals(const float4* d_input, const device_vector<float4>& corres, const device_vector<float4>& d_corresNormals, device_vector<float>& d_Jac);
void CalculateJacobiansAndResiduals(const float4* d_input, const float4* corres, const float4* d_corresNormals, const float* d_Jac, const float* d_res);
//inline
//float calculate_B(const vec3& n, const vec3& d, const vec3& s) {
// glm::vec3 p = vec3(d - s);
Expand Down Expand Up @@ -46,16 +46,16 @@ void Solver::PrintSystem() {
// JacMat.row(index) << n.x, n.y, n.z, T.x, T.y, T.z ;
//}

void Solver::BuildLinearSystem(const float4* d_input, const device_vector<float4>& d_correspondences, const device_vector<float4>& d_correspondenceNormals, const device_vector<float>& d_residuals, int width, int height) {
void Solver::BuildLinearSystem(const float4* d_input, const float4* d_correspondences, const float4* d_correspondenceNormals, const float* d_residuals, int width, int height) {

d_residual_ptr = thrust::raw_pointer_cast(&d_residuals[0]);
//d_residual = thrust::raw_pointer_cast(&d_residuals[0]);
numCorrPairs = width*height; //corrImageCoords.size();
//d_Jac.resize(numCorrPairs*6); //Do we need to resize at runtime?
//d_residual.resize(numCorrPairs);
thrust::fill(d_Jac.begin(), d_Jac.end(), 0);
//thrust::fill(d_Jac.begin(), d_Jac.end(), 0);
//thrust::fill(d_residual.begin(), d_residual.end(), 0);
thrust::fill(d_JTJ.begin(), d_JTJ.end(), 0);
thrust::fill(d_JTr.begin(), d_JTr.end(), 0);
//thrust::fill(d_JTJ.begin(), d_JTJ.end(), 0);
//thrust::fill(d_JTr.begin(), d_JTr.end(), 0);
//Jac = MatrixXf(numCorrPairs,6);
//residual = VectorXf(numCorrPairs);
//PrintMatrixDims(Jac, std::string("Jac"));
Expand All @@ -68,22 +68,26 @@ void Solver::BuildLinearSystem(const float4* d_input, const device_vector<float4
//JTJ.setZero();
//JTr.setZero();
//residual.setZero();
uint idx = 0;
//uint idx = 0;

//Invoke kernel here
std::cout<<"Calculating Jacobians and residuals\n";
CalculateJacobiansAndResiduals(d_input, d_correspondences, d_correspondenceNormals, d_Jac);
CalculateJacobiansAndResiduals(d_input, d_correspondences, d_correspondenceNormals, d_Jac, d_residuals);
checkCudaErrors(cudaDeviceSynchronize());
//Now Jac and res are populated. Invoke cublas functions to calculate JTJ and JTr
//JTr
std::cout<<"Calculating JTr\n";
//TODO : Set this d_residual_ptr correctly
stat = cublasSgemv(handle, CUBLAS_OP_N, 6, numCols*numRows, &alpha, d_Jac_ptr/*d_a*/, 6, d_residual_ptr/*d_x*/, 1, &beta, d_JTr_ptr/*d_y*/, 1);
cudaMemcpy(JTr.data(), d_JTr_ptr, 6*sizeof(float), cudaMemcpyDeviceToHost); // copy back
std::cout<<JTr<<"\n";
stat = cublasSgemv(handle, CUBLAS_OP_N, 6, numCols*numRows, &alpha, d_Jac/*d_a*/, 6, d_residuals/*d_x*/, 1, &beta, d_JTr/*d_y*/, 1);
//JTJ
std::cout<<"Calculating JTJ\n";
stat = cublasSsyrk(handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, 6, numCols*numRows, &alpha, d_Jac_ptr/*d_a*/, 6, &beta, d_JTJ_ptr/*d_c*/, 6); //compute JJT in column-maj order
cudaMemcpy(JTJ.data(), d_JTJ_ptr, 6*6*sizeof(float), cudaMemcpyDeviceToHost); // copy back
stat = cublasSsyrk(handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, 6, numCols*numRows, &alpha, d_Jac/*d_a*/, 6, &beta, d_JTJ/*d_c*/, 6); //compute JJT in column-maj order

//copy back
cudaMemcpy(JTr.data(), d_JTr, JTr_SIZE, cudaMemcpyDeviceToHost);
std::cout<<JTr<<"\n";

cudaMemcpy(JTJ.data(), d_JTJ, JTJ_SIZE, cudaMemcpyDeviceToHost);
std::cout<<JTJ<<"\n";
checkCudaErrors(cudaDeviceSynchronize());

Expand Down Expand Up @@ -149,19 +153,32 @@ void Solver::SolveJacobianSystem(const Matrix6x6f& JTJ, const Vector6f& JTr) {
//}

Solver::Solver() {
d_Jac.resize(6*numCols*numRows);
JAC_SIZE = 6*numCols*numRows*sizeof(float);
RES_SIZE = numCols*numRows*sizeof(float);
JTJ_SIZE = 6*6*sizeof(float);
JTr_SIZE = 6*sizeof(float);

checkCudaErrors(cudaMalloc((void**)&d_Jac, JAC_SIZE));
checkCudaErrors(cudaMemset(d_Jac, 0, JAC_SIZE));
//checkCudaErrors(cudaMalloc((void**)&d_residual, RES_SIZE));
//checkCudaErrors(cudaMemset(d_residual, 0, RES_SIZE));
checkCudaErrors(cudaMalloc((void**)&d_JTJ, JTJ_SIZE));
checkCudaErrors(cudaMemset(d_JTJ, 0, JTJ_SIZE));
checkCudaErrors(cudaMalloc((void**)&d_JTr, JTr_SIZE));
checkCudaErrors(cudaMemset(d_JTr, 0, JTr_SIZE));
//d_Jac.resize(6*numCols*numRows);
//d_residual.resize(numCols*numRows);
d_JTr.resize(6);
d_JTJ.resize(6*6);
//d_JTr.resize(6);
//d_JTJ.resize(6*6);

thrust::fill(d_Jac.begin(), d_Jac.end(), (float)0.0f);
//thrust::fill(d_Jac.begin(), d_Jac.end(), (float)0.0f);
//thrust::fill(d_residual.begin(), d_residual.end(), 0);
stat = cublasCreate(&handle);

d_Jac_ptr = thrust::raw_pointer_cast(&d_Jac[0]);
//d_Jac_ptr = thrust::raw_pointer_cast(&d_Jac[0]);
//d_residual_ptr = thrust::raw_pointer_cast(&d_residual[0]);
d_JTr_ptr = thrust::raw_pointer_cast(&d_JTr[0]);
d_JTJ_ptr = thrust::raw_pointer_cast(&d_JTJ[0]);
//d_JTr_ptr = thrust::raw_pointer_cast(&d_JTr[0]);
//d_JTJ_ptr = thrust::raw_pointer_cast(&d_JTJ[0]);

JTJ.setZero();
JTr.setZero();
Expand All @@ -170,5 +187,9 @@ Solver::Solver() {
}

Solver::~Solver() {
checkCudaErrors(cudaFree(d_Jac));
//checkCudaErrors(cudaFree(d_residual));
checkCudaErrors(cudaFree(d_JTJ));
checkCudaErrors(cudaFree(d_JTr));
cublasDestroy(handle);
}
17 changes: 10 additions & 7 deletions Solver.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
#include <cublas_v2.h>
#include "Solver.h"
#include "cuda_helper/helper_math.h"
#include <thrust/fill.h>

#define numCols 640
#define numRows 480
//Since numCols = 640 and numRows = 480, we set blockDim according to 32x32 tile
dim3 blocks = dim3(20, 15, 1);
dim3 threads = dim3(32, 32, 1);

using FloatVec = thrust::device_vector<float>;
using Float4Vec = thrust::device_vector<float4>;
//using FloatVec = thrust::device_vector<float>;
//using Float4Vec = thrust::device_vector<float4>;

//using CorrPairVec = thrust::device_vector<CorrPair>;

Expand Down Expand Up @@ -51,16 +52,18 @@ void CalculateJacAndResKernel(const float4* d_src, const float4* d_dest, const f
//__device__ inline
//void CalculateJTJ

extern "C" void CalculateJacobiansAndResidual(const float4* d_src, Float4Vec targ, Float4Vec targNormals, FloatVec Jac) {
extern "C" void CalculateJacobiansAndResidual(const float4* d_src, const float4* d_targ, const float4* d_targNormals,
float* d_Jac) {

//First calculate Jacobian and Residual matrices
float4* d_targ = thrust::raw_pointer_cast(&targ[0]);
float4* d_targNormals = thrust::raw_pointer_cast(&targNormals[0]);
float* d_jacobianMatrix = thrust::raw_pointer_cast(&Jac[0]);
//float4* d_targ = thrust::raw_pointer_cast(&targ[0]);
//float4* d_targNormals = thrust::raw_pointer_cast(&targNormals[0]);
//float* d_jacobianMatrix = thrust::raw_pointer_cast(&Jac[0]);
//float* d_resVector = thrust::raw_pointer_cast(&residual[0]);
//float* d_jtj = thrust::raw_pointer_cast(&JTJ[0]);
//float* d_jtr = thrust::raw_pointer_cast(&JTr[0]);
CalculateJacAndResKernel<<<blocks, threads>>>(d_src, d_targ, d_targNormals, d_jacobianMatrix);
thrust::fill(d_Jac, d_Jac+(numCols*numRows*6), 0); //TODO - is this redundant?
CalculateJacAndResKernel<<<blocks, threads>>>(d_src, d_targ, d_targNormals, d_Jac);

//Then calculate Matrix-vector JTr and Matrix-matrix JTJ products
}
Expand Down
26 changes: 15 additions & 11 deletions Solver.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@
#include "cuda_helper/helper_cuda.h"
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
//#include <thrust/device_vector.h>
//#include <thrust/device_ptr.h>

using namespace Eigen;
using thrust::device_vector;
//using thrust::device_vector;

class Solver {
public:
uint numIters = 10;

void BuildLinearSystem(const float4* , const device_vector<float4>& , const device_vector<float4>& , const device_vector<float>& , int , int );
void BuildLinearSystem(const float4* , const float4* , const float4* , const float* , int , int );

void PrintSystem();

Expand All @@ -29,6 +29,11 @@ class Solver {
Matrix4x4f getTransform() {return SE3Exp(estimate);};
double getError() {return TotalError;};
private:
int JAC_SIZE;
int RES_SIZE;
int JTJ_SIZE;
int JTr_SIZE;
const int num_vars_in_jac = 6;
Vector6f update, estimate; //Will hold solution
bool solution_exists = false;
//Matrix4x4f deltaT; //intermediate estimated transform
Expand All @@ -47,14 +52,13 @@ class Solver {
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
thrust::device_vector<float> d_Jac; //Computed on device
//thrust::device_vector<float> d_Jac; //Computed on device
//thrust::device_vector<float> d_residual; //Computed on device
thrust::device_vector<float> d_JTr; //then multiplied on device
thrust::device_vector<float> d_JTJ; //finally this is computed
float* d_Jac_ptr = nullptr;
float* d_residual_ptr = nullptr;
float* d_JTr_ptr = nullptr;
float* d_JTJ_ptr = nullptr;
//thrust::device_vector<float> d_JTr; //then multiplied on device
//thrust::device_vector<float> d_JTJ; //finally this is computed
float* d_Jac = nullptr;
float* d_JTr = nullptr;
float* d_JTJ = nullptr;


};
Expand Down

0 comments on commit eed8386

Please sign in to comment.