Skip to content

Commit

Permalink
Refreshing code with changes from recent development.
Browse files Browse the repository at this point in the history
  • Loading branch information
romerojosh committed Oct 1, 2020
1 parent 0ecab0c commit b794717
Show file tree
Hide file tree
Showing 8 changed files with 185 additions and 76 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ endif()
#decide whether to use CUDA or not
#find_package(CUDAToolkit REQUIRED)
if(NOT CUDA_COMPUTE_CAPABILITY)
set(CUDA_COMPUTE_CAPABILITY 70)
set(CUDA_COMPUTE_CAPABILITY 70 80)
endif()

#Find OpenMP
Expand Down
26 changes: 22 additions & 4 deletions cuslines/cuslines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ py::capsule cleanup(T* ptr) {

class GPUTracker {
public:
GPUTracker(np_array_cast dataf,
GPUTracker(double max_angle,
double min_signal,
double tc_threshold,
double step_size,
np_array_cast dataf,
np_array H,
np_array R,
np_array delta_b,
Expand Down Expand Up @@ -117,6 +121,11 @@ class GPUTracker {
std::cerr << "Creating GPUTracker with " << ngpus << " GPUs..." << std::endl;
ngpus_ = ngpus;

max_angle_ = max_angle;
min_signal_ = min_signal;
tc_threshold_ = tc_threshold;
step_size_ = step_size;

// Allocate/copy constant problem data on GPUs
dataf_d.resize(ngpus_, nullptr);
H_d.resize(ngpus_, nullptr);
Expand All @@ -132,7 +141,8 @@ class GPUTracker {
//#pragma omp parallel for
for (int n = 0; n < ngpus_; ++n) {
CHECK_CUDA(cudaSetDevice(n));
CHECK_CUDA(cudaMalloc(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size));
CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size));
CHECK_CUDA(cudaMemAdvise(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n));
CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size));
CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size));
CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size));
Expand Down Expand Up @@ -210,7 +220,8 @@ class GPUTracker {
std::vector<int> nSlines(ngpus_);

// Call GPU routine
generate_streamlines_cuda_mgpu(nseeds, seeds_d,
generate_streamlines_cuda_mgpu(max_angle_, min_signal_, tc_threshold_, step_size_,
nseeds, seeds_d,
dimx_, dimy_, dimz_, dimt_,
dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d,
sphere_vertices_d, sphere_edges_d, nedges_,
Expand Down Expand Up @@ -270,6 +281,11 @@ class GPUTracker {
int nedges_;
int delta_nr_, samplm_nr_;

double max_angle_;
double tc_threshold_;
double min_signal_;
double step_size_;

std::vector<int> nSlines_old_;
std::vector<REAL*> slines_;
std::vector<int*> slinesLen_;
Expand All @@ -292,12 +308,14 @@ class GPUTracker {

PYBIND11_MODULE(cuslines, m) {
py::class_<GPUTracker>(m, "GPUTracker")
.def(py::init<np_array_cast, np_array,
.def(py::init<double, double, double, double,
np_array_cast, np_array,
np_array, np_array,
np_array, np_array_int,
np_array, np_array,
np_array, np_array_int,
int, int, int>(),
py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
py::arg().noconvert(), py::arg().noconvert(),
py::arg().noconvert(), py::arg().noconvert(),
py::arg().noconvert(), py::arg().noconvert(),
Expand Down
79 changes: 57 additions & 22 deletions cuslines/generate_streamlines_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ __device__ int trilinear_interp_d(const int dimx,
return -1;
}

int coo[3][2];
long long coo[3][2];
REAL wgh[3][2]; // could use just one...

const REAL_T ONE = static_cast<REAL_T>(1.0);
Expand Down Expand Up @@ -647,7 +647,8 @@ template<int BDIM_X,
int BDIM_Y,
typename REAL_T,
typename REAL3_T>
__device__ int closest_peak_d(const REAL3_T direction, //dir
__device__ int closest_peak_d(const REAL_T max_angle,
const REAL3_T direction, //dir
const int npeaks,
const REAL3_T *__restrict__ peaks,
REAL3_T *__restrict__ peak) {// dirs,
Expand All @@ -657,7 +658,8 @@ __device__ int closest_peak_d(const REAL3_T direction, //dir
const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));

const REAL_T cos_similarity = COS(MAX_ANGLE_P);
//const REAL_T cos_similarity = COS(MAX_ANGLE_P);
const REAL_T cos_similarity = COS(max_angle);
#if 0
if (!threadIdx.y && !tidx) {
printf("direction: (%f, %f, %f)\n",
Expand Down Expand Up @@ -804,7 +806,9 @@ template<int BDIM_X,
typename REAL_T,
typename REAL3_T>
__device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
REAL3_T dir,
const REAL_T max_angle,
const REAL_T min_signal,
REAL3_T dir,
const int dimx,
const int dimy,
const int dimz,
Expand Down Expand Up @@ -919,7 +923,8 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
//__syncwarp();

for(int j = tidx; j < dimt; j += BDIM_X) {
__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]);
//__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]);
__vox_data_sh[j] = MAX(min_signal, __vox_data_sh[j]);
}
__syncwarp(WMASK);

Expand Down Expand Up @@ -1021,7 +1026,7 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
}
*/
REAL3_T peak;
const int foundPeak = closest_peak_d<BDIM_X, BDIM_Y, REAL_T, REAL3_T>(dir, ndir, dirs, &peak);
const int foundPeak = closest_peak_d<BDIM_X, BDIM_Y, REAL_T, REAL3_T>(max_angle, dir, ndir, dirs, &peak);
__syncwarp(WMASK);
if (foundPeak) {
if (tidx == 0) {
Expand All @@ -1041,7 +1046,8 @@ template<int BDIM_X,
int BDIM_Y,
typename REAL_T,
typename REAL3_T>
__device__ int check_point_d(const REAL3_T point,
__device__ int check_point_d(const REAL_T tc_threshold,
const REAL3_T point,
const int dimx,
const int dimy,
const int dimz,
Expand All @@ -1064,14 +1070,19 @@ __device__ int check_point_d(const REAL3_T point,
if (rv != 0) {
return OUTSIDEIMAGE;
}
return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT;
//return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT;
return (__shInterpOut[tidy] > tc_threshold) ? TRACKPOINT : ENDPOINT;
}

template<int BDIM_X,
int BDIM_Y,
typename REAL_T,
typename REAL3_T>
__device__ int tracker_d(curandStatePhilox4_32_10_t *st,
const REAL_T max_angle,
const REAL_T min_signal,
const REAL_T tc_threshold,
const REAL_T step_size,
REAL3_T seed,
REAL3_T first_step,
REAL3_T voxel_size,
Expand All @@ -1088,7 +1099,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
// max_angle, pmf_threshold from global defines
// b0s_mask already passed
// min_signal from global defines
// tc_threashold from global defines
// tc_threshold from global defines
// pmf_threashold from global defines
const REAL_T *__restrict__ metric_map,
const int delta_nr,
Expand Down Expand Up @@ -1131,6 +1142,8 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
int ndir = get_direction_d<BDIM_X,
BDIM_Y,
5>(st,
max_angle,
min_signal,
direction,
dimx, dimy, dimz, dimt, dataf,
b0s_mask /* !dwi_mask */,
Expand Down Expand Up @@ -1161,9 +1174,12 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
}
//return;
#endif
point.x += (direction.x / voxel_size.x) * STEP_SIZE_P;
point.y += (direction.y / voxel_size.y) * STEP_SIZE_P;
point.z += (direction.z / voxel_size.z) * STEP_SIZE_P;
//point.x += (direction.x / voxel_size.x) * STEP_SIZE_P;
//point.y += (direction.y / voxel_size.y) * STEP_SIZE_P;
//point.z += (direction.z / voxel_size.z) * STEP_SIZE_P;
point.x += (direction.x / voxel_size.x) * step_size;
point.y += (direction.y / voxel_size.y) * step_size;
point.z += (direction.z / voxel_size.z) * step_size;

if (tidx == 0) {
streamline[i] = point;
Expand All @@ -1175,7 +1191,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
}
__syncwarp(WMASK);

tissue_class = check_point_d<BDIM_X, BDIM_Y>(point, dimx, dimy, dimz, metric_map);
tissue_class = check_point_d<BDIM_X, BDIM_Y>(tc_threshold, point, dimx, dimy, dimz, metric_map);

if (tissue_class == ENDPOINT ||
tissue_class == INVALIDPOINT ||
Expand All @@ -1192,7 +1208,9 @@ template<int BDIM_X,
int BDIM_Y,
typename REAL_T,
typename REAL3_T>
__global__ void getNumStreamlines_k(const long long rndSeed,
__global__ void getNumStreamlines_k(const REAL_T max_angle,
const REAL_T min_signal,
const long long rndSeed,
const int rndOffset,
const int nseed,
const REAL3_T *__restrict__ seeds,
Expand Down Expand Up @@ -1244,6 +1262,8 @@ __global__ void getNumStreamlines_k(const long long rndSeed,
int ndir = get_direction_d<BDIM_X,
BDIM_Y,
1>(&st,
max_angle,
min_signal,
MAKE_REAL3(0,0,0),
dimx, dimy, dimz, dimt, dataf,
b0s_mask /* !dwi_mask */,
Expand Down Expand Up @@ -1280,7 +1300,11 @@ template<int BDIM_X,
int BDIM_Y,
typename REAL_T,
typename REAL3_T>
__global__ void genStreamlinesMerge_k(const long long rndSeed,
__global__ void genStreamlinesMerge_k(const REAL_T max_angle,
const REAL_T min_signal,
const REAL_T tc_threshold,
const REAL_T step_size,
const long long rndSeed,
const int rndOffset,
const int nseed,
const REAL3_T *__restrict__ seeds,
Expand Down Expand Up @@ -1358,6 +1382,10 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
int stepsB;
const int tissue_classB = tracker_d<BDIM_X,
BDIM_Y>(&st,
max_angle,
min_signal,
tc_threshold,
step_size,
seed,
MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
MAKE_REAL3(1, 1, 1),
Expand Down Expand Up @@ -1391,6 +1419,10 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
int stepsF;
const int tissue_classF = tracker_d<BDIM_X,
BDIM_Y>(&st,
max_angle,
min_signal,
tc_threshold,
step_size,
seed,
first_step,
MAKE_REAL3(1, 1, 1),
Expand Down Expand Up @@ -1433,7 +1465,8 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
return;
}

void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &seeds_d,
void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
const int nseeds, const std::vector<REAL*> &seeds_d,
const int dimx, const int dimy, const int dimz, const int dimt,
const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
const int delta_nr,
Expand Down Expand Up @@ -1464,10 +1497,6 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
CHECK_CUDA(cudaMalloc(&shDirTemp1_d[n], sizeof(*shDirTemp1_d[n])*samplm_nr*grid.x*block.y));
}


// int delta_nr = 28; // TO BE MADE PARAMETERS!
// int samplm_nr = 181;

int n32dimt = ((dimt+31)/32)*32;

size_t shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_d
Expand All @@ -1486,7 +1515,9 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
// Precompute number of streamlines before allocating memory
getNumStreamlines_k<THR_X_SL,
THR_X_BL/THR_X_SL>
<<<grid, block, shSizeGNS>>>(rng_seed,
<<<grid, block, shSizeGNS>>>(max_angle,
min_signal,
rng_seed,
rng_offset + n*nseeds_per_gpu,
nseeds_gpu,
reinterpret_cast<const REAL3 *>(seeds_d[n]),
Expand Down Expand Up @@ -1591,7 +1622,11 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
//fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y);
genStreamlinesMerge_k<THR_X_SL,
THR_X_BL/THR_X_SL>
<<<grid, block, shSizeGNS, streams[n]>>>(rng_seed,
<<<grid, block, shSizeGNS, streams[n]>>>(max_angle,
min_signal,
tc_threshold,
step_size,
rng_seed,
rng_offset + n*nseeds_per_gpu,
nseeds_gpu,
reinterpret_cast<const REAL3 *>(seeds_d[n]),
Expand Down
3 changes: 2 additions & 1 deletion cuslines/generate_streamlines_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@

#include "globals.h"

void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &seeds_d,
void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
const int nseeds, const std::vector<REAL*> &seeds_d,
const int dimx, const int dimy, const int dimz, const int dimt,
const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
const int delta_nr,
Expand Down
9 changes: 5 additions & 4 deletions cuslines/globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,11 @@

#define MAX_SLINE_LEN (501)
#define PMF_THRESHOLD_P ((REAL)0.1)
#define TC_THRESHOLD_P ((REAL)0.1)
#define STEP_SIZE_P ((REAL)0.5)
#define MAX_ANGLE_P ((REAL)1.0471975511965976) // 60 deg in radians
#define MIN_SIGNAL_P ((REAL)1.0)

//#define TC_THRESHOLD_P ((REAL)0.1)
//#define STEP_SIZE_P ((REAL)0.5) // only for TRK generation
//#define MAX_ANGLE_P ((REAL)1.0471975511965976) // 60 deg in radians
//#define MIN_SIGNAL_P ((REAL)1.0)

#define MAX_SLINES_PER_SEED (10)

Expand Down
4 changes: 2 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && \
apt install --assume-yes apt-transport-https \
ca-certificates gnupg \
software-properties-common gcc git wget
software-properties-common gcc git wget numactl
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null \
| gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ focal main"
Expand All @@ -23,7 +23,7 @@ ENV PATH /opt/anaconda/bin:${PATH}
ENV LD_LIBRARY_PATH /opt/anaconda/lib:${LD_LIBRARY_PATH}

# python prereqs
RUN pip install numpy scipy cython nibabel dipy
RUN pip install numpy scipy cython nibabel dipy tqdm

# copy stuff
COPY CMakeLists.txt /opt/GPUStreamlines/CMakeLists.txt
Expand Down
Loading

0 comments on commit b794717

Please sign in to comment.