-
Notifications
You must be signed in to change notification settings - Fork 0
/
mollertrumbore.cu
245 lines (221 loc) · 9.74 KB
/
mollertrumbore.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
/*
* Fast detection test if a ray intersects with a set of triangles using CUDA accelerated kernels based on the work of Möller and Trumbore (1997).
*
* Arguments: 'vertices' [(3 * 3 * numTriangles),type float32]
These are interleaved, example:
vertices=[T0_V0_X, T0_V0_Y, T0_V0_Z;
T0_V1_X, T0_V1_Y, T0_V1_Z;
T0_V2_X, T0_V2_Y, T0_V2_Z;
T1_V0_X, T1_V0_Y, T1_V0_Z;
T1_V1_X, T1_V1_Y, T1_V1_Z;
T1_V2_X, T1_V2_Y, T1_V2_Z;
...]
with Ti: triangles, Vj:vertices 1 to 3 of triangle and X/Y/Z the 3 coordinates
'raysFrom' [(3 * numRays),type float32]
These are the starting points of each ray in X Y Z
'directions' [(3 * numRays),type float32]
These are the directions vectors of each ray in X Y Z
* Returns: 'results' [numTriangles * numRays, type int32]
These are the binary test results to see if there is an intersection.
1 = intersection found, 0 = no intersection
* Compile with 'mexcuda -v mollertrumbore.cu' (-v for extra details for debugging)
* Requires CUDA toolkit and a C compiler.
* Make sure to correctly set c++ compiler with 'mex -setup c++' and clicking on the link of the version you want if asked.
* And make sure to set the CUDA enviroment variable correctly with
* 'setenv('MW_NVCC_PATH','/usr/local/cuda-X/bin')' on Linux
* 'setenv('MW_NVCC_PATH','C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX\bin')' on Windows
*
* Original CUDA implementation of Möller and Trumbore is by Raymond Leung (2022),
* 'GPU implementation of a ray-surface intersection algorithm in CUDA',
* arXiv e-print 2209.02878, 2022.
* Source code available at: https://github.com/raymondleung8/gpu-ray-surface-intersection-in-cuda
* and is made under a BSD 3 license.
*
* MATLAB implementation by Wouter Jansen, Cosys-Lab, University of Antwerp
*/
#define EPSILON 0.0000000001
#include <string>
__device__ void subtract(const float *a, const float *b, float *out)
{
out[0] = a[0] - b[0];
out[1] = a[1] - b[1];
out[2] = a[2] - b[2];
}
__device__ void dot(const float *a, const float *b, float &out)
{
out = a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
}
__device__ void cross(const float *a, const float *b, float *out)
{
out[0] = a[1]*b[2] - a[2]*b[1];
out[1] = a[2]*b[0] - a[0]*b[2];
out[2] = a[0]*b[1] - a[1]*b[0];
}
__device__ float tolerance(const float *d, const float *eAB, const float *eAC)
{
float scaling = sqrtf((d[0]*d[0] + d[1]*d[1] + d[2]*d[2]) *
(eAB[0]*eAB[0] + eAB[1]*eAB[1] + eAB[2]*eAB[2]) *
(eAC[0]*eAC[0] + eAC[1]*eAC[1] + eAC[2]*eAC[2]));
return (scaling > 1)? scaling * EPSILON : EPSILON;
}
// Implement the Moller-Trumbore ray-triangle intersection algorithm
// - Ray model: R(t) = Q0 + t *(dir), where Q0 denote segment start point
// - and dir the direction of the segment
// - Point on triangle: T(u,v) = (1-u-v)*V0 + u*V1 + v*V2
//
__device__ int intersectMoller(
const float *v0, const float *v1, const float *v2,
const float *edge1, const float *edge2,
const float *q0, const float *dir)
{
float avec[3], bvec[3], tvec[3], t, u, v, det, inv_det;
cross(dir, edge2, avec);
dot(avec, edge1, det);
float epsilon = tolerance(dir, edge1, edge2);
if (det > epsilon) {
subtract(q0, v0, tvec);
dot(avec, tvec, u);
if (u < 0 || u > det){
return 0;
}
cross(tvec, edge1, bvec);
dot(bvec, dir, v);
if (v < 0 || u + v > det){
return 0;
}
}
else if (det < -epsilon) {
subtract(q0, v0, tvec);
dot(avec, tvec, u);
if (u > 0 || u < det){
return 0;
}
cross(tvec, edge1, bvec);
dot(bvec, dir, v);
if (v > 0 || u + v < det){
return 0;
}
}
else{
return 0;
}
inv_det = 1.0 / det;
dot(bvec, edge2, t);
t *= inv_det;
if (t < 0) {
return 0;
}
else {
return 1;
}
}
__device__ void checkRayTriangleIntersection(const float* __restrict__ vertices,
const float* __restrict__ raysFrom,
const float* __restrict__ directions,
int* __restrict__ results,
int triangleIdx, int rayIdx,
int numTriangles, int numRays)
{
float triangleVerts[9], edge1[3], edge2[3];
const float *v0 = &triangleVerts[0],
*v1 = &triangleVerts[3],
*v2 = &triangleVerts[6];
for(int j = 0; j < 3; j++) { // loop by triangle v0, v1, v2
for (int k = 0; k < 3; k++) { // loop by x, y and z
triangleVerts[3*j + k] = vertices[9*(triangleIdx) + (3*j) + k];
}
}
subtract(v1, v0, edge1);
subtract(v2, v0, edge2);
// Apply Moller-Trumbore ray-triangle intersection test
const float *start = &raysFrom[3*rayIdx], *dir = &directions[3*rayIdx];
if (intersectMoller(v0, v1, v2, edge1, edge2, start, dir)) {
// printf("HIT! triangleIdx: %i rayIdx: %i\n", triangleIdx, rayIdx);
int tempRayIdx = rayIdx / 32;
int bitOffset = rayIdx % 32;
atomicOr(&results[tempRayIdx * numTriangles + triangleIdx], 1 << bitOffset);
}
}
__global__ void checkRayTriangleIntersectionKernel(const float* vertices,
const float* raysFrom,
const float* directions,
int* results,
int numTriangles, int numRays)
{
int triangleIdx = blockIdx.x * blockDim.x + threadIdx.x;
int rayIdx = blockIdx.y * blockDim.y + threadIdx.y;
if(triangleIdx < numTriangles && rayIdx < numRays){
checkRayTriangleIntersection(vertices, raysFrom, directions, results, triangleIdx, rayIdx, numTriangles, numRays);
}
}
#include "mex.h"
#include "gpu/mxGPUArray.h"
/*
* Host code for CPU
*/
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] )
{
/* Declare all variables.*/
mxGPUArray const *deviceVertices;
mxGPUArray const *deviceRaysFrom;
mxGPUArray const *deviceDirections;
mxGPUArray *deviceResults;
float const *d_vertices;
float const *d_raysFrom;
float const *d_directions;
int *d_results;
int numTriangles;
int numRays;
/* Initialize the MathWorks GPU API. */
mxInitGPU();
/* Throw an error if the input are not a CPU arrays. */
if ((mxIsGPUArray(prhs[0])) || (mxIsGPUArray(prhs[1])) || (mxIsGPUArray(prhs[2]))) {
mexErrMsgIdAndTxt("parallel:gpu:mexGPUExample:InvalidInput", "The input matrices have to be normal CPU arrays, not GPUArrays.\n");
}
/* Throw an error if the input are not the correct datatype. */
if ( mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {
mexErrMsgIdAndTxt("parallel:gpu:mexGPUExample:InvalidInput", "The vertices data matrix has to be of datatype 'single'.\n");
}
if ( mxGetClassID(prhs[1]) != mxSINGLE_CLASS) {
mexErrMsgIdAndTxt("parallel:gpu:mexGPUExample:InvalidInput", "The ray-from data matrix has to be of datatype 'single'.\n");
}
if ( mxGetClassID(prhs[2]) != mxSINGLE_CLASS) {
mexErrMsgIdAndTxt("parallel:gpu:mexGPUExample:InvalidInput", "The ray-to data matrix has to be of datatype 'single'.\n");
}
deviceVertices = mxGPUCreateFromMxArray(prhs[0]);
deviceRaysFrom = mxGPUCreateFromMxArray(prhs[1]);
deviceDirections = mxGPUCreateFromMxArray(prhs[2]);
numTriangles = (int)((float)mxGPUGetDimensions(deviceVertices)[1]) / 3;
numRays = mxGPUGetDimensions(deviceRaysFrom)[1];
/* Extract a pointer to the input data on the device. */
d_vertices = (float const *)(mxGPUGetDataReadOnly(deviceVertices));
d_raysFrom = (float const *)(mxGPUGetDataReadOnly(deviceRaysFrom));
d_directions = (float const *)(mxGPUGetDataReadOnly(deviceDirections));
// printf("numTriangles:%i numRays:%i \n", numTriangles, numRays);
/* Create a GPUArray to hold the result and get its underlying pointer. */
int numRaysLogicalStorage = ceil((float)numRays / (float)32);
mwSize dims[2] = {numTriangles, numRaysLogicalStorage};
// printf("Storing results in array of %ix%i\n", numTriangles, numRaysLogicalStorage);
deviceResults = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(deviceVertices),
dims,
mxINT32_CLASS,
mxREAL,
MX_GPU_INITIALIZE_VALUES);
d_results = (int *)(mxGPUGetData(deviceResults));
/* Execute the kernel. */
dim3 threadsPerBlock(256, 4, 1);
int gridX = (int)ceil(numTriangles / (threadsPerBlock.x*1.0));
int gridY = (int)ceil(numRays / (threadsPerBlock.y*1.0));
dim3 numBlocks(gridX, gridY, 1);
checkRayTriangleIntersectionKernel<<<numBlocks, threadsPerBlock>>>(d_vertices, d_raysFrom, d_directions, d_results, numTriangles, numRays);
/* Wrap the result up as a MATLAB gpuArray for return. */
plhs[0] = mxGPUCreateMxArrayOnCPU(deviceResults);
/*
* The mxGPUArray pointers are host-side structures that refer to device
* data. These must be destroyed before leaving the mex function.
*/
mxGPUDestroyGPUArray(deviceVertices);
mxGPUDestroyGPUArray(deviceRaysFrom);
mxGPUDestroyGPUArray(deviceDirections);
mxGPUDestroyGPUArray(deviceResults);
}