From e7c97f772c1c89466bdfacbb9392b7fed8f22bb4 Mon Sep 17 00:00:00 2001
From: Caenorst
Date: Tue, 7 Nov 2023 14:43:17 -0500
Subject: [PATCH] support torch 2.1.0 and cuda 12.1 (#765)

Signed-off-by: Clement Fuji Tsang
---
 ci/gitlab_jenkins_templates/core_ci.jenkins   |  18 +-
 .../ubuntu_build_CI.jenkins                   |   6 +-
 .../windows_build_CI.jenkins                  |   2 +-
 .../unbatched_mcube/unbatched_mcube_cuda.cu   | 215 +++++++++---------
 kaolin/csrc/ops/spc/convolution_cuda.cu       |   4 -
 kaolin/csrc/spc_utils.cuh                     |   4 -
 setup.py                                      |   4 +-
 tests/python/kaolin/io/test_materials.py      |   2 +-
 .../kaolin/ops/mesh/test_trianglemesh.py      |  52 ++---
 version.txt                                   |   2 +-
 10 files changed, 162 insertions(+), 147 deletions(-)

diff --git a/ci/gitlab_jenkins_templates/core_ci.jenkins b/ci/gitlab_jenkins_templates/core_ci.jenkins
index 81f546450..e4ea95856 100644
--- a/ci/gitlab_jenkins_templates/core_ci.jenkins
+++ b/ci/gitlab_jenkins_templates/core_ci.jenkins
@@ -9,14 +9,24 @@ gitlabCommitStatus("launch all builds") {
 // Configs for build from pytorch docker images
 // (See: https://hub.docker.com/r/pytorch/pytorch/tags)
 def ubuntu_from_pytorch_configs = [
+    [
+        // python: 3.7
+        'cudaVer': '11.3', 'cudnnVer': '8', 'torchVer': '1.12.1',
+        'archsToTest': 'MULTI'
+    ],
     [
         // python: 3.7
         'cudaVer': '11.6', 'cudnnVer': '8', 'torchVer': '1.13.1',
         'archsToTest': 'MULTI'
     ],
+    [
+        // python: 3.7
+        'cudaVer': '12.1', 'cudnnVer': '8', 'torchVer': '2.1.0',
+        'archsToTest': 'MULTI'
+    ],
     [
         // python: 3.10
-        'cudaVer': '11.7', 'cudnnVer': '8', 'torchVer': '2.0.1',
+        'cudaVer': '11.8', 'cudnnVer': '8', 'torchVer': '2.1.0',
         'archsToTest': 'MULTI'
     ]
 ]
@@ -25,7 +35,7 @@ def ubuntu_from_pytorch_configs = [
 // (See: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags)
 def ubuntu_from_nvcr_configs = [
     [
-        'baseImageTag': '22.10-py3',
+        'baseImageTag': '23.10-py3',
         'archsToTest': 'MULTI'
     ],
 ]
@@ -40,8 +50,8 @@ def ubuntu_from_cuda_configs = [
         'archsToTest': 'MULTI'
     ],
     [
-        'cudaVer': '11.8.0', 'cudnnVer': '8',
-        'pythonVer': '3.10', 'torchVer': '2.0.1',
+        'cudaVer': '12.1.0', 'cudnnVer': '8',
+        'pythonVer': '3.10', 'torchVer': '2.1.0',
         'archsToTest': 'MULTI'
     ],
 ]
diff --git a/ci/gitlab_jenkins_templates/ubuntu_build_CI.jenkins b/ci/gitlab_jenkins_templates/ubuntu_build_CI.jenkins
index 9770c82d1..bff79a338 100644
--- a/ci/gitlab_jenkins_templates/ubuntu_build_CI.jenkins
+++ b/ci/gitlab_jenkins_templates/ubuntu_build_CI.jenkins
@@ -16,7 +16,7 @@ kind: Pod
 spec:
   containers:
   - name: docker
-    image: docker:19.03.1
+    image: docker:20.10.23
     command:
     - sleep
     args:
@@ -25,7 +25,7 @@ spec:
     - name: DOCKER_HOST
      value: tcp://localhost:2375
   - name: docker-daemon
-    image: docker:19.03.1-dind
+    image: docker:20.10.23-dind
    securityContext:
      privileged: true
    env:
@@ -41,6 +41,8 @@ spec:
 ''') {
     node(POD_LABEL) {
         container("docker") {
+            // This is to let the time for the docker-daemon to get initialized.
+            sleep 10
             try {
                 stage("Checkout") {
                     checkout([
diff --git a/ci/gitlab_jenkins_templates/windows_build_CI.jenkins b/ci/gitlab_jenkins_templates/windows_build_CI.jenkins
index e399fd7b3..fd7908a35 100644
--- a/ci/gitlab_jenkins_templates/windows_build_CI.jenkins
+++ b/ci/gitlab_jenkins_templates/windows_build_CI.jenkins
@@ -36,7 +36,7 @@ spec:
       claimName: 'kaolin-pvc'
   containers:
   - name: jnlp
-    image: jenkins/jnlp-agent:latest-windows
+    image: urm.nvidia.com/sw-ipp-blossom-sre-docker-local/jnlp-agent:jdk11-windows
     env:
     - name: JENKINS_AGENT_WORKDIR
       value: C:/Jenkins/agent
diff --git a/kaolin/csrc/ops/conversions/unbatched_mcube/unbatched_mcube_cuda.cu b/kaolin/csrc/ops/conversions/unbatched_mcube/unbatched_mcube_cuda.cu
index 0dcf8cc21..2b8dd09b6 100644
--- a/kaolin/csrc/ops/conversions/unbatched_mcube/unbatched_mcube_cuda.cu
+++ b/kaolin/csrc/ops/conversions/unbatched_mcube/unbatched_mcube_cuda.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2019-2023 NVIDIA CORPORATION. All rights reserved.
 
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -11,51 +11,61 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#define CUB_NS_PREFIX namespace kaolin {
-#define CUB_NS_POSTFIX }
-#define CUB_NS_QUALIFIER ::kaolin::cub
-
-#include
-#include
 
 #include "tables.h"
 #include "helper_math.h"
 
 #include
+#include
+#include
 
 #include
 
 namespace kaolin {
 
 // textures containing look-up tables
-texture triTex;
-texture numUniqueVertsTex;
-texture numTrianglesTex;
-texture numPartialVertsTex;
-texture vertsOrderTex;
-
-void allocateTextures(at::Tensor d_triTable, at::Tensor d_numUniqueVertsTable,
-                      at::Tensor d_numTrianglesTable, at::Tensor d_numPartialVertsTable,
-                      at::Tensor d_vertsOrderTable)
-{
-  // TODO: rename allocateTextures
-  // TODO: check if texture is already binded.
-  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);
-
-  cudaMemcpy((void *) d_triTable.data_ptr(), (void *)triTable, 256*16*sizeof(int), cudaMemcpyHostToDevice);
-  cudaBindTexture(0, triTex, d_triTable.data_ptr(), channelDesc);
-
-  cudaMemcpy((void *) d_numUniqueVertsTable.data_ptr(), (void *)numUniqueVertsTable, 256*sizeof(int), cudaMemcpyHostToDevice);
-  cudaBindTexture(0, numUniqueVertsTex, d_numUniqueVertsTable.data_ptr(), channelDesc);
-
-  cudaMemcpy((void *) d_numTrianglesTable.data_ptr(), (void *)numTrianglesTable, 256*sizeof(int), cudaMemcpyHostToDevice);
-  cudaBindTexture(0, numTrianglesTex, d_numTrianglesTable.data_ptr(), channelDesc);
-
-  cudaMemcpy((void *) d_numPartialVertsTable.data_ptr(), (void *)numPartialVertsTable, 256*sizeof(int), cudaMemcpyHostToDevice);
-  cudaBindTexture(0, numPartialVertsTex, d_numPartialVertsTable.data_ptr(), channelDesc);
+cudaTextureObject_t triTex;
+cudaTextureObject_t numUniqueVertsTex;
+cudaTextureObject_t numTrianglesTex;
+cudaTextureObject_t numPartialVertsTex;
+cudaTextureObject_t vertsOrderTex;
+
+void initTextures(cudaStream_t stream) {
+  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(
+      32, 0, 0, 0, cudaChannelFormatKindUnsigned);
+  auto _init_tex = [&](cudaTextureObject_t& tex, void* hostPtr, int input_size){
+    unsigned int *devPtr = (unsigned int*) c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(
+        input_size * sizeof(unsigned int), stream);
+    AT_CUDA_CHECK(cudaMemcpyAsync((void*) devPtr, hostPtr,
+                                  input_size * sizeof(unsigned int),
+                                  cudaMemcpyHostToDevice, stream));
+
+    cudaResourceDesc resDesc = {};
+    resDesc.resType = cudaResourceTypeLinear;
+    resDesc.res.linear.devPtr = devPtr;
+    resDesc.res.linear.sizeInBytes = input_size * sizeof(unsigned int);
+    resDesc.res.linear.desc = channelDesc;
+
+    cudaTextureDesc texDesc = {};
+    texDesc.normalizedCoords = false;
+    texDesc.filterMode = cudaFilterModePoint;
+    texDesc.addressMode[0] = cudaAddressModeClamp;
+    texDesc.readMode = cudaReadModeElementType;
+    AT_CUDA_CHECK(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
+  };
+  _init_tex(triTex, (void*) triTable, 256 * 16);
+  _init_tex(numUniqueVertsTex, (void*) numUniqueVertsTable, 256);
+  _init_tex(numTrianglesTex, (void*) numTrianglesTable, 256);
+  _init_tex(numPartialVertsTex, (void*) numPartialVertsTable, 256);
+  _init_tex(vertsOrderTex, (void*) vertsOrderTable, 256 * 3);
+}
-  cudaMemcpy((void *) d_vertsOrderTable.data_ptr(), (void *)vertsOrderTable, 256*3*sizeof(int), cudaMemcpyHostToDevice);
-  cudaBindTexture(0, vertsOrderTex, d_vertsOrderTable.data_ptr(), channelDesc);
+void freeTextures() {
+  AT_CUDA_CHECK(cudaDestroyTextureObject(triTex));
+  AT_CUDA_CHECK(cudaDestroyTextureObject(numUniqueVertsTex));
+  AT_CUDA_CHECK(cudaDestroyTextureObject(numTrianglesTex));
+  AT_CUDA_CHECK(cudaDestroyTextureObject(numPartialVertsTex));
+  AT_CUDA_CHECK(cudaDestroyTextureObject(vertsOrderTex));
 }
 
 // sample volume data set at a point
@@ -85,7 +95,11 @@ __global__ void
 classifyVoxel(int *voxelOccupied, int *voxelTriangles, int *voxelPartialVerts,
               int *voxelVertsOrder,
               float* volume, int3 gridSize, int numVoxels,
-              float3 voxelSize, float isoValue)
+              float3 voxelSize, float isoValue,
+              cudaTextureObject_t numUniqueVertsTex,
+              cudaTextureObject_t numPartialVertsTex,
+              cudaTextureObject_t numTrianglesTex,
+              cudaTextureObject_t vertsOrderTex)
 {
   int blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x;
   int i = __mul24(blockId, blockDim.x) + threadIdx.x;
@@ -115,13 +129,13 @@ classifyVoxel(int *voxelOccupied, int *voxelTriangles, int *voxelPartialVerts,
   cubeindex += int(field[7] < isoValue)*128;
 
   // read number of vertices from texture for half cube
-  int numVerts = tex1Dfetch(numUniqueVertsTex, cubeindex);
-  int numPartialVerts = tex1Dfetch(numPartialVertsTex, cubeindex);
-  int numTriangles = tex1Dfetch(numTrianglesTex, cubeindex);
+  int numVerts = tex1Dfetch(numUniqueVertsTex, cubeindex);
+  int numPartialVerts = tex1Dfetch(numPartialVertsTex, cubeindex);
+  int numTriangles = tex1Dfetch(numTrianglesTex, cubeindex);
 
-  int vertsOrder1 = tex1Dfetch(vertsOrderTex, cubeindex*3);
-  int vertsOrder2 = tex1Dfetch(vertsOrderTex, cubeindex*3 + 1);
-  int vertsOrder3 = tex1Dfetch(vertsOrderTex, cubeindex*3 + 2);
+  int vertsOrder1 = tex1Dfetch(vertsOrderTex, cubeindex*3);
+  int vertsOrder2 = tex1Dfetch(vertsOrderTex, cubeindex*3 + 1);
+  int vertsOrder3 = tex1Dfetch(vertsOrderTex, cubeindex*3 + 2);
 
   if (i < numVoxels)
   {
@@ -159,10 +173,13 @@ void launch_classifyVoxel(at::Tensor voxelOccupied, at::Tensor voxelTriangles, a
   }
 
   // calculate number of vertices need per voxel
   classifyVoxel<<>>(voxelOccupied.data_ptr(),
-                    voxelTriangles.data_ptr(), voxelPartialVerts.data_ptr(),
+                    voxelTriangles.data_ptr(),
+                    voxelPartialVerts.data_ptr(),
                     voxelVertsOrder.data_ptr(),
-                    voxelgrid.data_ptr(), gridSize,
-                    numVoxels, voxelSize, isoValue);
+                    voxelgrid.data_ptr(),
+                    gridSize, numVoxels, voxelSize, isoValue,
+                    numUniqueVertsTex, numPartialVertsTex,
+                    numTrianglesTex, vertsOrderTex);
 }
 
 // compact voxel array
@@ -361,7 +378,8 @@ generateTriangles2(float *pos, int *faces, int *compactedVoxelArray,
                    int *numTrianglesScanned, int *numPartialVertsScanned,
                    int *numPartialVerts, int *voxelVertsOrder,
                    float* volume, int3 gridSize,
-                   float3 voxelSize, float isoValue, int activeVoxels, int maxVerts)
+                   float3 voxelSize, float isoValue, int activeVoxels, int maxVerts,
+                   cudaTextureObject_t triTex, cudaTextureObject_t vertsOrderTex)
 {
   int blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x;
   int grid_index = __mul24(blockId, blockDim.x) + threadIdx.x;
@@ -419,17 +437,17 @@ generateTriangles2(float *pos, int *faces, int *compactedVoxelArray,
     __shared__ float3 vertlist[12*NTHREADS];
 
     vertlist[threadIdx.x] = vertexInterp(isoValue, v[0], v[1], field[0], field[1]);
-    vertlist[NTHREADS+threadIdx.x] = vertexInterp(isoValue, v[1], v[2], field[1], field[2]);
-    vertlist[(NTHREADS*2)+threadIdx.x] = vertexInterp(isoValue, v[2], v[3], field[2], field[3]);
-    vertlist[(NTHREADS*3)+threadIdx.x] = vertexInterp(isoValue, v[3], v[0], field[3], field[0]);
-    vertlist[(NTHREADS*4)+threadIdx.x] = vertexInterp(isoValue, v[4], v[5], field[4], field[5]);
-    vertlist[(NTHREADS*5)+threadIdx.x] = vertexInterp(isoValue, v[5], v[6], field[5], field[6]);
-    vertlist[(NTHREADS*6)+threadIdx.x] = vertexInterp(isoValue, v[6], v[7], field[6], field[7]);
-    vertlist[(NTHREADS*7)+threadIdx.x] = vertexInterp(isoValue, v[7], v[4], field[7], field[4]);
-    vertlist[(NTHREADS*8)+threadIdx.x] = vertexInterp(isoValue, v[0], v[4], field[0], field[4]);
-    vertlist[(NTHREADS*9)+threadIdx.x] = vertexInterp(isoValue, v[1], v[5], field[1], field[5]);
-    vertlist[(NTHREADS*10)+threadIdx.x] = vertexInterp(isoValue, v[2], v[6], field[2], field[6]);
-    vertlist[(NTHREADS*11)+threadIdx.x] = vertexInterp(isoValue, v[3], v[7], field[3], field[7]);
+    vertlist[NTHREADS + threadIdx.x] = vertexInterp(isoValue, v[1], v[2], field[1], field[2]);
+    vertlist[(NTHREADS * 2) + threadIdx.x] = vertexInterp(isoValue, v[2], v[3], field[2], field[3]);
+    vertlist[(NTHREADS * 3) + threadIdx.x] = vertexInterp(isoValue, v[3], v[0], field[3], field[0]);
+    vertlist[(NTHREADS * 4) + threadIdx.x] = vertexInterp(isoValue, v[4], v[5], field[4], field[5]);
+    vertlist[(NTHREADS * 5) + threadIdx.x] = vertexInterp(isoValue, v[5], v[6], field[5], field[6]);
+    vertlist[(NTHREADS * 6) + threadIdx.x] = vertexInterp(isoValue, v[6], v[7], field[6], field[7]);
+    vertlist[(NTHREADS * 7) + threadIdx.x] = vertexInterp(isoValue, v[7], v[4], field[7], field[4]);
+    vertlist[(NTHREADS * 8) + threadIdx.x] = vertexInterp(isoValue, v[0], v[4], field[0], field[4]);
+    vertlist[(NTHREADS * 9) + threadIdx.x] = vertexInterp(isoValue, v[1], v[5], field[1], field[5]);
+    vertlist[(NTHREADS * 10) + threadIdx.x] = vertexInterp(isoValue, v[2], v[6], field[2], field[6]);
+    vertlist[(NTHREADS * 11) + threadIdx.x] = vertexInterp(isoValue, v[3], v[7], field[3], field[7]);
 
     __syncthreads();
@@ -438,7 +456,7 @@ generateTriangles2(float *pos, int *faces, int *compactedVoxelArray,
       float3 *v[1];
 
-      uint edge = tex1Dfetch(vertsOrderTex, (cubeindex*3) + i);
+      uint edge = tex1Dfetch(vertsOrderTex, (cubeindex*3) + i);
 
       if (edge == 255) {
        break;
      }
@@ -461,14 +479,14 @@ generateTriangles2(float *pos, int *faces, int *compactedVoxelArray,
     // Add triangles
     for (int j=0; j<16; j+=3) {
-      uint face_idx1 = tex1Dfetch(triTex, cubeindex*16 + j);
+      uint face_idx1 = tex1Dfetch(triTex, cubeindex*16 + j);
 
       if (face_idx1 == 255) {
         break;
       }
 
-      uint face_idx2 = tex1Dfetch(triTex, cubeindex*16 + j + 1);
-      uint face_idx3 = tex1Dfetch(triTex, cubeindex*16 + j + 2);
+      uint face_idx2 = tex1Dfetch(triTex, cubeindex*16 + j + 1);
+      uint face_idx3 = tex1Dfetch(triTex, cubeindex*16 + j + 2);
 
       int num_prev_verts;
       int num_prev_triangles;
@@ -511,17 +529,20 @@ void launch_generateTriangles2(at::Tensor pos, at::Tensor faces, at::Tensor comp
   dim3 grid2((int) ceil(activeVoxels/ (float) NTHREADS), 1, 1);
 
   while (grid2.x > 65535) {
-    grid2.x/=2;
-    grid2.y*=2;
+    grid2.x /= 2;
+    grid2.y *= 2;
   }
 
-  generateTriangles2<<>>(pos.data_ptr(), faces.data_ptr(),
-                         compactedVoxelArray.data_ptr(), numTrianglesScanned.data_ptr(),
-                         numPartialVertsScanned.data_ptr(), numPartialVerts.data_ptr(),
+  generateTriangles2<<>>(pos.data_ptr(),
+                         faces.data_ptr(),
+                         compactedVoxelArray.data_ptr(),
+                         numTrianglesScanned.data_ptr(),
+                         numPartialVertsScanned.data_ptr(),
+                         numPartialVerts.data_ptr(),
                          voxelVertsOrder.data_ptr(),
-                         voxelgrid.data_ptr(), gridSize,
-                         voxelSize, isoValue, activeVoxels,
-                         maxVerts);
+                         voxelgrid.data_ptr(),
+                         gridSize, voxelSize, isoValue, activeVoxels, maxVerts,
+                         triTex, vertsOrderTex);
 }
 
 void CubScanWrapper(at::Tensor output, at::Tensor input, int numElements) {
@@ -540,23 +561,19 @@ void CubScanWrapper(at::Tensor output, at::Tensor input, int numElements) {
   // Run exclusive prefix sum
   cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, numElements);
 }
+
 // at::Tensor used to store tables
-at::Tensor d_triTable;
-at::Tensor d_numUniqueVertsTable;
-at::Tensor d_numTrianglesTable;
-at::Tensor d_numPartialVertsTable;
-at::Tensor d_vertsOrderTable;
-
-void
-computeIsosurface(int3 gridSize, int3 gridSizeLog2, float isoValue,
-                  int *activeVoxels, int *totalVerts, int *totalTriangles, int *totalPartialVerts,
-                  int numVoxels, float3 voxelSize, int maxVerts, int maxFaces,
-                  at::Tensor voxelgrid, at::Tensor d_pos, at::Tensor d_faces,
-                  at::Tensor d_voxelPartialVerts,
-                  at::Tensor d_voxelTriangles,
-                  at::Tensor d_voxelOccupied,
-                  at::Tensor d_compVoxelArray,
-                  at::Tensor d_voxelVertsOrder) {
+void computeIsosurface(int3 gridSize, int3 gridSizeLog2, float isoValue,
+                       int *activeVoxels, int *totalVerts, int *totalTriangles,
+                       int *totalPartialVerts,
+                       int numVoxels, float3 voxelSize, int maxVerts, int maxFaces,
+                       at::Tensor voxelgrid, at::Tensor d_pos, at::Tensor d_faces,
+                       at::Tensor d_voxelPartialVerts,
+                       at::Tensor d_voxelTriangles,
+                       at::Tensor d_voxelOccupied,
+                       at::Tensor d_compVoxelArray,
+                       at::Tensor d_voxelVertsOrder
+) {
 
   // calculate number of vertices and triangles need per voxel
   launch_classifyVoxel(d_voxelOccupied, d_voxelTriangles, d_voxelPartialVerts,
@@ -582,8 +599,7 @@ computeIsosurface(int3 gridSize, int3 gridSizeLog2, float isoValue,
     *activeVoxels = lastElement + lastScanElement;
   }
 
-  if (activeVoxels==0)
-  {
+  if (activeVoxels==0) {
     // return if there are no full voxels
     *totalVerts = 0;
     return;
@@ -636,7 +652,12 @@ computeIsosurface(int3 gridSize, int3 gridSizeLog2, float isoValue,
                             maxVerts);
 }
 
-std::vector unbatched_mcube_forward_cuda_kernel_launcher(const at::Tensor voxelgrid, float iso_value) {
+std::vector unbatched_mcube_forward_cuda_kernel_launcher(
+    const at::Tensor voxelgrid, float iso_value) {
+  const at::cuda::OptionalCUDAGuard device_guard(at::device_of(voxelgrid));
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+
   int3 gridSizeLog2;
   int3 gridSize; // height, width, depth of voxelgrid
@@ -682,24 +703,14 @@ std::vector unbatched_mcube_forward_cuda_kernel_launcher(const at::T
   at::Tensor d_voxelVertsOrder = at::zeros({numVoxels, 3}, int_options); // tensor to store the order of added verts for each voxel
 
   // initialize static pointers
-  if (!d_triTable.defined()) {
-    d_triTable = at::zeros({256, 16}, int_options);
-    d_numUniqueVertsTable = at::zeros({256}, int_options);
-    d_numTrianglesTable = at::zeros({256}, int_options);
-    d_numPartialVertsTable = at::zeros({256}, int_options);
-    d_vertsOrderTable = at::zeros({256, 3}, int_options);
-
-    // allocate table textures after we initialize everything.
-    allocateTextures(d_triTable, d_numUniqueVertsTable, d_numTrianglesTable, d_numPartialVertsTable, d_vertsOrderTable);
-  }
+  // TODO(cfujitsang): keep the memoization
+  initTextures(stream);
 
   computeIsosurface(gridSize, gridSizeLog2, isoValue,
                     &activeVoxels, &totalVerts, &totalTriangles, &totalPartialVerts,
                     numVoxels, voxelSize, maxVerts, maxFaces,
-                    voxelgrid, d_pos, d_faces,
-                    d_voxelPartialVerts,
-                    d_voxelTriangles,
-                    d_voxelOccupied, d_compVoxelArray, d_voxelVertsOrder);
+                    voxelgrid, d_pos, d_faces, d_voxelPartialVerts,
+                    d_voxelTriangles, d_voxelOccupied, d_compVoxelArray, d_voxelVertsOrder);
 
   std::vector result;
@@ -712,7 +723,7 @@ std::vector unbatched_mcube_forward_cuda_kernel_launcher(const at::T
   if (err != cudaSuccess) {
     printf("CUDA Error: %s\n", cudaGetErrorString(err));
   }
-
+  freeTextures();
   return result;
 }
diff --git a/kaolin/csrc/ops/spc/convolution_cuda.cu b/kaolin/csrc/ops/spc/convolution_cuda.cu
index fce46ecd2..e65d48fe4 100644
--- a/kaolin/csrc/ops/spc/convolution_cuda.cu
+++ b/kaolin/csrc/ops/spc/convolution_cuda.cu
@@ -17,10 +17,6 @@
 #include "../../utils.h"
 #include "convolution.cuh"
 
-#define CUB_NS_PREFIX namespace kaolin {
-#define CUB_NS_POSTFIX }
-#define CUB_NS_QUALIFIER ::kaolin::cub
-
 #include
 #include
diff --git a/kaolin/csrc/spc_utils.cuh b/kaolin/csrc/spc_utils.cuh
index 359a6295b..4b5b2dfe7 100644
--- a/kaolin/csrc/spc_utils.cuh
+++ b/kaolin/csrc/spc_utils.cuh
@@ -16,10 +16,6 @@
 #ifndef KAOLIN_SPC_UTILS_CUH_
 #define KAOLIN_SPC_UTILS_CUH_
 
-#define CUB_NS_PREFIX namespace kaolin {
-#define CUB_NS_POSTFIX }
-#define CUB_NS_QUALIFIER ::kaolin::cub
-
 #include
 
 #include "spc_math.h"
diff --git a/setup.py b/setup.py
index ef6292e99..4a8c324a0 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
 import warnings
 
 TORCH_MIN_VER = '1.6.0'
-TORCH_MAX_VER = '2.0.1'
+TORCH_MAX_VER = '2.1.0'
 CYTHON_MIN_VER = '0.29.20'
 IGNORE_TORCH_VER = os.getenv('IGNORE_TORCH_VER') is not None
@@ -134,6 +134,8 @@ def get_cuda_bare_metal_version(cuda_dir):
             os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0;8.6"
         else:
             os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0;8.6;9.0"
+    elif int(bare_metal_major) == 12:
+        os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0;8.6;9.0"
     else:
         os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
     print(f'TORCH_CUDA_ARCH_LIST: {os.environ["TORCH_CUDA_ARCH_LIST"]}')
diff --git a/tests/python/kaolin/io/test_materials.py b/tests/python/kaolin/io/test_materials.py
index 23c49a621..dccd85b41 100644
--- a/tests/python/kaolin/io/test_materials.py
+++ b/tests/python/kaolin/io/test_materials.py
@@ -277,7 +277,7 @@ def test_cuda(self, material_values, material_textures, device, non_blocking):
                 assert cuda_val is None
             else:
                 assert torch.equal(cuda_val, val.cuda())
-                assert val.is_cpu
+                assert not val.is_cuda
 
         for param_name in _misc_attributes:
             assert getattr(mat, param_name) == getattr(cuda_mat, param_name)
diff --git a/tests/python/kaolin/ops/mesh/test_trianglemesh.py b/tests/python/kaolin/ops/mesh/test_trianglemesh.py
index e8ea4d083..d29e422cb 100644
--- a/tests/python/kaolin/ops/mesh/test_trianglemesh.py
+++ b/tests/python/kaolin/ops/mesh/test_trianglemesh.py
@@ -58,7 +58,7 @@ def test_packed_face_areas(self, device, dtype):
         output = kaolin.ops.mesh.packed_face_areas(vertices, first_idx_vertices,
                                                    faces, num_faces_per_mesh)
         expected_output = torch.tensor([0.5, 1., math.sqrt(2.)],
                                        device=device, dtype=dtype)
-        assert torch.allclose(output, expected_output)
+        check_allclose(output, expected_output)
 
 @pytest.mark.parametrize("device,dtype", FLOAT_TYPES)
 class TestSamplePoints:
@@ -130,10 +130,10 @@ def test_sample_points(self, vertices, faces, face_features,
         v0_p = points - face_vertices_choices[:, :, 0] # batch_size x num_points x 3
         len_v0_p = torch.sqrt(torch.sum(v0_p ** 2, dim=-1))
-        cos_a = torch.matmul(v0_p.reshape(-1, 1, 3),
-                             face_normals.reshape(-1, 3, 1)).reshape(
-                                 batch_size, num_samples) / len_v0_p
-        point_to_face_dist = len_v0_p * cos_a
+        point_to_face_dist = torch.matmul(
+            v0_p.reshape(-1, 1, 3),
+            face_normals.reshape(-1, 3, 1)
+        ).reshape(batch_size, num_samples)
 
         if dtype == torch.half:
             atol = 1e-2
@@ -143,10 +143,9 @@ def test_sample_points(self, vertices, faces, face_features,
             rtol = 1e-5
 
         # check that the point is close to the plan
-        assert torch.allclose(point_to_face_dist,
-                              torch.zeros((batch_size, num_samples),
-                                          device=device, dtype=dtype),
-                              atol=atol, rtol=rtol)
+        check_allclose(point_to_face_dist,
+                       torch.zeros((batch_size, num_samples), device=device, dtype=dtype),
+                       atol=atol, rtol=rtol)
 
         # check that the point lie in the triangle
         edges0 = face_vertices_choices[:, :, 1] - face_vertices_choices[:, :, 0]
@@ -201,15 +200,15 @@ def test_sample_points(self, vertices, faces, face_features,
         gt_points = torch.sum(
             face_vertices_choices * weights.unsqueeze(-1), dim=-2)
-        assert torch.allclose(points, gt_points, atol=atol, rtol=rtol)
+        check_allclose(points, gt_points, atol=atol, rtol=rtol)
 
         _face_choices = face_choices[..., None, None].repeat(1, 1, 3, feat_dim)
         face_features_choices = torch.gather(face_features, 1, _face_choices)
         gt_interpolated_features = torch.sum(
             face_features_choices * weights.unsqueeze(-1), dim=-2)
-        assert torch.allclose(interpolated_features, gt_interpolated_features,
-                              atol=atol, rtol=rtol)
+        check_allclose(interpolated_features, gt_interpolated_features,
+                       atol=atol, rtol=rtol)
 
     def test_sample_points_with_areas(self, vertices, faces, dtype, device):
         num_samples = 1000
@@ -218,7 +217,7 @@ def test_sample_points_with_areas(self, vertices, faces, dtype, device):
             kaolin.ops.mesh.sample_points)(vertices, faces, num_samples, face_areas)
         points2, face_choices2 = with_seed(1234)(
             kaolin.ops.mesh.sample_points)(vertices, faces, num_samples)
-        assert torch.allclose(points1, points2)
+        check_allclose(points1, points2)
         assert torch.equal(face_choices1, face_choices2)
 
     def test_sample_points_with_areas_with_features(self, vertices, faces,
@@ -231,9 +230,9 @@ def test_sample_points_with_areas_with_features(self, vertices, faces,
         points2, face_choices2, interpolated_features2 = with_seed(1234)(
             kaolin.ops.mesh.sample_points)(vertices, faces, num_samples,
                                            face_features=face_features)
-        assert torch.allclose(points1, points2)
+        check_allclose(points1, points2)
         assert torch.equal(face_choices1, face_choices2)
-        assert torch.allclose(interpolated_features1, interpolated_features2)
+        check_allclose(interpolated_features1, interpolated_features2)
 
     def test_diff_sample_points(self, vertices, faces, device, dtype):
         num_samples = 1000
@@ -308,10 +307,10 @@ def test_packed_sample_points(self, packed_vertices_info, packed_faces_info,
         face_normals = kaolin.ops.mesh.face_normals(face_vertices_choices, unit=True)
         v0_p = points - face_vertices_choices[:, :, 0] # batch_size x num_points x 3
         len_v0_p = torch.sqrt(torch.sum(v0_p ** 2, dim=-1))
-        cos_a = torch.matmul(v0_p.reshape(-1, 1, 3),
-                             face_normals.reshape(-1, 3, 1)).reshape(
-                                 batch_size, num_samples) / len_v0_p
-        point_to_face_dist = len_v0_p * cos_a
+        point_to_face_dist = torch.matmul(
+            v0_p.reshape(-1, 1, 3),
+            face_normals.reshape(-1, 3, 1)
+        ).reshape(batch_size, num_samples)
 
         if dtype == torch.half:
             atol = 1e-2
@@ -321,10 +320,9 @@ def test_packed_sample_points(self, packed_vertices_info, packed_faces_info,
             rtol = 1e-5
 
         # check that the point is close to the plan
-        assert torch.allclose(point_to_face_dist,
-                              torch.zeros((batch_size, num_samples),
-                                          device=device, dtype=dtype),
-                              atol=atol, rtol=rtol)
+        check_allclose(point_to_face_dist,
+                       torch.zeros((batch_size, num_samples), device=device, dtype=dtype),
+                       atol=atol, rtol=rtol)
 
         # check that the point lie in the triangle
         edges0 = face_vertices_choices[:, :, 1] - face_vertices_choices[:, :, 0]
@@ -363,7 +361,7 @@ def test_packed_sample_points_with_areas(self, packed_vertices_info, packed_face
         points2, face_choices2 = with_seed(1234)(kaolin.ops.mesh.packed_sample_points)(
             vertices, first_idx_vertices, faces, num_faces_per_mesh, num_samples)
-        assert torch.allclose(points1, points2)
+        check_allclose(points1, points2)
         assert torch.equal(face_choices1, face_choices2)
 
     def test_diff_packed_sample_points(self, packed_vertices_info, packed_faces_info,
@@ -811,21 +809,21 @@ def expected_faces_icosahedron_1_iter(self, device):
     def test_subdivide_trianglemesh_1_iter_default_alpha(self, vertices_icosahedron, faces_icosahedron, expected_vertices_default_alpha, expected_faces_icosahedron_1_iter):
         new_vertices, new_faces = kaolin.ops.mesh.subdivide_trianglemesh(
             vertices_icosahedron, faces_icosahedron, 1)
-        assert torch.allclose(new_vertices, expected_vertices_default_alpha, atol=1e-04)
+        check_allclose(new_vertices, expected_vertices_default_alpha, atol=1e-04)
         assert torch.equal(new_faces, expected_faces_icosahedron_1_iter)
 
     def test_subdivide_trianglemesh_1_iter_zero_alpha(self, vertices_icosahedron, faces_icosahedron, expected_vertices_zero_alpha, expected_faces_icosahedron_1_iter):
         alpha = torch.zeros_like(vertices_icosahedron[..., 0])
         new_vertices, new_faces = kaolin.ops.mesh.subdivide_trianglemesh(
             vertices_icosahedron, faces_icosahedron, 1, alpha)
-        assert torch.allclose(new_vertices, expected_vertices_zero_alpha, atol=1e-04)
+        check_allclose(new_vertices, expected_vertices_zero_alpha, atol=1e-04)
         assert torch.equal(new_faces, expected_faces_icosahedron_1_iter)
 
     def test_subdivide_trianglemesh_5_iter(self, vertices_icosahedron, faces_icosahedron):
         new_vertices, new_faces = kaolin.ops.mesh.subdivide_trianglemesh(
             vertices_icosahedron, faces_icosahedron, 5)
         # check total area of all faces
-        assert torch.allclose(
+        check_allclose(
             kaolin.ops.mesh.face_areas(new_vertices, new_faces).sum(),
             torch.tensor([6.2005], dtype=new_vertices.dtype, device=new_faces.device), atol=1e-4)
diff --git a/version.txt b/version.txt
index a803cc227..b4f7ccce2 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.14.0
+0.15.0a0
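
For readers unfamiliar with the API change at the heart of unbatched_mcube_cuda.cu: CUDA 12 removes the legacy texture-reference API (texture<...> globals, cudaBindTexture, implicit unbind), so the patch switches the marching-cubes lookup tables to texture objects. The following is a minimal, self-contained sketch of that pattern, not code from the patch; the names lutTable, kLutSize and readLut are hypothetical placeholders, and plain cudaMalloc stands in for the PyTorch caching allocator used by initTextures().

// build_lut_texture.cu -- sketch of the texture-object pattern used by initTextures()/freeTextures()
#include <cuda_runtime.h>

static const int kLutSize = 256;                 // hypothetical table size
static unsigned int lutTable[kLutSize];          // hypothetical host-side lookup table

__global__ void readLut(cudaTextureObject_t lutTex, unsigned int* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < kLutSize) {
    // With texture objects, tex1Dfetch takes the object as an argument,
    // instead of referring to a bound global texture reference.
    out[i] = tex1Dfetch<unsigned int>(lutTex, i);
  }
}

int main() {
  unsigned int *devLut = nullptr, *devOut = nullptr;
  cudaMalloc(&devLut, kLutSize * sizeof(unsigned int));
  cudaMalloc(&devOut, kLutSize * sizeof(unsigned int));
  cudaMemcpy(devLut, lutTable, kLutSize * sizeof(unsigned int), cudaMemcpyHostToDevice);

  // Describe the linear device buffer that backs the texture.
  cudaResourceDesc resDesc = {};
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = devLut;
  resDesc.res.linear.sizeInBytes = kLutSize * sizeof(unsigned int);
  resDesc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);

  // Point sampling, unnormalized integer coordinates, raw element reads.
  cudaTextureDesc texDesc = {};
  texDesc.normalizedCoords = false;
  texDesc.filterMode = cudaFilterModePoint;
  texDesc.addressMode[0] = cudaAddressModeClamp;
  texDesc.readMode = cudaReadModeElementType;

  cudaTextureObject_t lutTex = 0;
  cudaCreateTextureObject(&lutTex, &resDesc, &texDesc, nullptr);

  readLut<<<(kLutSize + 127) / 128, 128>>>(lutTex, devOut);
  cudaDeviceSynchronize();

  // Texture objects are destroyed explicitly; there is no bind/unbind step.
  cudaDestroyTextureObject(lutTex);
  cudaFree(devLut);
  cudaFree(devOut);
  return 0;
}

Because the object is an ordinary value, the patch can also pass the tables into classifyVoxel and generateTriangles2 as kernel parameters rather than relying on file-scope texture references.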
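The patch also drops the kaolin-specific CUB_NS_PREFIX / CUB_NS_POSTFIX / CUB_NS_QUALIFIER defines and includes CUB directly, while CubScanWrapper() keeps using cub::DeviceScan::ExclusiveSum. As a reminder of how that call works, here is a minimal sketch of CUB's standard two-pass pattern, again not the patch's code: exclusive_scan, d_in, d_out and n are hypothetical, and plain cudaMalloc replaces the PyTorch CUDA caching allocator that CubScanWrapper uses for the temporary storage.

// cub_scan_sketch.cu -- two-pass cub::DeviceScan::ExclusiveSum usage
#include <cub/device/device_scan.cuh>
#include <cuda_runtime.h>

void exclusive_scan(const int* d_in, int* d_out, int n) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // First call with a null temp pointer only queries the required scratch size.
  cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Second call performs the actual exclusive prefix sum.
  cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);
  cudaFree(d_temp_storage);
}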