From 19c793754073a0764adf740abc75f84b777c5d30 Mon Sep 17 00:00:00 2001 From: ahmed Date: Tue, 4 Jan 2022 22:24:21 -0500 Subject: [PATCH] RXMesh v0.2.0 (#3) **Highlights:** - Eliminate global index space and identify mesh elements using the combination of their patch index and their local index within the patch - All query operations no longer need to map their output to global index space - Allocating `Attributes` on a per-patch basis instead of a single large array - Introduce `Vertex/Edge/FaceHandle` to improve type safety, which is used to identify different mesh elements and index the mesh `Attributes` - Introduce `LocalVertex/Edge/FaceT` to improve type safety for the internal implementation of local index space - `Attributes` are now managed by `RXMeshStatic` using `add_vertex/edge/face_attribute()` API - Introduce `for_each_vertex/edge/face()` API in `RXMeshStatic` for simple operations on the mesh that do not require query operations (i.e., map operations) for both CUDA and OpenMP backends - Introduce `ReduceHandle` to do reduction operations on `Attribute` so reduction operation temp memory is no longer handled by `Attribute` itself - Improve the documentation for most of the user-facing APIs - Removing code related to shuffling and sorting the input mesh (it was only relevant to reproduce SIGGRAPH paper results) - Accurately report the register usage and static shared memory used by the kernel by using the (pointer to) kernel itself instead of the prototype function - Improve initializing the data structure by removing duplicate supporting structures that were created by both `RXMesh` and `Patcher` and by using OpenMP when possible in `RXMesh` **Known Issues:** - Some queries (VE, VF, and higher queries) now require more registers per thread and thus they might fail on some (less powerful) GPUs. 
Co-authored-by: Ahmed Mahmoud --- .clang-format | 4 +- .github/workflows/Ubuntu.yml | 2 +- .github/workflows/Windows.yml | 2 +- .gitignore | 1 + CMakeLists.txt | 12 +- LICENSE | 2 +- apps/Filtering/CMakeLists.txt | 2 + apps/Filtering/benchmark.sh | 5 +- apps/Filtering/filtering.cu | 67 +- apps/Filtering/filtering_openmesh.h | 79 +- apps/Filtering/filtering_rxmesh.cuh | 133 +- apps/Filtering/filtering_rxmesh_kernel.cuh | 226 +-- apps/Filtering/filtering_util.h | 32 +- apps/Geodesic/CMakeLists.txt | 2 + apps/Geodesic/benchmark.sh | 5 +- apps/Geodesic/geodesic.cu | 99 +- apps/Geodesic/geodesic_kernel.cuh | 58 +- apps/Geodesic/geodesic_ptp_openmesh.h | 148 +- apps/Geodesic/geodesic_ptp_rxmesh.h | 138 +- apps/MCF/CMakeLists.txt | 2 + apps/MCF/benchmark.sh | 5 +- apps/MCF/mcf.cu | 70 +- apps/MCF/mcf_openmesh.h | 300 ++- apps/MCF/mcf_rxmesh.h | 259 +-- apps/MCF/mcf_rxmesh_kernel.cuh | 110 +- apps/MCF/mcf_util.h | 20 +- apps/VertexNormal/CMakeLists.txt | 2 + apps/VertexNormal/benchmark.sh | 5 +- apps/VertexNormal/vertex_normal.cu | 113 +- apps/VertexNormal/vertex_normal_hardwired.cuh | 73 +- apps/VertexNormal/vertex_normal_kernel.cuh | 18 +- apps/VertexNormal/vertex_normal_ref.h | 78 +- apps/common/openmesh_report.h | 10 +- apps/common/openmesh_trimesh.h | 2 +- cmake/AutoDetectCudaArch.cmake | 24 +- include/rxmesh/attribute.h | 884 +++++++++ include/rxmesh/context.h | 111 ++ include/rxmesh/handle.h | 284 +++ include/rxmesh/iterator.cuh | 142 ++ include/rxmesh/kernels/attribute.cuh | 93 + include/rxmesh/kernels/collective.cuh | 6 +- include/rxmesh/kernels/debug.cuh | 6 +- include/rxmesh/kernels/for_each.cuh | 54 + include/rxmesh/kernels/get_arch.cuh | 4 +- include/rxmesh/kernels/loader.cuh | 285 +++ include/rxmesh/kernels/prototype.cuh | 63 - include/rxmesh/kernels/query_dispatcher.cuh | 415 +++++ include/rxmesh/kernels/rxmesh_attribute.cuh | 87 - include/rxmesh/kernels/rxmesh_iterator.cuh | 128 -- include/rxmesh/kernels/rxmesh_loader.cuh | 171 -- 
include/rxmesh/kernels/rxmesh_queries.cuh | 255 ++- .../kernels/rxmesh_query_dispatcher.cuh | 404 ---- include/rxmesh/kernels/util.cuh | 50 +- include/rxmesh/launch_box.h | 14 +- include/rxmesh/local.h | 77 + include/rxmesh/patch_info.h | 43 + include/rxmesh/patcher/patcher.cu | 1175 ++++-------- include/rxmesh/patcher/patcher.h | 194 +- include/rxmesh/patcher/patcher_kernel.cuh | 59 +- include/rxmesh/reduce_handle.h | 140 ++ include/rxmesh/rxmesh.cpp | 1649 ++++++----------- include/rxmesh/rxmesh.h | 455 ++--- include/rxmesh/rxmesh_attribute.h | 866 --------- include/rxmesh/rxmesh_context.h | 284 --- include/rxmesh/rxmesh_static.h | 808 ++++++-- include/rxmesh/rxmesh_util.h | 29 - include/rxmesh/types.h | 125 ++ include/rxmesh/util/cuda_query.h | 59 +- include/rxmesh/util/export_tools.h | 4 +- include/rxmesh/util/import_obj.h | 189 +- include/rxmesh/util/log.h | 34 +- include/rxmesh/util/macros.h | 32 +- include/rxmesh/util/math.h | 201 -- include/rxmesh/util/meta.h | 51 + include/rxmesh/util/report.h | 96 +- include/rxmesh/util/timer.h | 4 +- include/rxmesh/util/util.h | 149 +- include/rxmesh/util/vector.h | 52 +- tests/RXMesh_test/CMakeLists.txt | 5 +- tests/RXMesh_test/benchmark.sh | 11 +- tests/RXMesh_test/higher_query.cuh | 87 +- tests/RXMesh_test/query.cuh | 55 +- tests/RXMesh_test/rxmesh_test.h | 712 ++++--- tests/RXMesh_test/rxmesh_test_main.cu | 31 +- tests/RXMesh_test/test_attribute.cu | 458 ----- tests/RXMesh_test/test_attribute.cuh | 170 ++ tests/RXMesh_test/test_for_each.h | 37 + tests/RXMesh_test/test_higher_queries.h | 60 +- tests/RXMesh_test/test_iterator.cu | 126 +- tests/RXMesh_test/test_queries.h | 496 +++-- tests/RXMesh_test/test_util.cu | 358 ++-- tests/RXMesh_test/test_vector.cu | 4 +- 92 files changed, 7197 insertions(+), 7752 deletions(-) create mode 100644 include/rxmesh/attribute.h create mode 100644 include/rxmesh/context.h create mode 100644 include/rxmesh/handle.h create mode 100644 include/rxmesh/iterator.cuh create mode 100644 
include/rxmesh/kernels/attribute.cuh create mode 100644 include/rxmesh/kernels/for_each.cuh create mode 100644 include/rxmesh/kernels/loader.cuh delete mode 100644 include/rxmesh/kernels/prototype.cuh create mode 100644 include/rxmesh/kernels/query_dispatcher.cuh delete mode 100644 include/rxmesh/kernels/rxmesh_attribute.cuh delete mode 100644 include/rxmesh/kernels/rxmesh_iterator.cuh delete mode 100644 include/rxmesh/kernels/rxmesh_loader.cuh delete mode 100644 include/rxmesh/kernels/rxmesh_query_dispatcher.cuh create mode 100644 include/rxmesh/local.h create mode 100644 include/rxmesh/patch_info.h create mode 100644 include/rxmesh/reduce_handle.h delete mode 100644 include/rxmesh/rxmesh_attribute.h delete mode 100644 include/rxmesh/rxmesh_context.h delete mode 100644 include/rxmesh/rxmesh_util.h create mode 100644 include/rxmesh/types.h delete mode 100644 include/rxmesh/util/math.h create mode 100644 include/rxmesh/util/meta.h mode change 100644 => 100755 tests/RXMesh_test/benchmark.sh delete mode 100644 tests/RXMesh_test/test_attribute.cu create mode 100644 tests/RXMesh_test/test_attribute.cuh create mode 100644 tests/RXMesh_test/test_for_each.h diff --git a/.clang-format b/.clang-format index 45af25e2..dd42a951 100644 --- a/.clang-format +++ b/.clang-format @@ -4,14 +4,16 @@ BasedOnStyle: Chromium TabWidth: 4 UseTab: Never +AlignConsecutiveAssignments: true AllowShortFunctionsOnASingleLine: false AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakTemplateDeclarations: true AlignTrailingComments: true +BinPackArguments: false BinPackParameters: false BreakBeforeTernaryOperators: false -ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true Cpp11BracedListStyle: true IndentCaseLabels: true IndentWidth: 4 diff --git a/.github/workflows/Ubuntu.yml b/.github/workflows/Ubuntu.yml index dfa4823c..c618db96 100644 --- a/.github/workflows/Ubuntu.yml +++ 
b/.github/workflows/Ubuntu.yml @@ -1,5 +1,5 @@ name: Ubuntu -on: [push, pull_request] +on: [push, pull_request, workflow_dispatch] jobs: UbuntuRun: runs-on: ubuntu-latest diff --git a/.github/workflows/Windows.yml b/.github/workflows/Windows.yml index 4ea07595..575fbc68 100644 --- a/.github/workflows/Windows.yml +++ b/.github/workflows/Windows.yml @@ -1,5 +1,5 @@ name: Windows -on: [push, pull_request] +on: [push, pull_request, workflow_dispatch] jobs: WindowsRun: runs-on: windows-latest diff --git a/.gitignore b/.gitignore index cda600a3..6b9aad91 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ output/ +input/ build/ include/rxmesh/util/git_sha1.cpp .vscode/ \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index f41a40e9..ea45e707 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,11 @@ -cmake_minimum_required(VERSION 3.15 FATAL_ERROR) +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) cmake_policy(SET CMP0104 OLD) endif() project(RXMesh - VERSION 0.1.0 + VERSION 0.2.0 LANGUAGES C CXX CUDA) set(CMAKE_CXX_STANDARD 17) @@ -86,15 +86,16 @@ target_sources(RXMesh_header_lib # CUDA and C++ compiler flags set(cxx_flags - $<$:-D_SCL_SECURE_NO_WARNINGS /openmp /std:c++17> #Add MSVC-specific compiler flags here - $<$:-Wall -m64 -fopenmp -O3 -std=c++17> #Add GCC/Clang-specific compiler flags here + $<$:-D_SCL_SECURE_NO_WARNINGS /openmp /std:c++17> #Add MSVC-specific compiler flags here + $<$:-Wall -m64 -fopenmp -O3 -std=c++17 -Wno-unused-function> #Add GCC/Clang-specific compiler flags here ) set(cuda_flags - -Xcompiler=$<$:-Wall -fopenmp -O3> + -Xcompiler=$<$:-Wall -fopenmp -O3 -Wno-unused-function> #Disables warning #177-D "function XXX was declared but never referenced" -Xcudafe "--display_error_number --diag_suppress=177" ${CUDA_ARCHS} + -rdc=true -lineinfo --expt-extended-lambda -use_fast_math @@ -111,7 +112,6 @@ target_compile_options(developer_flags INTERFACE $<$:${cuda_flags}> ) - 
target_link_libraries(RXMesh_header_lib INTERFACE $) #OpenMP diff --git a/LICENSE b/LICENSE index e1826355..ed791707 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2021, owensgroup +Copyright (c) 2022, owensgroup All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/apps/Filtering/CMakeLists.txt b/apps/Filtering/CMakeLists.txt index 57b2309f..8090e5d3 100644 --- a/apps/Filtering/CMakeLists.txt +++ b/apps/Filtering/CMakeLists.txt @@ -27,6 +27,8 @@ endif() set_target_properties(Filtering PROPERTIES FOLDER "apps") +set_property(TARGET Filtering PROPERTY CUDA_SEPARABLE_COMPILATION ON) + source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "Filtering" FILES ${SOURCE_LIST}) target_link_libraries(Filtering diff --git a/apps/Filtering/benchmark.sh b/apps/Filtering/benchmark.sh index 2cb5fcb9..cb437324 100644 --- a/apps/Filtering/benchmark.sh +++ b/apps/Filtering/benchmark.sh @@ -1,5 +1,4 @@ #!/bin/bash -echo "This script re-generates RXMesh data in Figure 8(c) in the paper." echo "Please make sure to first compile the source code and then enter the input OBJ files directory." 
read -p "OBJ files directory (no trailing slash): " input_dir @@ -16,7 +15,7 @@ device_id=0 for file in $input_dir/*.obj; do if [ -f "$file" ]; then - echo $exe -p -input "$file" -num_filter_iter 5 -device_id $device_id - $exe -p -input "$file" -num_filter_iter 5 -device_id $device_id + echo $exe -input "$file" -num_filter_iter 5 -device_id $device_id + $exe -input "$file" -num_filter_iter 5 -device_id $device_id fi done \ No newline at end of file diff --git a/apps/Filtering/filtering.cu b/apps/Filtering/filtering.cu index 5e016041..3a85ee2a 100644 --- a/apps/Filtering/filtering.cu +++ b/apps/Filtering/filtering.cu @@ -13,15 +13,12 @@ struct arg { - std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj"; - std::string output_folder = STRINGIFY(OUTPUT_DIR); - uint32_t device_id = 0; + std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj"; + std::string output_folder = STRINGIFY(OUTPUT_DIR); + uint32_t device_id = 0; uint32_t num_filter_iter = 5; char** argv; int argc; - bool shuffle = false; - bool sort = false; - } Arg; #include "filtering_openmesh.h" @@ -29,18 +26,9 @@ struct arg TEST(App, Filtering) { - using namespace RXMESH; + using namespace rxmesh; using dataT = float; - - if (Arg.shuffle) { - ASSERT_FALSE(Arg.sort) << " cannot shuffle and sort at the same time!"; - } - if (Arg.sort) { - ASSERT_FALSE(Arg.shuffle) - << " cannot shuffle and sort at the same time!"; - } - // Select device cuda_query(Arg.device_id); @@ -50,42 +38,27 @@ TEST(App, Filtering) std::vector> Verts; ASSERT_TRUE(import_obj(Arg.obj_file_name, Verts, Faces)); - if (Arg.shuffle) { - shuffle_obj(Faces, Verts); - } - // Create RXMeshStatic instance. 
If Arg.sort is true, Faces and Verts will - // be sorted based on the patching happening inside RXMesh - RXMeshStatic rxmesh_static(Faces, Verts, Arg.sort, false); + TriMesh input_mesh; + ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name)); + ASSERT_EQ(input_mesh.n_vertices(), Verts.size()); - // Since OpenMesh only accepts input as obj files, if the input mesh is - // shuffled or sorted, we have to write it to a temp file so that OpenMesh - // can pick it up - TriMesh input_mesh; - if (Arg.sort || Arg.shuffle) { - export_obj(Faces, Verts, "temp.obj", false); - ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, "temp.obj")); - } else { - ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name)); - } + // OpenMesh Impl + std::vector> ground_truth(Verts); + size_t max_neighbour_size = 0; + filtering_openmesh( + omp_get_max_threads(), input_mesh, ground_truth, max_neighbour_size); - //*** OpenMesh Impl - RXMESH::RXMeshAttribute ground_truth; - size_t max_neighbour_size = 0; - filtering_openmesh(omp_get_max_threads(), input_mesh, ground_truth, max_neighbour_size); - - //*** RXMesh Impl - filtering_rxmesh(rxmesh_static, Verts, ground_truth, max_neighbour_size); + // RXMesh Impl + filtering_rxmesh(Faces, Verts, ground_truth, max_neighbour_size); - // Release allocation - ground_truth.release(); } int main(int argc, char** argv) { - using namespace RXMESH; + using namespace rxmesh; Log::init(); ::testing::InitGoogleTest(&argc, argv); @@ -101,9 +74,7 @@ int main(int argc, char** argv) " Default is {} \n" " Hint: Only accepts OBJ files\n" " -o: JSON file output folder. Default is {} \n" - " -num_filter_iter: Iteration count. Default is {} \n" - " -s: Shuffle input. Default is false.\n" - " -p: Sort input using patching output. Default is false.\n" + " -num_filter_iter: Iteration count. Default is {} \n" " -device_id: GPU device ID. 
Default is {}", Arg.obj_file_name, Arg.output_folder ,Arg.num_filter_iter ,Arg.device_id); // clang-format on @@ -123,12 +94,6 @@ int main(int argc, char** argv) Arg.output_folder = std::string(get_cmd_option(argv, argv + argc, "-o")); } - if (cmd_option_exists(argv, argc + argv, "-s")) { - Arg.shuffle = true; - } - if (cmd_option_exists(argv, argc + argv, "-p")) { - Arg.sort = true; - } if (cmd_option_exists(argv, argc + argv, "-device_id")) { Arg.device_id = atoi(get_cmd_option(argv, argv + argc, "-device_id")); diff --git a/apps/Filtering/filtering_openmesh.h b/apps/Filtering/filtering_openmesh.h index dae4481e..5540ace5 100644 --- a/apps/Filtering/filtering_openmesh.h +++ b/apps/Filtering/filtering_openmesh.h @@ -4,7 +4,6 @@ #include #include "../common/openmesh_report.h" #include "../common/openmesh_trimesh.h" -#include "rxmesh/rxmesh_attribute.h" /** *computeSigma_s() @@ -17,19 +16,19 @@ double computeSigma_s( { - float offset = 0; - float sum = 0; + float offset = 0; + float sum = 0; float sum_sqs = 0; - size_t count = vertex_neighbour.size(); + size_t count = vertex_neighbour.size(); for (size_t i = 0; i < count; ++i) { TriMesh::Point pj = mesh.point(vertex_neighbour[i]); - float t = (pj - pi) | ni; - t = sqrt(t * t); + float t = (pj - pi) | ni; + t = sqrt(t * t); sum += t; sum_sqs += t * t; } float c = static_cast(count); - offset = (sum_sqs / c) - ((sum * sum) / (c * c)); + offset = (sum_sqs / c) - ((sum * sum) / (c * c)); float sigma_s = (sqrt(offset) < 1.0e-12) ? 
(sqrt(offset) + 1.0e-12) : sqrt(offset); @@ -50,17 +49,18 @@ void getAdaptiveVertexNeighbor( mark[vh.idx()] = true; queue_vertex_handle.push(vh); float radius = 2.0 * sigma_c; - TriMesh::Point ci = mesh.point(vh); + TriMesh::Point ci = mesh.point(vh); while (!queue_vertex_handle.empty()) { TriMesh::VertexHandle vh = queue_vertex_handle.front(); vertex_neighbor.push_back(vh); queue_vertex_handle.pop(); for (TriMesh::VertexVertexIter vv_it = mesh.vv_iter(vh); - vv_it.is_valid(); ++vv_it) { + vv_it.is_valid(); + ++vv_it) { TriMesh::VertexHandle vh_neighbor = *vv_it; if (mark[vh_neighbor.idx()] == false) { - TriMesh::Point cj = mesh.point(vh_neighbor); + TriMesh::Point cj = mesh.point(vh_neighbor); float length = (cj - ci).length(); if (length <= radius) queue_vertex_handle.push(vh_neighbor); @@ -71,10 +71,10 @@ void getAdaptiveVertexNeighbor( } template -void filtering_openmesh(const int num_omp_threads, - TriMesh& input_mesh, - RXMESH::RXMeshAttribute& filtered_coord, - size_t& max_neighbour_size) +void filtering_openmesh(const int num_omp_threads, + TriMesh& input_mesh, + std::vector>& filtered_coord, + size_t& max_neighbour_size) { // Report OpenMeshReport report("Filtering_OpenMesh"); @@ -84,18 +84,8 @@ void filtering_openmesh(const int num_omp_threads, std::string method = "OpenMesh " + std::to_string(num_omp_threads) + " Core"; report.add_member("method", method); - std::string order = "default"; - if (Arg.shuffle) { - order = "shuffle"; - } else if (Arg.sort) { - order = "sorted"; - } - report.add_member("input_order", order); report.add_member("num_filter_iter", Arg.num_filter_iter); - // Allocate space for the filtered output coordinates - filtered_coord.init(input_mesh.n_vertices(), 3u, RXMESH::HOST); - filtered_coord.reset(0.0, RXMESH::HOST); // this where each thread will store its neighbour vertices // we allocate enough space such that each thread can store as much @@ -109,7 +99,7 @@ void filtering_openmesh(const int num_omp_threads, max_neighbour_size = 
0; - RXMESH::CPUTimer timer; + rxmesh::CPUTimer timer; timer.start(); for (uint32_t itr = 0; itr < Arg.num_filter_iter; ++itr) { @@ -127,12 +117,13 @@ void filtering_openmesh(const int num_omp_threads, int tid = omp_get_thread_num(); // calculate sigma_c - TriMesh::Point pi = input_mesh.point(*v_it); - TriMesh::Normal ni = input_mesh.normal(*v_it); + TriMesh::Point pi = input_mesh.point(*v_it); + TriMesh::Normal ni = input_mesh.normal(*v_it); float sigma_c = 1e10; for (TriMesh::VertexVertexIter vv_it = input_mesh.vv_iter(*v_it); - vv_it.is_valid(); vv_it++) { - TriMesh::Point pj = input_mesh.point(*vv_it); + vv_it.is_valid(); + vv_it++) { + TriMesh::Point pj = input_mesh.point(*vv_it); float length = (pi - pj).length(); if (length < sigma_c) { sigma_c = length; @@ -141,8 +132,8 @@ void filtering_openmesh(const int num_omp_threads, // get the neighbor vertices vertex_neighbour[tid].clear(); - getAdaptiveVertexNeighbor(input_mesh, *v_it, sigma_c, - vertex_neighbour[tid]); + getAdaptiveVertexNeighbor( + input_mesh, *v_it, sigma_c, vertex_neighbour[tid]); max_neighbour_size = max(max_neighbour_size, vertex_neighbour[tid].size()); @@ -150,24 +141,24 @@ void filtering_openmesh(const int num_omp_threads, float sigma_s = computeSigma_s(vertex_neighbour[tid], input_mesh, pi, ni); - float sum = 0; + float sum = 0; float normalizer = 0; // calculate new vertex position for (int iv = 0; iv < (int)vertex_neighbour[tid].size(); iv++) { TriMesh::Point pj = input_mesh.point(vertex_neighbour[tid][iv]); - float t = (pi - pj).length(); - float h = (pj - pi) | ni; + float t = (pi - pj).length(); + float h = (pj - pi) | ni; float wc = std::exp(-0.5 * t * t / (sigma_c * sigma_c)); float ws = std::exp(-0.5 * h * h / (sigma_s * sigma_s)); sum += wc * ws * h; normalizer += wc * ws; } - auto updated_point = pi + ni * (sum / normalizer); - filtered_coord(vert, 0) = updated_point[0]; - filtered_coord(vert, 1) = updated_point[1]; - filtered_coord(vert, 2) = updated_point[2]; + auto 
updated_point = pi + ni * (sum / normalizer); + filtered_coord[vert][0] = updated_point[0]; + filtered_coord[vert][1] = updated_point[1]; + filtered_coord[vert][2] = updated_point[2]; } // update the mesh for the next iterations (needed to update the @@ -176,9 +167,9 @@ void filtering_openmesh(const int num_omp_threads, for (int vert = 0; vert < num_vertrices; vert++) { TriMesh::VertexIter v_it = input_mesh.vertices_begin() + vert; TriMesh::Point p; - p[0] = filtered_coord(vert, 0); - p[1] = filtered_coord(vert, 1); - p[2] = filtered_coord(vert, 2); + p[0] = filtered_coord[vert][0]; + p[1] = filtered_coord[vert][1]; + p[2] = filtered_coord[vert][2]; input_mesh.set_point(*v_it, p); } } @@ -202,13 +193,13 @@ void filtering_openmesh(const int num_omp_threads, // Finalize report report.add_member("total_time (ms)", timer.elapsed_millis()); - RXMESH::TestData td; - td.test_name = "MCF"; + rxmesh::TestData td; + td.test_name = "MCF"; td.num_threads = num_omp_threads; td.time_ms.push_back(timer.elapsed_millis()); td.passed.push_back(true); report.add_test(td); report.write( Arg.output_folder + "/openmesh", - "MCF_OpenMesh_" + RXMESH::extract_file_name(Arg.obj_file_name)); + "MCF_OpenMesh_" + rxmesh::extract_file_name(Arg.obj_file_name)); } \ No newline at end of file diff --git a/apps/Filtering/filtering_rxmesh.cuh b/apps/Filtering/filtering_rxmesh.cuh index b9301112..ff528279 100644 --- a/apps/Filtering/filtering_rxmesh.cuh +++ b/apps/Filtering/filtering_rxmesh.cuh @@ -3,20 +3,20 @@ #include #include "filtering_rxmesh_kernel.cuh" -#include "rxmesh/rxmesh_attribute.h" +#include "rxmesh/attribute.h" #include "rxmesh/util/report.h" #include "rxmesh/util/timer.h" /** * filtering_rxmesh() */ -template -void filtering_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, - std::vector>& Verts, - const RXMESH::RXMeshAttribute& ground_truth, - const size_t max_neighbour_size) +template +void filtering_rxmesh(std::vector>& Faces, + const std::vector>& Verts, + const std::vector>& 
ground_truth, + const size_t max_neighbour_size) { - using namespace RXMESH; + using namespace rxmesh; constexpr uint32_t maxVVSize = 20 * 4; @@ -25,132 +25,115 @@ void filtering_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, "greater than maxVVSize. Should increase maxVVSize to " << max_neighbour_size << " to avoid illegal memory access"; + RXMeshStatic rxmesh(Faces, false); + // Report Report report("Filtering_RXMesh"); report.command_line(Arg.argc, Arg.argv); report.device(); report.system(); - report.model_data(Arg.obj_file_name, rxmesh_static); + report.model_data(Arg.obj_file_name, rxmesh); report.add_member("method", std::string("RXMesh")); - std::string order = "default"; - if (Arg.shuffle) { - order = "shuffle"; - } else if (Arg.sort) { - order = "sorted"; - } - report.add_member("input_order", order); report.add_member("num_filter_iter", Arg.num_filter_iter); // input coords - RXMeshAttribute coords; - coords.set_name("coords"); - coords.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::LOCATION_ALL); - for (uint32_t i = 0; i < Verts.size(); ++i) { - for (uint32_t j = 0; j < Verts[i].size(); ++j) { - coords(i, j) = Verts[i][j]; - } - } - coords.move(RXMESH::HOST, RXMESH::DEVICE); + auto coords = rxmesh.add_vertex_attribute(Verts, "coords"); // Vertex normals (only on device) - RXMeshAttribute vertex_normal; - vertex_normal.set_name("vertex_normal"); - vertex_normal.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::DEVICE); - vertex_normal.reset(0.0, RXMESH::DEVICE); + auto vertex_normal = rxmesh.add_vertex_attribute("vn", 3, DEVICE); + vertex_normal->reset(0, DEVICE); // Filtered coordinates - RXMeshAttribute filtered_coord; - filtered_coord.set_name("filtered_coord"); - filtered_coord.init(rxmesh_static.get_num_vertices(), 3u, - RXMESH::LOCATION_ALL); - filtered_coord.reset(0.0, RXMESH::LOCATION_ALL); - filtered_coord.move(RXMESH::HOST, RXMESH::DEVICE); + auto filtered_coord = + rxmesh.add_vertex_attribute("filtered", 3, LOCATION_ALL); + 
filtered_coord->reset(0, LOCATION_ALL); // vertex normal launch box constexpr uint32_t vn_block_threads = 256; LaunchBox vn_launch_box; - rxmesh_static.prepare_launch_box(RXMESH::Op::FV, vn_launch_box); + rxmesh.prepare_launch_box(rxmesh::Op::FV, + vn_launch_box, + (void*)compute_vertex_normal); // filter launch box constexpr uint32_t filter_block_threads = 512; LaunchBox filter_launch_box; - rxmesh_static.prepare_launch_box(RXMESH::Op::VV, filter_launch_box, true); + rxmesh.prepare_launch_box( + rxmesh::Op::VV, + filter_launch_box, + (void*)bilateral_filtering); // double buffer - RXMeshAttribute* double_buffer[2] = {&coords, &filtered_coord}; + VertexAttribute* double_buffer[2] = {coords.get(), filtered_coord.get()}; - cudaStream_t stream; - CUDA_ERROR(cudaStreamCreate(&stream)); CUDA_ERROR(cudaProfilerStart()); GPUTimer timer; timer.start(); uint32_t d = 0; for (uint32_t itr = 0; itr < Arg.num_filter_iter; ++itr) { - vertex_normal.reset(0, RXMESH::DEVICE, stream); + vertex_normal->reset(0, rxmesh::DEVICE); // update vertex normal before filtering compute_vertex_normal - <<>>( - rxmesh_static.get_context(), *double_buffer[d], vertex_normal); + <<>>( + rxmesh.get_context(), *double_buffer[d], *vertex_normal); bilateral_filtering - <<>>( - rxmesh_static.get_context(), *double_buffer[d], - *double_buffer[!d], vertex_normal); + <<>>(rxmesh.get_context(), + *double_buffer[d], + *double_buffer[!d], + *vertex_normal); d = !d; - CUDA_ERROR(cudaStreamSynchronize(stream)); + CUDA_ERROR(cudaDeviceSynchronize()); } timer.stop(); - CUDA_ERROR(cudaDeviceSynchronize()); CUDA_ERROR(cudaGetLastError()); CUDA_ERROR(cudaProfilerStop()); - CUDA_ERROR(cudaStreamDestroy(stream)); RXMESH_TRACE("filtering_rxmesh() took {} (ms) (i.e., {} ms/iter) ", timer.elapsed_millis(), timer.elapsed_millis() / float(Arg.num_filter_iter)); // move output to host - coords.copy(*double_buffer[d], RXMESH::DEVICE, RXMESH::HOST); + coords->copy_from(*double_buffer[d], rxmesh::DEVICE, rxmesh::HOST); // 
output to obj - // rxmesh_static.exportOBJ( - // "output_rxmesh" + std::to_string(Arg.num_filter_iter) + ".obj", - // [&](uint32_t i, uint32_t j) { return coords(i, j); }); + // rxmesh.export_obj(STRINGIFY(OUTPUT_DIR) "output_rxmesh" + + // std::to_string(Arg.num_filter_iter) + ".obj", + // *coords); // Verify - bool passed = true; const T tol = 0.01; - for (uint32_t v = 0; v < coords.get_num_mesh_elements(); ++v) { - const Vector<3, T> gt(ground_truth(v, 0), ground_truth(v, 1), - ground_truth(v, 2)); - const Vector<3, T> co(coords(v, 0), coords(v, 1), coords(v, 2)); - - if (std::fabs(co[0] - gt[0]) > tol || std::fabs(co[1] - gt[1]) > tol || - std::fabs(co[2] - gt[2]) > tol) { - passed = false; - break; - } - } - - EXPECT_TRUE(passed); - - // Release allocation - filtered_coord.release(); - coords.release(); - vertex_normal.release(); + rxmesh.for_each_vertex(HOST, [&](const VertexHandle& vh) { + uint32_t v_id = rxmesh.map_to_global(vh); + const Vector<3, T> gt(ground_truth[v_id][0], + ground_truth[v_id][1], + ground_truth[v_id][2]); + const Vector<3, T> co( + (*coords)(vh, 0), (*coords)(vh, 1), (*coords)(vh, 2)); + + EXPECT_LT(std::fabs((*coords)(vh, 0) - ground_truth[v_id][0]), tol); + EXPECT_LT(std::fabs((*coords)(vh, 1) - ground_truth[v_id][1]), tol); + EXPECT_LT(std::fabs((*coords)(vh, 2) - ground_truth[v_id][2]), tol); + }); // Finalize report TestData td; - td.test_name = "Filtering"; - td.passed.push_back(passed); + td.test_name = "Filtering"; + td.num_threads = filter_launch_box.num_threads; + td.num_blocks = filter_launch_box.blocks; + td.dyn_smem = filter_launch_box.smem_bytes_dyn; + td.static_smem = filter_launch_box.smem_bytes_static; + td.num_reg = filter_launch_box.num_registers_per_thread; td.time_ms.push_back(timer.elapsed_millis()); report.add_test(td); report.write(Arg.output_folder + "/rxmesh", diff --git a/apps/Filtering/filtering_rxmesh_kernel.cuh b/apps/Filtering/filtering_rxmesh_kernel.cuh index 39ddd968..b8a53300 100644 --- 
a/apps/Filtering/filtering_rxmesh_kernel.cuh +++ b/apps/Filtering/filtering_rxmesh_kernel.cuh @@ -4,10 +4,9 @@ #include #include "filtering_util.h" -#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh" -#include "rxmesh/rxmesh_attribute.h" -#include "rxmesh/rxmesh_context.h" -#include "rxmesh/util/math.h" +#include "rxmesh/attribute.h" +#include "rxmesh/context.h" +#include "rxmesh/kernels/query_dispatcher.cuh" #include "rxmesh/util/vector.h" constexpr float EPS = 10e-6; @@ -17,35 +16,30 @@ constexpr float EPS = 10e-6; * compute_vertex_normal() */ template -__launch_bounds__(blockThreads, 6) __global__ - static void compute_vertex_normal(const RXMESH::RXMeshContext context, - RXMESH::RXMeshAttribute coords, - RXMESH::RXMeshAttribute normals) +__global__ static void compute_vertex_normal(const rxmesh::Context context, + rxmesh::VertexAttribute coords, + rxmesh::VertexAttribute normals) { - using namespace RXMESH; - auto vn_lambda = [&](uint32_t face_id, RXMeshIterator& iter) { + using namespace rxmesh; + auto vn_lambda = [&](FaceHandle face_id, VertexIterator& fv) { // this face's three vertices - uint32_t v0(iter[0]), v1(iter[1]), v2(iter[2]); + VertexHandle v0(fv[0]), v1(fv[1]), v2(fv[2]); + + // get the face's three vertices coordinates + Vector<3, T> c0(coords(fv[0], 0), coords(fv[0], 1), coords(fv[0], 2)); + Vector<3, T> c1(coords(fv[1], 0), coords(fv[1], 1), coords(fv[1], 2)); + Vector<3, T> c2(coords(fv[2], 0), coords(fv[2], 1), coords(fv[2], 2)); // compute the face normal - const Vector<3, T> v0c(coords(v0, 0), coords(v0, 1), coords(v0, 2)); - const Vector<3, T> v1c(coords(v1, 0), coords(v1, 1), coords(v1, 2)); - const Vector<3, T> v2c(coords(v2, 0), coords(v2, 1), coords(v2, 2)); - Vector<3, T> n = cross(v1c - v0c, v2c - v0c); + Vector<3, T> n = cross(c1 - c0, c2 - c0); n.normalize(); // add the face's normal to its vertices - atomicAdd(&normals(v0, 0), n[0]); - atomicAdd(&normals(v0, 1), n[1]); - atomicAdd(&normals(v0, 2), n[2]); - - 
atomicAdd(&normals(v1, 0), n[0]); - atomicAdd(&normals(v1, 1), n[1]); - atomicAdd(&normals(v1, 2), n[2]); - - atomicAdd(&normals(v2, 0), n[0]); - atomicAdd(&normals(v2, 1), n[1]); - atomicAdd(&normals(v2, 2), n[2]); + for (uint32_t v = 0; v < 3; ++v) { // for every vertex in this face + for (uint32_t i = 0; i < 3; ++i) { // for the vertex 3 coordinates + atomicAdd(&normals(fv[v], i), n[i]); + } + } }; query_block_dispatcher(context, vn_lambda); @@ -57,26 +51,26 @@ __launch_bounds__(blockThreads, 6) __global__ */ template __device__ __inline__ void compute_new_coordinates( - const uint32_t v_id, - const uint32_t vv[], + const rxmesh::VertexHandle& v_id, + const rxmesh::VertexHandle vv[], const uint8_t num_vv, - RXMESH::Vector<3, T>& v, - const RXMESH::Vector<3, T>& n, + rxmesh::Vector<3, T>& v, + const rxmesh::Vector<3, T>& n, const T sigma_c_sq, - const RXMESH::RXMeshAttribute& input_coords, - RXMESH::RXMeshAttribute& filtered_coords) + const rxmesh::VertexAttribute& input_coords, + rxmesh::VertexAttribute& filtered_coords) { - T sigma_s_sq = - compute_sigma_s_sq(v_id, vv, num_vv, v_id, v, n, input_coords); + T sigma_s_sq = compute_sigma_s_sq(v_id, vv, num_vv, v, n, input_coords); - T sum = 0; + T sum = 0; T normalizer = 0; for (uint8_t i = 0; i < num_vv; ++i) { - RXMESH::Vector<3, T> q(input_coords(vv[i], 0), input_coords(vv[i], 1), + rxmesh::Vector<3, T> q(input_coords(vv[i], 0), + input_coords(vv[i], 1), input_coords(vv[i], 2)); q -= v; - T t = q.norm(); - T h = dot(q, n); + T t = q.norm(); + T h = dot(q, n); T wc = exp(-0.5 * t * t / sigma_c_sq); T ws = exp(-0.5 * h * h / sigma_s_sq); @@ -91,24 +85,27 @@ __device__ __inline__ void compute_new_coordinates( } /** - * bilateral_filtering() + * bilateral_filtering_low_level_API() + * TODO refactor this to use handles */ -template +/*template __launch_bounds__(blockThreads) __global__ static void bilateral_filtering_low_level_API( - const RXMESH::RXMeshContext context, - RXMESH::RXMeshAttribute input_coords, - 
RXMESH::RXMeshAttribute filtered_coords, - RXMESH::RXMeshAttribute vertex_normals) + const rxmesh::Context context, + rxmesh::Attribute input_coords, + rxmesh::Attribute filtered_coords, + rxmesh::Attribute vertex_normals) { - using namespace RXMESH; + constexpr uint32_t special = 0xFFFFFFFE; + + using namespace rxmesh; uint32_t vv[maxVVSize]; uint32_t vv_patch[maxVVSize]; uint16_t vv_local[maxVVSize]; - uint8_t num_vv = 0; + uint8_t num_vv = 0; T sigma_c_sq = 0; - T radius = 0; + T radius = 0; Vector<3, T> vertex, normal; uint32_t v_id = INVALID32; @@ -119,7 +116,7 @@ __launch_bounds__(blockThreads) __global__ if (threadIdx.x == 0) { s_current_num_patches = 0; - s_num_patches = 0; + s_num_patches = 0; } uint32_t patch_id = blockIdx.x; @@ -128,8 +125,8 @@ __launch_bounds__(blockThreads) __global__ // processed are within the same patch (patch_id). If a vertex within the // k-ring is not in the patch, it will be added to s_block_patches so the // whole block would process this patch later. 
- auto compute_vv_1st_level = [&](uint32_t p_id, RXMeshIterator& iter) { - v_id = p_id; + auto compute_vv_1st_level = [&](uint32_t p_id, Iterator& iter) { + v_id = p_id; vertex[0] = input_coords(v_id, 0); vertex[1] = input_coords(v_id, 1); vertex[2] = input_coords(v_id, 2); @@ -140,7 +137,7 @@ __launch_bounds__(blockThreads) __global__ normal.normalize(); - vv[0] = v_id; + vv[0] = v_id; vv_patch[0] = INVALID32; ++num_vv; @@ -148,7 +145,8 @@ __launch_bounds__(blockThreads) __global__ for (uint32_t v = 0; v < iter.size(); ++v) { const uint32_t vv_id = iter[v]; - const Vector<3, T> q(input_coords(vv_id, 0), input_coords(vv_id, 1), + const Vector<3, T> q(input_coords(vv_id, 0), + input_coords(vv_id, 1), input_coords(vv_id, 2)); T len = dist2(vertex, q); @@ -173,9 +171,9 @@ __launch_bounds__(blockThreads) __global__ if (dist <= radius) { uint8_t id = num_vv++; assert(id < maxVVSize); - vv[id] = vv_id; + vv[id] = vv_id; vv_local[id] = iter.neighbour_local_id(v); - vv_patch[id] = SPECIAL; + vv_patch[id] = special; } } @@ -183,7 +181,7 @@ __launch_bounds__(blockThreads) __global__ // process the 1-ring vertices that this in this patch and within // the radius uint8_t num_vv_start = 1; - uint8_t num_vv_end = num_vv; + uint8_t num_vv_end = num_vv; while (true) { @@ -194,17 +192,17 @@ __launch_bounds__(blockThreads) __global__ // results if (vv_local[v] < iter.m_num_src_in_patch) { - assert(vv_patch[v] == SPECIAL); + assert(vv_patch[v] == special); assert(context.get_vertex_patch()[vv[v]] == patch_id); // to indicate that it's processed vv_patch[v] = INVALID32; - RXMeshIterator vv_iter(iter); + Iterator vv_iter(iter); vv_iter.set(vv_local[v], 0); for (uint32_t i = 0; i < vv_iter.size(); ++i) { - uint32_t vvv_id = vv_iter[i]; + uint32_t vvv_id = vv_iter[i]; uint16_t vvv_local_id = vv_iter.neighbour_local_id(i); // make sure that it is not a duplicate @@ -219,9 +217,9 @@ __launch_bounds__(blockThreads) __global__ uint8_t id = num_vv++; assert(id < maxVVSize); - vv[id] = vvv_id; 
+ vv[id] = vvv_id; vv_local[id] = vvv_local_id; - vv_patch[id] = SPECIAL; + vv_patch[id] = special; } } } @@ -253,7 +251,7 @@ __launch_bounds__(blockThreads) __global__ // otherwise, it means we have added new vertices that might // fall in this patch, so we better process them now. num_vv_start = num_vv_end; - num_vv_end = num_vv; + num_vv_end = num_vv; } }; @@ -282,13 +280,14 @@ __launch_bounds__(blockThreads) __global__ // uniquify uint32_t num_current_patches = s_num_patches - s_current_num_patches; uint32_t* new_end = - thrust::unique(thrust::device, s_block_patches, + thrust::unique(thrust::device, + s_block_patches, s_block_patches + num_current_patches); __syncthreads(); if (threadIdx.x == 0) { s_current_num_patches = new_end - s_block_patches; - s_num_patches = s_current_num_patches; + s_num_patches = s_current_num_patches; } __syncthreads(); @@ -301,9 +300,16 @@ __launch_bounds__(blockThreads) __global__ uint16_t *offset_all_patches, *output_all_patches; detail::template query_block_dispatcher( - context, patch_id, [](uint32_t) { return true; }, false, true, - num_src_in_patch, input_mapping, output_mapping, - offset_all_patches, output_all_patches); + context, + patch_id, + [](uint32_t) { return true; }, + false, + true, + num_src_in_patch, + input_mapping, + output_mapping, + offset_all_patches, + output_all_patches); // mean that this thread has be assigned a vertex in @@ -334,9 +340,12 @@ __launch_bounds__(blockThreads) __global__ // so that we don't process it again vv_patch[v] = INVALID32; - RXMeshIterator vv_iter( - vv_local_id, output_all_patches, offset_all_patches, - output_mapping, 0, num_src_in_patch); + Iterator vv_iter(vv_local_id, + output_all_patches, + offset_all_patches, + output_mapping, + 0, + num_src_in_patch); for (uint32_t i = 0; i < vv_iter.size(); ++i) { uint32_t vvv_id = vv_iter[i]; @@ -370,8 +379,8 @@ __launch_bounds__(blockThreads) __global__ // patch before so we reduce the // duplicates if (pp != patch_id) { - if 
(!linear_search(vv_patch, pp, - num_vv)) { + if (!linear_search( + vv_patch, pp, num_vv)) { uint32_t d = atomicAdd(&s_num_patches, 1u); assert(d < blockThreads); @@ -403,33 +412,35 @@ __launch_bounds__(blockThreads) __global__ if (v_id != INVALID32) { - compute_new_coordinates(v_id, vv, num_vv, vertex, normal, sigma_c_sq, - input_coords, filtered_coords); + compute_new_coordinates(v_id, + vv, + num_vv, + vertex, + normal, + sigma_c_sq, + input_coords, + filtered_coords); } -} +}*/ - -/** - * bilateral_filtering2() - */ template -__launch_bounds__(blockThreads) __global__ - static void bilateral_filtering(const RXMESH::RXMeshContext context, - RXMESH::RXMeshAttribute input_coords, - RXMESH::RXMeshAttribute filtered_coords, - RXMESH::RXMeshAttribute vertex_normals) +__global__ static void bilateral_filtering( + const rxmesh::Context context, + rxmesh::VertexAttribute input_coords, + rxmesh::VertexAttribute filtered_coords, + rxmesh::VertexAttribute vertex_normals) { - using namespace RXMESH; - uint32_t vv[maxVVSize]; + using namespace rxmesh; + VertexHandle vv[maxVVSize]; - uint8_t num_vv = 0; + uint32_t num_vv = 0; T sigma_c_sq = 0; - T radius = 0; + T radius = 0; Vector<3, T> vertex, normal; - uint32_t v_id = INVALID32; + VertexHandle v_id; - auto first_ring = [&](uint32_t p_id, RXMeshIterator& iter) { - v_id = p_id; + auto first_ring = [&](VertexHandle& p_id, VertexIterator& iter) { + v_id = p_id; vertex[0] = input_coords(v_id, 0); vertex[1] = input_coords(v_id, 1); vertex[2] = input_coords(v_id, 2); @@ -446,8 +457,9 @@ __launch_bounds__(blockThreads) __global__ sigma_c_sq = 1e10; for (uint32_t v = 0; v < iter.size(); ++v) { - const uint32_t vv_id = iter[v]; - const Vector<3, T> q(input_coords(vv_id, 0), input_coords(vv_id, 1), + const VertexHandle vv_id = iter[v]; + const Vector<3, T> q(input_coords(vv_id, 0), + input_coords(vv_id, 1), input_coords(vv_id, 2)); T len = dist2(vertex, q); @@ -460,7 +472,7 @@ __launch_bounds__(blockThreads) __global__ // add 1-ring 
if it is within the radius for (uint32_t v = 0; v < iter.size(); ++v) { - uint32_t vv_id = iter[v]; + const VertexHandle vv_id = iter[v]; const Vector<3, T> vvc(input_coords(vv_id, 0), input_coords(vv_id, 1), @@ -483,15 +495,15 @@ __launch_bounds__(blockThreads) __global__ uint32_t next_id = 1; while (true) { - uint32_t next_vertex = INVALID32; - if (v_id != INVALID32 && next_id < num_vv) { + VertexHandle next_vertex; + if (v_id.is_valid() && next_id < num_vv) { next_vertex = vv[next_id]; } - auto n_rings = [&](uint32_t id, RXMeshIterator& iter) { + auto n_rings = [&](const VertexHandle& id, const VertexIterator& iter) { assert(id == next_vertex); for (uint32_t i = 0; i < iter.size(); ++i) { - uint32_t vvv_id = iter[i]; + VertexHandle vvv_id = iter[i]; if (vvv_id != v_id) { // make sure that we don't store duplicate outputs @@ -503,7 +515,7 @@ __launch_bounds__(blockThreads) __global__ T dist = dist2(vvv, vertex); if (dist <= radius) { - uint8_t id = num_vv++; + uint32_t id = num_vv++; assert(id < maxVVSize); vv[id] = vvv_id; } @@ -513,18 +525,24 @@ __launch_bounds__(blockThreads) __global__ }; - query_block_dispatcher(context, next_vertex, - n_rings); + higher_query_block_dispatcher( + context, next_vertex, n_rings); - bool is_done = (next_id > num_vv - 1) || (v_id == INVALID32); + bool is_done = (next_id >= num_vv) || !v_id.is_valid(); if (__syncthreads_and(is_done)) { break; } next_id++; } - if (v_id != INVALID32) { - compute_new_coordinates(v_id, vv, num_vv, vertex, normal, sigma_c_sq, - input_coords, filtered_coords); + if (v_id.is_valid()) { + compute_new_coordinates(v_id, + vv, + num_vv, + vertex, + normal, + sigma_c_sq, + input_coords, + filtered_coords); } } \ No newline at end of file diff --git a/apps/Filtering/filtering_util.h b/apps/Filtering/filtering_util.h index 08092f9f..58915b5a 100644 --- a/apps/Filtering/filtering_util.h +++ b/apps/Filtering/filtering_util.h @@ -1,19 +1,19 @@ -#include "rxmesh/rxmesh_attribute.h" +#include "rxmesh/attribute.h" 
/** * compute_sigma_c() */ template __device__ __inline__ T compute_sigma_c_sq( - const uint32_t vv[], + const rxmesh::VertexHandle vv[], const uint8_t num_vv, - const RXMESH::Vector<3, T>& v, - const RXMESH::RXMeshAttribute& input_coords) + const rxmesh::Vector<3, T>& v, + const rxmesh::VertexAttribute& input_coords) { T sigma_c = 1e10; for (uint8_t i = 1; i < num_vv; ++i) { - const RXMESH::Vector<3, T> q(input_coords(vv[i], 0), + const rxmesh::Vector<3, T> q(input_coords(vv[i], 0), input_coords(vv[i], 1), input_coords(vv[i], 2)); @@ -30,31 +30,31 @@ __device__ __inline__ T compute_sigma_c_sq( */ template __device__ __inline__ T compute_sigma_s_sq( - const uint32_t v_id, - const uint32_t vv[], + const rxmesh::VertexHandle& v_id, + const rxmesh::VertexHandle vv[], const uint8_t num_vv, - uint32_t thread_vertex, - const RXMESH::Vector<3, T>& v, - const RXMESH::Vector<3, T>& n, - const RXMESH::RXMeshAttribute& input_coords) + const rxmesh::Vector<3, T>& v, + const rxmesh::Vector<3, T>& n, + const rxmesh::VertexAttribute& input_coords) { - T sum = 0; + T sum = 0; T sum_sqs = 0; for (uint32_t i = 0; i < num_vv; ++i) { - RXMESH::Vector<3, T> q(input_coords(vv[i], 0), input_coords(vv[i], 1), + rxmesh::Vector<3, T> q(input_coords(vv[i], 0), + input_coords(vv[i], 1), input_coords(vv[i], 2)); q -= v; T t = dot(q, n); - t = sqrt(t * t); + t = sqrt(t * t); sum += t; sum_sqs += t * t; } - T c = static_cast(num_vv); + T c = static_cast(num_vv); T sigma_s = (sum_sqs / c) - ((sum * sum) / (c * c)); - sigma_s = (sigma_s < 1.0e-20) ? (sigma_s + 1.0e-20) : sigma_s; + sigma_s = (sigma_s < 1.0e-20) ? 
(sigma_s + 1.0e-20) : sigma_s; return sigma_s; } diff --git a/apps/Geodesic/CMakeLists.txt b/apps/Geodesic/CMakeLists.txt index 98b3a353..d63fb18c 100644 --- a/apps/Geodesic/CMakeLists.txt +++ b/apps/Geodesic/CMakeLists.txt @@ -26,6 +26,8 @@ endif() set_target_properties( Geodesic PROPERTIES FOLDER "apps") +set_property(TARGET Geodesic PROPERTY CUDA_SEPARABLE_COMPILATION ON) + source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "Geodesic" FILES ${SOURCE_LIST}) target_link_libraries( Geodesic diff --git a/apps/Geodesic/benchmark.sh b/apps/Geodesic/benchmark.sh index b64cb46b..6f24ff96 100644 --- a/apps/Geodesic/benchmark.sh +++ b/apps/Geodesic/benchmark.sh @@ -1,5 +1,4 @@ #!/bin/bash -echo "This script re-generates RXMesh data in Figure 8(b) in the paper." echo "Please make sure to first compile the source code and then enter the input OBJ files directory." read -p "OBJ files directory (no trailing slash): " input_dir @@ -16,7 +15,7 @@ device_id=0 for file in $input_dir/*.obj; do if [ -f "$file" ]; then - echo $exe -p -input "$file" -device_id $device_id - $exe -p -input "$file" -device_id $device_id + echo $exe -input "$file" -device_id $device_id + $exe -input "$file" -device_id $device_id fi done \ No newline at end of file diff --git a/apps/Geodesic/geodesic.cu b/apps/Geodesic/geodesic.cu index 5fd078b2..c11aa7af 100644 --- a/apps/Geodesic/geodesic.cu +++ b/apps/Geodesic/geodesic.cu @@ -6,23 +6,21 @@ #include #include -#include "../common/openmesh_trimesh.h" #include "gtest/gtest.h" -#include "rxmesh/rxmesh_attribute.h" + +#include "../common/openmesh_trimesh.h" + #include "rxmesh/rxmesh_static.h" #include "rxmesh/util/cuda_query.h" -#include "rxmesh/util/export_tools.h" #include "rxmesh/util/import_obj.h" struct arg { std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj"; std::string output_folder = STRINGIFY(OUTPUT_DIR); - uint32_t device_id = 0; + uint32_t device_id = 0; char** argv; int argc; - bool shuffle = false; - bool sort = false; uint32_t 
num_seeds = 1; } Arg; @@ -30,19 +28,11 @@ struct arg #include "geodesic_ptp_openmesh.h" #include "geodesic_ptp_rxmesh.h" -TEST(App, GEODESIC) +TEST(App, Geodesic) { - using namespace RXMESH; + using namespace rxmesh; using dataT = float; - if (Arg.shuffle) { - ASSERT_FALSE(Arg.sort) << " cannot shuffle and sort at the same time!"; - } - if (Arg.sort) { - ASSERT_FALSE(Arg.shuffle) - << " cannot shuffle and sort at the same time!"; - } - // Select device cuda_query(Arg.device_id); @@ -50,83 +40,48 @@ TEST(App, GEODESIC) // Load mesh std::vector> Verts; std::vector> Faces; - ASSERT_TRUE(import_obj(Arg.obj_file_name, Verts, Faces)); - if (Arg.shuffle) { - shuffle_obj(Faces, Verts); - } + RXMeshStatic rxmesh(Faces, false); + ASSERT_TRUE(rxmesh.is_closed()) + << "Geodesic only works on watertight/closed manifold mesh without " + "boundaries"; + ASSERT_TRUE(rxmesh.is_edge_manifold()) + << "Geodesic only works on watertight/closed manifold mesh without " + "boundaries"; - // Create RXMeshStatic instance. 
If Arg.sort is true, Faces and Verts will - // be sorted based on the patching happening inside RXMesh - RXMeshStatic rxmesh_static(Faces, Verts, Arg.sort, false); - ASSERT_TRUE(rxmesh_static.is_closed()) << "Geodesic only works on watertight/closed manifold mesh without boundaries"; - ASSERT_TRUE(rxmesh_static.is_edge_manifold())<< "Geodesic only works on watertight/closed manifold mesh without boundaries"; - - // Since OpenMesh only accepts input as obj files, if the input mesh is - // shuffled or sorted, we have to write it to a temp file so that OpenMesh - // can pick it up - TriMesh input_mesh; - if (Arg.sort || Arg.shuffle) { - export_obj(Faces, Verts, "temp.obj", false); - ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, "temp.obj")); - } else { - ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name)); - } // Generate Seeds std::vector h_seeds(Arg.num_seeds); std::random_device dev; std::mt19937 rng(dev()); - std::uniform_int_distribution dist( - 0, rxmesh_static.get_num_vertices()); + std::uniform_int_distribution dist(0, + Verts.size()); for (auto& s : h_seeds) { s = dist(rng); // s = 0; } - //*** OpenMesh Impl - RXMeshAttribute ground_truth; - // Save a map from vertex id to topleset (number of hops from // (closest?) source). It's used by OpenMesh to help construct // sorted_index and limit. We keep it for RXMesh because it is // used to quickly determine whether or not a vertex is within // the "update band". 
- RXMeshAttribute toplesets("toplesets"); - toplesets.init(Verts.size(), 1u, - RXMESH::HOST); // will move() to DEVICE later - - + std::vector toplesets(Verts.size(), 1u); std::vector sorted_index; std::vector limits; - geodesic_ptp_openmesh(input_mesh, h_seeds, ground_truth, sorted_index, - limits, toplesets); - - // export_attribute_VTK("geo_openmesh.vtk", Faces, Verts, false, - // ground_truth.operator->(), - // ground_truth.operator->()); - - // Now that OpenMesh has calculated the toplesets, - // move to DEVICE -- it's needed by RXMesh version - toplesets.move(RXMESH::HOST, RXMESH::DEVICE); + geodesic_ptp_openmesh( + Faces, Verts, h_seeds, sorted_index, limits, toplesets); - - //*** RXMesh Impl - EXPECT_TRUE(geodesic_rxmesh(rxmesh_static, Faces, Verts, h_seeds, - ground_truth, sorted_index, limits, toplesets)) - << "RXMesh failed!!"; - - - // Release allocation - ground_truth.release(); - toplesets.release(); + // RXMesh Impl + geodesic_rxmesh( + rxmesh, Faces, Verts, h_seeds, sorted_index, limits, toplesets); } int main(int argc, char** argv) { - using namespace RXMESH; + using namespace rxmesh; Log::init(); ::testing::InitGoogleTest(&argc, argv); @@ -143,9 +98,7 @@ int main(int argc, char** argv) " Default is {} \n" " Hint: Only accepts OBJ files\n" " -o: JSON file output folder. Default is {} \n" - // "-num_seeds: Number of input seeds. Default is {}\n" - " -s: Shuffle input. Default is false.\n" - " -p: Sort input using patching output. Default is false.\n" + // "-num_seeds: Number of input seeds. Default is {}\n" " -device_id: GPU device ID. 
Default is {}", Arg.obj_file_name, Arg.output_folder ,Arg.num_seeds, Arg.device_id); // clang-format on @@ -160,12 +113,6 @@ int main(int argc, char** argv) Arg.output_folder = std::string(get_cmd_option(argv, argv + argc, "-o")); } - if (cmd_option_exists(argv, argc + argv, "-s")) { - Arg.shuffle = true; - } - if (cmd_option_exists(argv, argc + argv, "-p")) { - Arg.sort = true; - } if (cmd_option_exists(argv, argc + argv, "-device_id")) { Arg.device_id = atoi(get_cmd_option(argv, argv + argc, "-device_id")); diff --git a/apps/Geodesic/geodesic_kernel.cuh b/apps/Geodesic/geodesic_kernel.cuh index 5c8246ca..046d17ae 100644 --- a/apps/Geodesic/geodesic_kernel.cuh +++ b/apps/Geodesic/geodesic_kernel.cuh @@ -1,8 +1,8 @@ #pragma once -#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh" -#include "rxmesh/rxmesh_attribute.h" -#include "rxmesh/rxmesh_context.h" +#include "rxmesh/attribute.h" +#include "rxmesh/context.h" +#include "rxmesh/kernels/query_dispatcher.cuh" #include "rxmesh/util/vector.h" /** @@ -10,14 +10,14 @@ */ template __device__ __inline__ T update_step( - const uint32_t v0_id, - const uint32_t v1_id, - const uint32_t v2_id, - const RXMESH::RXMeshAttribute& geo_distance, - const RXMESH::RXMeshAttribute& coords, + const rxmesh::VertexHandle& v0_id, + const rxmesh::VertexHandle& v1_id, + const rxmesh::VertexHandle& v2_id, + const rxmesh::VertexAttribute& geo_distance, + const rxmesh::VertexAttribute& coords, const T infinity_val) { - using namespace RXMESH; + using namespace rxmesh; const Vector<3, T> v0(coords(v0_id, 0), coords(v0_id, 1), coords(v0_id, 2)); const Vector<3, T> v1(coords(v1_id, 0), coords(v1_id, 1), coords(v1_id, 2)); const Vector<3, T> v2(coords(v2_id, 0), coords(v2_id, 1), coords(v2_id, 2)); @@ -44,14 +44,14 @@ __device__ __inline__ T update_step( Q[1][1] = q[0][0] / det; T delta = t[0] * (Q[0][0] + Q[1][0]) + t[1] * (Q[0][1] + Q[1][1]); - T dis = delta * delta - + T dis = delta * delta - (Q[0][0] + Q[0][1] + Q[1][0] + Q[1][1]) * (t[0] * 
t[0] * Q[0][0] + t[0] * t[1] * (Q[1][0] + Q[0][1]) + t[1] * t[1] * Q[1][1] - 1); T p = (delta + std::sqrt(dis)) / (Q[0][0] + Q[0][1] + Q[1][0] + Q[1][1]); T tp[2]; - tp[0] = t[0] - p; - tp[1] = t[1] - p; + tp[0] = t[0] - p; + tp[1] = t[1] - p; const Vector<3, T> n = (x0 * Q[0][0] + x1 * Q[1][0]) * tp[0] + (x0 * Q[0][1] + x1 * Q[1][1]) * tp[1]; T cond[2]; @@ -67,48 +67,48 @@ __device__ __inline__ T update_step( T dp[2]; dp[0] = geo_distance(v1_id) + x0.norm(); dp[1] = geo_distance(v2_id) + x1.norm(); - p = dp[dp[1] < dp[0]]; + p = dp[dp[1] < dp[0]]; } return p; } template -__launch_bounds__(blockThreads) __global__ static void relax_ptp_rxmesh( - const RXMESH::RXMeshContext context, - const RXMESH::RXMeshAttribute coords, - RXMESH::RXMeshAttribute new_geo_dist, - const RXMESH::RXMeshAttribute old_geo_dist, - const RXMESH::RXMeshAttribute toplesets, +__global__ static void relax_ptp_rxmesh( + const rxmesh::Context context, + const rxmesh::VertexAttribute coords, + rxmesh::VertexAttribute new_geo_dist, + const rxmesh::VertexAttribute old_geo_dist, + const rxmesh::VertexAttribute toplesets, const uint32_t band_start, const uint32_t band_end, uint32_t* d_error, const T infinity_val, const T error_tol) { - using namespace RXMESH; + using namespace rxmesh; - auto in_active_set = [&](uint32_t p_id) { + auto in_active_set = [&](VertexHandle p_id) { uint32_t my_band = toplesets(p_id); return my_band >= band_start && my_band < band_end; }; - auto geo_lambda = [&](uint32_t p_id, RXMeshIterator& iter) { + auto geo_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) { // this vertex (p_id) update_band uint32_t my_band = toplesets(p_id); // this is the last vertex in the one-ring (before r_id) - uint32_t q_id = iter.back(); + auto q_id = iter.back(); // one-ring enumeration T current_dist = old_geo_dist(p_id); - T new_dist = current_dist; + T new_dist = current_dist; for (uint32_t v = 0; v < iter.size(); ++v) { // the current one ring vertex - uint32_t r_id = iter[v]; + 
auto r_id = iter[v]; - T dist = update_step(p_id, q_id, r_id, old_geo_dist, coords, - infinity_val); + T dist = update_step( + p_id, q_id, r_id, old_geo_dist, coords, infinity_val); if (dist < new_dist) { new_dist = dist; } @@ -126,6 +126,6 @@ __launch_bounds__(blockThreads) __global__ static void relax_ptp_rxmesh( }; - query_block_dispatcher(context, geo_lambda, - in_active_set, true); + query_block_dispatcher( + context, geo_lambda, in_active_set, true); } diff --git a/apps/Geodesic/geodesic_ptp_openmesh.h b/apps/Geodesic/geodesic_ptp_openmesh.h index db7f0e36..e8ad2569 100644 --- a/apps/Geodesic/geodesic_ptp_openmesh.h +++ b/apps/Geodesic/geodesic_ptp_openmesh.h @@ -9,15 +9,15 @@ #include "../common/openmesh_report.h" #include "../common/openmesh_trimesh.h" #include "gtest/gtest.h" -#include "rxmesh/rxmesh_attribute.h" +#include "rxmesh/util/export_tools.h" #include "rxmesh/util/report.h" #include "rxmesh/util/timer.h" -inline float compute_toplesets(TriMesh& mesh, - std::vector& sorted_index, - std::vector& limits, - RXMESH::RXMeshAttribute& toplesets, - const std::vector& h_seeds) +inline float compute_toplesets(TriMesh& mesh, + std::vector& sorted_index, + std::vector& limits, + std::vector& toplesets, + const std::vector& h_seeds) { limits.clear(); limits.reserve(mesh.n_vertices() / 2); @@ -26,34 +26,36 @@ inline float compute_toplesets(TriMesh& mesh, return 0; } - RXMESH::CPUTimer timer; + rxmesh::CPUTimer timer; timer.start(); - toplesets.reset(INVALID32, RXMESH::HOST); + toplesets.clear(); + toplesets.resize(mesh.n_vertices(), INVALID32); uint32_t level = 0; - uint32_t p = 0; + uint32_t p = 0; for (const uint32_t& s : h_seeds) { sorted_index[p] = s; p++; - if (toplesets(s) == INVALID32) { - toplesets(s) = level; + if (toplesets[s] == INVALID32) { + toplesets[s] = level; } } limits.push_back(0); for (uint32_t i = 0; i < p; i++) { const uint32_t v = sorted_index[i]; - if (toplesets(v) > level) { + if (toplesets[v] > level) { level++; limits.push_back(i); } 
TriMesh::VertexIter v_iter = mesh.vertices_begin() + v; for (TriMesh::VertexVertexIter vv_iter = mesh.vv_iter(*v_iter); - vv_iter.is_valid(); ++vv_iter) { + vv_iter.is_valid(); + ++vv_iter) { int vv = (*vv_iter).idx(); - if (toplesets(vv) == INVALID32) { - toplesets(vv) = toplesets(v) + 1; + if (toplesets[vv] == INVALID32) { + toplesets[vv] = toplesets[v] + 1; sorted_index[p] = vv; p++; } @@ -68,18 +70,18 @@ inline float compute_toplesets(TriMesh& mesh, "compute_toplesets() could not compute toplesets for all " "vertices maybe because the input is not manifold or contain " "duplicate vertices!"); - exit(0); + exit(EXIT_FAILURE); } timer.stop(); return timer.elapsed_millis(); } template -inline T update_step(TriMesh& mesh, - const uint32_t v0, - const uint32_t v1, - const uint32_t v2, - RXMESH::RXMeshAttribute& geo_distance) +inline T update_step(TriMesh& mesh, + const uint32_t v0, + const uint32_t v1, + const uint32_t v2, + std::vector& geo_distance) { TriMesh::VertexIter v0_it = mesh.vertices_begin() + v0; TriMesh::VertexIter v1_it = mesh.vertices_begin() + v1; @@ -90,8 +92,8 @@ inline T update_step(TriMesh& mesh, T t[2]; - t[0] = geo_distance(v1); - t[1] = geo_distance(v2); + t[0] = geo_distance[v1]; + t[1] = geo_distance[v2]; T q[2][2]; q[0][0] = (X0 | X0); // X0 dot_product X0 @@ -108,7 +110,7 @@ inline T update_step(TriMesh& mesh, Q[1][1] = q[0][0] / det; T delta = t[0] * (Q[0][0] + Q[1][0]) + t[1] * (Q[0][1] + Q[1][1]); - T dis = delta * delta - + T dis = delta * delta - (Q[0][0] + Q[0][1] + Q[1][0] + Q[1][1]) * (t[0] * t[0] * Q[0][0] + t[0] * t[1] * (Q[1][0] + Q[0][1]) + t[1] * t[1] * Q[1][1] - 1); @@ -138,8 +140,8 @@ inline T update_step(TriMesh& mesh, t[1] == std::numeric_limits::infinity() || dis < 0 || c[0] >= 0 || c[1] >= 0) { T dp[2]; - dp[0] = geo_distance(v1) + X0.norm(); - dp[1] = geo_distance(v2) + X1.norm(); + dp[0] = geo_distance[v1] + X0.norm(); + dp[1] = geo_distance[v2] + X1.norm(); p = dp[dp[1] < dp[0]]; } @@ -151,31 +153,27 @@ inline float 
toplesets_propagation(TriMesh& mesh, const std::vector& h_seeds, const std::vector& limits, const std::vector& sorted_index, - RXMESH::RXMeshAttribute& geo_distance, + std::vector& geo_distance, uint32_t& iter) { // second buffer for geodesic distance - RXMESH::RXMeshAttribute geo_distance_2; - geo_distance_2.init(mesh.n_vertices(), 1u, RXMESH::HOST); - geo_distance_2.reset(std::numeric_limits::infinity(), RXMESH::HOST); - geo_distance.reset(std::numeric_limits::infinity(), RXMESH::HOST); - RXMESH::RXMeshAttribute* double_buffer[2] = {&geo_distance, - &geo_distance_2}; + std::vector geo_distance_2(geo_distance); + std::vector* double_buffer[2] = {&geo_distance, &geo_distance_2}; // error buffer std::vector error(mesh.n_vertices(), 0); - RXMESH::CPUTimer timer; + rxmesh::CPUTimer timer; timer.start(); // source distance for (auto v : h_seeds) { - geo_distance(v) = 0; - geo_distance_2(v) = 0; + geo_distance[v] = 0; + geo_distance_2[v] = 0; } uint32_t d = 0; uint32_t i(1), j(2); - iter = 0; + iter = 0; uint32_t max_iter = 2 * limits.size(); while (i < j && iter < max_iter) { @@ -184,15 +182,15 @@ inline float toplesets_propagation(TriMesh& mesh, i = j / 2; } - const uint32_t start = limits[i]; - const uint32_t end = limits[j]; + const uint32_t start = limits[i]; + const uint32_t end = limits[j]; const uint32_t n_cond = limits[i + 1] - start; for (uint32_t vi = start; vi < end; vi++) { - const uint32_t v = sorted_index[vi]; + const uint32_t v = sorted_index[vi]; TriMesh::VertexIter v_iter = mesh.vertices_begin() + v; - double_buffer[!d]->operator()(v) = double_buffer[d]->operator()(v); + (*double_buffer[!d])[v] = (*double_buffer[d])[v]; // The last vertex in v one ring @@ -202,7 +200,8 @@ inline float toplesets_propagation(TriMesh& mesh, // iterate over one-ring for (TriMesh::VertexVertexIter vv_iter = mesh.vv_iter(*v_iter); - vv_iter.is_valid(); ++vv_iter) { + vv_iter.is_valid(); + ++vv_iter) { // current vv uint32_t vv_id = (*vv_iter).idx(); @@ -212,8 +211,8 @@ 
inline float toplesets_propagation(TriMesh& mesh, // working on triangle v,vv_id, p_id T dist = update_step(mesh, v, p_id, vv_id, *double_buffer[d]); - if (dist < double_buffer[!d]->operator()(v)) { - double_buffer[!d]->operator()(v) = dist; + if (dist < (*double_buffer[!d])[v]) { + (*double_buffer[!d])[v] = dist; } @@ -225,9 +224,9 @@ inline float toplesets_propagation(TriMesh& mesh, // calc error for (uint32_t vi = start; vi < start + n_cond; vi++) { const uint32_t v = sorted_index[vi]; - error[vi] = std::abs(double_buffer[!d]->operator()(v) - - double_buffer[d]-> operator()(v)) / - double_buffer[d]->operator()(v); + error[vi] = + std::abs((*double_buffer[!d])[v] - (*double_buffer[d])[v]) / + (*double_buffer[d])[v]; } uint32_t count = 0; @@ -247,23 +246,26 @@ inline float toplesets_propagation(TriMesh& mesh, timer.stop(); // copy most updated results (if needed) - if (geo_distance.operator->() != double_buffer[!d]->operator->()) { - geo_distance.copy(*(double_buffer[!d]), RXMESH::HOST, RXMESH::HOST); + if (&geo_distance != double_buffer[!d]) { + for (size_t i = 0; i < geo_distance.size(); ++i) { + geo_distance[i] = geo_distance_2[i]; + } } - geo_distance_2.release(); - return timer.elapsed_millis(); } template -void geodesic_ptp_openmesh(TriMesh& input_mesh, - const std::vector& h_seeds, - RXMESH::RXMeshAttribute& geo_distance, - std::vector& sorted_index, - std::vector& limits, - RXMESH::RXMeshAttribute& toplesets) +void geodesic_ptp_openmesh(const std::vector>& Faces, + const std::vector>& Verts, + const std::vector& h_seeds, + std::vector& sorted_index, + std::vector& limits, + std::vector& toplesets) { + TriMesh input_mesh; + ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name)); + // Report OpenMeshReport report("Geodesic_OpenMesh"); report.command_line(Arg.argc, Arg.argv); @@ -272,19 +274,13 @@ void geodesic_ptp_openmesh(TriMesh& input_mesh, report.add_member("seeds", h_seeds); std::string method = "OpenMeshSingleCore"; 
report.add_member("method", method); - std::string order = "default"; - if (Arg.shuffle) { - order = "shuffle"; - } else if (Arg.sort) { - order = "sorted"; - } - report.add_member("input_order", order); + ASSERT_TRUE(Faces.size() == input_mesh.n_faces()); + ASSERT_TRUE(Verts.size() == input_mesh.n_vertices()); - // Geodesic distance attribute for all vertices - geo_distance.set_name("GeodesicDistance"); - geo_distance.init(input_mesh.n_vertices(), 1u, RXMESH::HOST); - geo_distance.reset(std::numeric_limits::infinity(), RXMESH::HOST); + + std::vector geo_distance(input_mesh.n_vertices(), + std::numeric_limits::infinity()); // sorted indices for toplesets sorted_index.clear(); @@ -295,26 +291,34 @@ void geodesic_ptp_openmesh(TriMesh& input_mesh, // compute toplesets float compute_toplesets_time = compute_toplesets(input_mesh, sorted_index, limits, toplesets, h_seeds); + RXMESH_TRACE("OpenMesh: Computing toplesets took {} (ms)", compute_toplesets_time); + report.add_member("compute_toplesets_time", compute_toplesets_time); // compute geodesic distance - uint32_t iter = 0; + uint32_t iter = 0; float processing_time = toplesets_propagation( input_mesh, h_seeds, limits, sorted_index, geo_distance, iter); RXMESH_TRACE("geodesic_ptp_openmesh() took {} (ms)", processing_time); + // export_attribute_VTK("geo_openmesh.vtk", + // Faces, + // Verts, + // false, + // geo_distance.data(), + // geo_distance.data()); // Finalize report report.add_member("num_iter_taken", iter); - RXMESH::TestData td; - td.test_name = "Geodesic"; + rxmesh::TestData td; + td.test_name = "Geodesic"; td.num_threads = 1; td.time_ms.push_back(processing_time); td.passed.push_back(true); report.add_test(td); report.write( Arg.output_folder + "/openmesh", - "Geodesic_OpenMesh" + RXMESH::extract_file_name(Arg.obj_file_name)); + "Geodesic_OpenMesh" + rxmesh::extract_file_name(Arg.obj_file_name)); } \ No newline at end of file diff --git a/apps/Geodesic/geodesic_ptp_rxmesh.h 
b/apps/Geodesic/geodesic_ptp_rxmesh.h index 75205f92..11a2c819 100644 --- a/apps/Geodesic/geodesic_ptp_rxmesh.h +++ b/apps/Geodesic/geodesic_ptp_rxmesh.h @@ -6,17 +6,16 @@ constexpr float EPS = 10e-6; -template -inline bool geodesic_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, - std::vector>& Faces, - std::vector>& Verts, - const std::vector& h_seeds, - const RXMESH::RXMeshAttribute& ground_truth, - const std::vector& h_sorted_index, - const std::vector& h_limits, - const RXMESH::RXMeshAttribute& toplesets) +template +inline void geodesic_rxmesh(rxmesh::RXMeshStatic& rxmesh, + const std::vector>& Faces, + const std::vector>& Verts, + const std::vector& h_seeds, + const std::vector& h_sorted_index, + const std::vector& h_limits, + const std::vector& toplesets) { - using namespace RXMESH; + using namespace rxmesh; constexpr uint32_t blockThreads = 256; // Report @@ -24,49 +23,45 @@ inline bool geodesic_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, report.command_line(Arg.argc, Arg.argv); report.device(); report.system(); - report.model_data(Arg.obj_file_name, rxmesh_static); + report.model_data(Arg.obj_file_name, rxmesh); report.add_member("seeds", h_seeds); report.add_member("method", std::string("RXMesh")); - std::string order = "default"; - if (Arg.shuffle) { - order = "shuffle"; - } else if (Arg.sort) { - order = "sorted"; - } - report.add_member("input_order", order); - // input coords - RXMESH::RXMeshAttribute input_coord; - input_coord.set_name("coord"); - input_coord.init(Verts.size(), 3u, RXMESH::LOCATION_ALL); - for (uint32_t i = 0; i < Verts.size(); ++i) { - for (uint32_t j = 0; j < Verts[i].size(); ++j) { - input_coord(i, j) = Verts[i][j]; - } - } - input_coord.change_layout(RXMESH::HOST); - input_coord.move(RXMESH::HOST, RXMESH::DEVICE); + auto input_coord = rxmesh.add_vertex_attribute(Verts, "coord"); + + // toplesets + auto d_toplesets = rxmesh.add_vertex_attribute(toplesets, "topleset"); + // RXMesh launch box LaunchBox launch_box; - 
rxmesh_static.prepare_launch_box(RXMESH::Op::VV, launch_box, false, true); + rxmesh.prepare_launch_box(rxmesh::Op::VV, + launch_box, + (void*)relax_ptp_rxmesh, + true); // Geodesic distance attribute for all vertices (seeds set to zero // and infinity otherwise) - RXMeshAttribute rxmesh_geo; - rxmesh_geo.init(rxmesh_static.get_num_vertices(), 1u, RXMESH::LOCATION_ALL); - rxmesh_geo.reset(std::numeric_limits::infinity(), RXMESH::HOST); - for (uint32_t v : h_seeds) { - rxmesh_geo(v) = 0; - } - rxmesh_geo.move(RXMESH::HOST, RXMESH::DEVICE); + auto rxmesh_geo = rxmesh.add_vertex_attribute("geo", 1u); + rxmesh_geo->reset(std::numeric_limits::infinity(), rxmesh::HOST); + rxmesh.for_each_vertex(rxmesh::HOST, [&](const VertexHandle vh) { + uint32_t v_id = rxmesh.map_to_global(vh); + for (uint32_t s : h_seeds) { + if (s == v_id) { + (*rxmesh_geo)(vh) = 0; + break; + } + } + }); + rxmesh_geo->move(rxmesh::HOST, rxmesh::DEVICE); // second buffer for geodesic distance for double buffering - RXMeshAttribute rxmesh_geo_2; - rxmesh_geo_2.init(rxmesh_static.get_num_vertices(), 1u, RXMESH::DEVICE); - rxmesh_geo_2.copy(rxmesh_geo, RXMESH::DEVICE, RXMESH::DEVICE); + auto rxmesh_geo_2 = + rxmesh.add_vertex_attribute("geo2", 1u, rxmesh::DEVICE); + + rxmesh_geo_2->copy_from(*rxmesh_geo, rxmesh::DEVICE, rxmesh::DEVICE); // Error @@ -74,7 +69,8 @@ inline bool geodesic_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, CUDA_ERROR(cudaMalloc((void**)&d_error, sizeof(uint32_t))); // double buffer - RXMeshAttribute* double_buffer[2] = {&rxmesh_geo, &rxmesh_geo_2}; + VertexAttribute* double_buffer[2] = {rxmesh_geo.get(), + rxmesh_geo_2.get()}; // start time GPUTimer timer; @@ -83,7 +79,7 @@ inline bool geodesic_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, // actual computation uint32_t d = 0; uint32_t i(1), j(2); - uint32_t iter = 0; + uint32_t iter = 0; uint32_t max_iter = 2 * h_limits.size(); while (i < j && iter < max_iter) { iter++; @@ -94,12 +90,19 @@ inline bool 
geodesic_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, // compute new geodesic relax_ptp_rxmesh <<>>( - rxmesh_static.get_context(), input_coord, *double_buffer[!d], - *double_buffer[d], toplesets, i, j, d_error, - std::numeric_limits::infinity(), T(1e-3)); - - CUDA_ERROR(cudaMemcpy(&h_error, d_error, sizeof(uint32_t), - cudaMemcpyDeviceToHost)); + rxmesh.get_context(), + *input_coord, + *double_buffer[!d], + *double_buffer[d], + *d_toplesets, + i, + j, + d_error, + std::numeric_limits::infinity(), + T(1e-3)); + + CUDA_ERROR(cudaMemcpy( + &h_error, d_error, sizeof(uint32_t), cudaMemcpyDeviceToHost)); CUDA_ERROR(cudaMemset(d_error, 0, sizeof(uint32_t))); @@ -120,38 +123,33 @@ inline bool geodesic_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, CUDA_ERROR(cudaGetLastError()); CUDA_ERROR(cudaProfilerStop()); - // verify - rxmesh_geo.copy(*double_buffer[d], RXMESH::DEVICE, RXMESH::HOST); - T err = 0; - for (uint32_t i = 0; i < ground_truth.get_num_mesh_elements(); ++i) { - if (ground_truth(i) > EPS) { - err += std::abs(rxmesh_geo(i) - ground_truth(i)) / ground_truth(i); - } - } - err /= T(ground_truth.get_num_mesh_elements()); - bool is_passed = (err < 10E-2); + rxmesh_geo->copy_from(*double_buffer[d], rxmesh::DEVICE, rxmesh::HOST); - RXMESH_TRACE("Geodesic_RXMesh took {} (ms) -- err= {} -- #iter= {}", - timer.elapsed_millis(), err, iter); + RXMESH_TRACE("Geodesic_RXMesh took {} (ms) -- #iter= {}", + timer.elapsed_millis(), + iter); - // export_attribute_VTK("geo_rxmesh.vtk", Faces, Verts, false, - // rxmesh_geo.operator->(), rxmesh_geo.operator->()); + // std::vector geo(rxmesh.get_num_vertices()); + // rxmesh.for_each_vertex(rxmesh::HOST, [&](const VertexHandle vh) { + // uint32_t v_id = rxmesh.map_to_global(vh); + // geo[v_id] = (*rxmesh_geo)(vh); + //}); + // export_attribute_VTK( + // "geo_rxmesh.vtk", Faces, Verts, false, geo.data(), geo.data()); - // Release allocation - rxmesh_geo.release(); - rxmesh_geo_2.release(); - input_coord.release(); GPU_FREE(d_error); // 
Finalize report report.add_member("num_iter_taken", iter); TestData td; - td.test_name = "Geodesic"; + td.test_name = "Geodesic"; + td.num_threads = launch_box.num_threads; + td.num_blocks = launch_box.blocks; + td.dyn_smem = launch_box.smem_bytes_dyn; + td.static_smem = launch_box.smem_bytes_static; + td.num_reg = launch_box.num_registers_per_thread; td.time_ms.push_back(timer.elapsed_millis()); - td.passed.push_back(is_passed); report.add_test(td); report.write(Arg.output_folder + "/rxmesh", "Geodesic_RXMesh_" + extract_file_name(Arg.obj_file_name)); - - return is_passed; } \ No newline at end of file diff --git a/apps/MCF/CMakeLists.txt b/apps/MCF/CMakeLists.txt index 1eff0cad..ec1ac6c9 100644 --- a/apps/MCF/CMakeLists.txt +++ b/apps/MCF/CMakeLists.txt @@ -27,6 +27,8 @@ endif() set_target_properties( MCF PROPERTIES FOLDER "apps") +set_property(TARGET MCF PROPERTY CUDA_SEPARABLE_COMPILATION ON) + source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "MCF" FILES ${SOURCE_LIST}) diff --git a/apps/MCF/benchmark.sh b/apps/MCF/benchmark.sh index e531d908..f1dfb38b 100644 --- a/apps/MCF/benchmark.sh +++ b/apps/MCF/benchmark.sh @@ -1,5 +1,4 @@ #!/bin/bash -echo "This script re-generates RXMesh data in Figure 8(a) in the paper." echo "Please make sure to first compile the source code and then enter the input OBJ files directory." 
read -p "OBJ files directory (no trailing slash): " input_dir @@ -16,7 +15,7 @@ device_id=0 for file in $input_dir/*.obj; do if [ -f "$file" ]; then - echo $exe -p -input "$file" -device_id $device_id - $exe -p -input "$file" -device_id $device_id + echo $exe -input "$file" -device_id $device_id + $exe -input "$file" -device_id $device_id fi done \ No newline at end of file diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index 38c40128..abfec068 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -6,7 +6,7 @@ #include "../common/openmesh_trimesh.h" #include "gtest/gtest.h" -#include "rxmesh/rxmesh_attribute.h" +#include "rxmesh/attribute.h" #include "rxmesh/rxmesh_static.h" #include "rxmesh/util/cuda_query.h" #include "rxmesh/util/export_tools.h" @@ -15,18 +15,15 @@ struct arg { - std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj"; - std::string output_folder = STRINGIFY(OUTPUT_DIR); - uint32_t device_id = 0; - float time_step = 0.001; - float cg_tolerance = 1e-6; - uint32_t max_num_cg_iter = 1000; + std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj"; + std::string output_folder = STRINGIFY(OUTPUT_DIR); + uint32_t device_id = 0; + float time_step = 0.001; + float cg_tolerance = 1e-6; + uint32_t max_num_cg_iter = 1000; bool use_uniform_laplace = false; char** argv; int argc; - bool shuffle = false; - bool sort = false; - } Arg; #include "mcf_openmesh.h" @@ -35,17 +32,9 @@ struct arg TEST(App, MCF) { - using namespace RXMESH; + using namespace rxmesh; using dataT = float; - if (Arg.shuffle) { - ASSERT_FALSE(Arg.sort) << " cannot shuffle and sort at the same time!"; - } - if (Arg.sort) { - ASSERT_FALSE(Arg.shuffle) - << " cannot shuffle and sort at the same time!"; - } - // Select device cuda_query(Arg.device_id); @@ -56,41 +45,24 @@ TEST(App, MCF) ASSERT_TRUE(import_obj(Arg.obj_file_name, Verts, Faces)); - if (Arg.shuffle) { - shuffle_obj(Faces, Verts); - } - - // Create RXMeshStatic instance. 
If Arg.sort is true, Faces and Verts will - // be sorted based on the patching happening inside RXMesh - RXMeshStatic rxmesh_static(Faces, Verts, Arg.sort, false); + RXMeshStatic rxmesh(Faces, false); - // Since OpenMesh only accepts input as obj files, if the input mesh is - // shuffled or sorted, we have to write it to a temp file so that OpenMesh - // can pick it up TriMesh input_mesh; - if (Arg.sort || Arg.shuffle) { - export_obj(Faces, Verts, "temp.obj", false); - ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, "temp.obj")); - } else { - ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name)); - } - - //*** OpenMesh Impl - RXMESH::RXMeshAttribute ground_truth; - mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth); + ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name)); - //*** RXMesh Impl - mcf_rxmesh(rxmesh_static, Verts, ground_truth); + // OpenMesh Impl + std::vector> ground_truth(Verts); + mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth); - // Release allocation - ground_truth.release(); + // RXMesh Impl + mcf_rxmesh(rxmesh, Verts, ground_truth); } int main(int argc, char** argv) { - using namespace RXMESH; + using namespace rxmesh; Log::init(); ::testing::InitGoogleTest(&argc, argv); @@ -109,9 +81,7 @@ int main(int argc, char** argv) " -dt: Time step (delta t). Default is {} \n" " Hint: should be between (0.001, 1) for cotan Laplace or between (1, 100) for uniform Laplace\n" " -eps: Conjugate gradient tolerance. Default is {}\n" - " -max_cg_iter: Conjugate gradient maximum number of iterations. Default is {}\n" - " -s: Shuffle input. Default is false.\n" - " -p: Sort input using patching output. Default is false\n" + " -max_cg_iter: Conjugate gradient maximum number of iterations. Default is {}\n" " -device_id: GPU device ID. Default is {}", Arg.obj_file_name, Arg.output_folder, (Arg.use_uniform_laplace? 
"true" : "false"), Arg.time_step, Arg.cg_tolerance, Arg.max_num_cg_iter, Arg.device_id); // clang-format on @@ -141,12 +111,6 @@ int main(int argc, char** argv) if (cmd_option_exists(argv, argc + argv, "-uniform_laplace")) { Arg.use_uniform_laplace = true; } - if (cmd_option_exists(argv, argc + argv, "-s")) { - Arg.shuffle = true; - } - if (cmd_option_exists(argv, argc + argv, "-p")) { - Arg.sort = true; - } if (cmd_option_exists(argv, argc + argv, "-device_id")) { Arg.device_id = atoi(get_cmd_option(argv, argv + argc, "-device_id")); diff --git a/apps/MCF/mcf_openmesh.h b/apps/MCF/mcf_openmesh.h index e7280b6b..ed9ef468 100644 --- a/apps/MCF/mcf_openmesh.h +++ b/apps/MCF/mcf_openmesh.h @@ -2,7 +2,6 @@ #include "../common/openmesh_report.h" #include "../common/openmesh_trimesh.h" #include "mcf_util.h" -#include "rxmesh/rxmesh_attribute.h" #include "rxmesh/util/timer.h" #include "rxmesh/util/vector.h" @@ -10,24 +9,24 @@ * axpy3() */ template -void axpy3(const RXMESH::RXMeshAttribute& X, - RXMESH::Vector<3, T> alpha, - RXMESH::Vector<3, T> beta, - RXMESH::RXMeshAttribute& Y, - const int num_omp_threads) +void axpy3(const std::vector>& X, + const T alpha, + const T beta, + std::vector>& Y, + const int num_omp_threads) { // Y = beta*Y + alpha*X - int size = static_cast(X.get_num_mesh_elements()); + int size = static_cast(X.size()); #pragma omp parallel for schedule(static) num_threads(num_omp_threads) for (int i = 0; i < size; ++i) { - Y(i, 0) *= beta[0]; - Y(i, 1) *= beta[1]; - Y(i, 2) *= beta[2]; + Y[i][0] *= beta; + Y[i][1] *= beta; + Y[i][2] *= beta; - Y(i, 0) += alpha[0] * X(i, 0); - Y(i, 1) += alpha[1] * X(i, 1); - Y(i, 2) += alpha[2] * X(i, 2); + Y[i][0] += alpha * X[i][0]; + Y[i][1] += alpha * X[i][1]; + Y[i][2] += alpha * X[i][2]; } } @@ -35,26 +34,23 @@ void axpy3(const RXMESH::RXMeshAttribute& X, * dot3() */ template -void dot3(const RXMESH::RXMeshAttribute& A, - const RXMESH::RXMeshAttribute& B, - RXMESH::Vector<3, T>& res, - const int num_omp_threads) +T 
dot3(const std::vector>& A, + const std::vector>& B, + const int num_omp_threads) { - // creating temp variables because variable in 'reduction' clause/directive - // cannot have reference type - T x_sum(0), y_sum(0), z_sum(0); - int size = static_cast(A.get_num_mesh_elements()); -#pragma omp parallel for schedule(static) num_threads(num_omp_threads) reduction(+ : x_sum,y_sum,z_sum) + T ret = 0; + int size = static_cast(A.size()); +#pragma omp parallel for schedule(static) num_threads(num_omp_threads) reduction(+ : ret) for (int i = 0; i < size; ++i) { - x_sum += A(i, 0) * B(i, 0); - y_sum += A(i, 1) * B(i, 1); - z_sum += A(i, 2) * B(i, 2); + T partial = 0; + for (size_t j = 0; j < A[i].size(); ++j) { + partial += A[i][j] * B[i][j]; + } + ret += partial; } - res[0] = x_sum; - res[1] = y_sum; - res[2] = z_sum; + return ret; } /** @@ -77,12 +73,12 @@ T partial_voronoi_area(const int p_id, // center assert((*q_it).idx() == q_id); assert((*r_it).idx() == r_id); - const RXMESH::Vector<3, T> p(mesh.point(*p_it)[0], mesh.point(*p_it)[1], - mesh.point(*p_it)[2]); - const RXMESH::Vector<3, T> q(mesh.point(*q_it)[0], mesh.point(*q_it)[1], - mesh.point(*q_it)[2]); - const RXMESH::Vector<3, T> r(mesh.point(*r_it)[0], mesh.point(*r_it)[1], - mesh.point(*r_it)[2]); + const rxmesh::Vector<3, T> p( + mesh.point(*p_it)[0], mesh.point(*p_it)[1], mesh.point(*p_it)[2]); + const rxmesh::Vector<3, T> q( + mesh.point(*q_it)[0], mesh.point(*q_it)[1], mesh.point(*q_it)[2]); + const rxmesh::Vector<3, T> r( + mesh.point(*r_it)[0], mesh.point(*r_it)[1], mesh.point(*r_it)[2]); return partial_voronoi_area(p, q, r); } @@ -106,24 +102,24 @@ T edge_cotan_weight(const int p_id, TriMesh::VertexIter q_it = mesh.vertices_begin() + q_id; TriMesh::VertexIter s_it = mesh.vertices_begin() + s_id; - const RXMESH::Vector<3, T> p(mesh.point(*p_it)[0], mesh.point(*p_it)[1], - mesh.point(*p_it)[2]); - const RXMESH::Vector<3, T> r(mesh.point(*r_it)[0], mesh.point(*r_it)[1], - mesh.point(*r_it)[2]); - const 
RXMESH::Vector<3, T> q(mesh.point(*q_it)[0], mesh.point(*q_it)[1], - mesh.point(*q_it)[2]); - const RXMESH::Vector<3, T> s(mesh.point(*s_it)[0], mesh.point(*s_it)[1], - mesh.point(*s_it)[2]); + const rxmesh::Vector<3, T> p( + mesh.point(*p_it)[0], mesh.point(*p_it)[1], mesh.point(*p_it)[2]); + const rxmesh::Vector<3, T> r( + mesh.point(*r_it)[0], mesh.point(*r_it)[1], mesh.point(*r_it)[2]); + const rxmesh::Vector<3, T> q( + mesh.point(*q_it)[0], mesh.point(*q_it)[1], mesh.point(*q_it)[2]); + const rxmesh::Vector<3, T> s( + mesh.point(*s_it)[0], mesh.point(*s_it)[1], mesh.point(*s_it)[2]); return edge_cotan_weight(p, r, q, s); } template -void mcf_matvec(TriMesh& mesh, - const RXMESH::RXMeshAttribute& in, - RXMESH::RXMeshAttribute& out, - const int num_omp_threads) +void mcf_matvec(TriMesh& mesh, + const std::vector>& in, + std::vector>& out, + const int num_omp_threads) { // Matrix vector multiplication operation based on uniform Laplacian weight // defined in Equation 7 in Implicit Fairing of Irregular Meshes using @@ -153,7 +149,7 @@ void mcf_matvec(TriMesh& mesh, TriMesh::VertexIter p_iter = mesh.vertices_begin() + p_id; // Off-diagonal entries - RXMESH::Vector<3, T> x(T(0)); + rxmesh::Vector<3, T> x(T(0)); T sum_e_weight(0); // vertex weight @@ -170,7 +166,8 @@ void mcf_matvec(TriMesh& mesh, assert(s_iter.is_valid()); for (TriMesh::VertexVertexIter r_iter = mesh.vv_iter(*p_iter); - r_iter.is_valid(); ++r_iter) { + r_iter.is_valid(); + ++r_iter) { int r_id = (*r_iter).idx(); @@ -180,17 +177,18 @@ void mcf_matvec(TriMesh& mesh, e_weight = 1; } else { e_weight = std::max( - T(0.0), edge_cotan_weight(p_id, r_id, (*q_iter).idx(), - (*s_iter).idx(), mesh)); + T(0.0), + edge_cotan_weight( + p_id, r_id, (*q_iter).idx(), (*s_iter).idx(), mesh)); ++s_iter; } e_weight *= static_cast(Arg.time_step); sum_e_weight += e_weight; - x[0] -= e_weight * in(r_id, 0); - x[1] -= e_weight * in(r_id, 1); - x[2] -= e_weight * in(r_id, 2); + x[0] -= e_weight * in[r_id][0]; + x[1] -= 
e_weight * in[r_id][1]; + x[2] -= e_weight * in[r_id][2]; if (Arg.use_uniform_laplace) { ++v_weight; @@ -215,10 +213,10 @@ void mcf_matvec(TriMesh& mesh, assert(!std::isnan(v_weight)); assert(!std::isinf(v_weight)); - T diag = ((1.0 / v_weight) + sum_e_weight); - out(p_id, 0) = x[0] + diag * in(p_id, 0); - out(p_id, 1) = x[1] + diag * in(p_id, 1); - out(p_id, 2) = x[2] + diag * in(p_id, 2); + T diag = ((1.0 / v_weight) + sum_e_weight); + out[p_id][0] = x[0] + diag * in[p_id][0]; + out[p_id][1] = x[1] + diag * in[p_id][1]; + out[p_id][2] = x[2] + diag * in[p_id][2]; } } @@ -227,16 +225,16 @@ void mcf_matvec(TriMesh& mesh, * cg() */ template -void cg(TriMesh& mesh, - RXMESH::RXMeshAttribute& X, - RXMESH::RXMeshAttribute& B, - RXMESH::RXMeshAttribute& R, - RXMESH::RXMeshAttribute& P, - RXMESH::RXMeshAttribute& S, - uint32_t& num_cg_iter_taken, - RXMESH::Vector<3, T>& start_residual, - RXMESH::Vector<3, T>& stop_residual, - const int num_omp_threads) +void cg(TriMesh& mesh, + std::vector>& X, + std::vector>& B, + std::vector>& R, + std::vector>& P, + std::vector>& S, + uint32_t& num_cg_iter_taken, + T& start_residual, + T& stop_residual, + const int num_omp_threads) { // CG solver. 
Solve for the three coordinates simultaneously @@ -248,112 +246,93 @@ void cg(TriMesh& mesh, // p = r #pragma omp parallel for schedule(static) num_threads(num_omp_threads) for (int i = 0; i < int(mesh.n_vertices()); ++i) { - R(i, 0) = B(i, 0) - S(i, 0); - R(i, 1) = B(i, 1) - S(i, 1); - R(i, 2) = B(i, 2) - S(i, 2); + R[i][0] = B[i][0] - S[i][0]; + R[i][1] = B[i][1] - S[i][1]; + R[i][2] = B[i][2] - S[i][2]; - P(i, 0) = R(i, 0); - P(i, 1) = R(i, 1); - P(i, 2) = R(i, 2); + P[i][0] = R[i][0]; + P[i][1] = R[i][1]; + P[i][2] = R[i][2]; } // delta_new = - RXMESH::Vector<3, T> delta_new; - dot3(R, R, delta_new, num_omp_threads); + T delta_new = dot3(R, R, num_omp_threads); // delta_0 = delta_new - const RXMESH::Vector<3, T> delta_0(delta_new); + const T delta_0(delta_new); start_residual = delta_0; - const RXMESH::Vector<3, T> ones(1); - uint32_t iter = 0; + uint32_t iter = 0; while (iter < Arg.max_num_cg_iter) { // s = Ap mcf_matvec(mesh, P, S, num_omp_threads); // alpha = delta_new / - RXMESH::Vector<3, T> alpha; - dot3(S, P, alpha, num_omp_threads); - alpha = delta_new / alpha; + T alpha = dot3(S, P, num_omp_threads); + alpha = delta_new / alpha; // x = x + alpha*p - axpy3(P, alpha, ones, X, num_omp_threads); + axpy3(P, alpha, T(1), X, num_omp_threads); // r = r - alpha*s - axpy3(S, -alpha, ones, R, num_omp_threads); + axpy3(S, -alpha, T(1), R, num_omp_threads); // delta_old = delta_new - RXMESH::Vector<3, T> delta_old(delta_new); + T delta_old(delta_new); // delta_new = - dot3(R, R, delta_new, num_omp_threads); + delta_new = dot3(R, R, num_omp_threads); // beta = delta_new/delta_old - RXMESH::Vector<3, T> beta(delta_new / delta_old); + T beta(delta_new / delta_old); // exit if error is getting too low across three coordinates - if (delta_new[0] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[0] && - delta_new[1] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[1] && - delta_new[2] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[2]) { + if (delta_new < Arg.cg_tolerance * 
Arg.cg_tolerance * delta_0) { break; } // p = beta*p + r - axpy3(R, ones, beta, P, num_omp_threads); + axpy3(R, T(1), beta, P, num_omp_threads); ++iter; } num_cg_iter_taken = iter; - stop_residual = delta_new; + stop_residual = delta_new; } /** * implicit_smoothing() */ template -void implicit_smoothing(TriMesh& mesh, - RXMESH::RXMeshAttribute& X, - uint32_t& num_cg_iter_taken, - float& time, - RXMESH::Vector<3, T>& start_residual, - RXMESH::Vector<3, T>& stop_residual, - const int num_omp_threads) +void implicit_smoothing(TriMesh& mesh, + std::vector>& X, + uint32_t& num_cg_iter_taken, + float& time, + T& start_residual, + T& stop_residual, + const int num_omp_threads) { for (TriMesh::VertexIter v_it = mesh.vertices_begin(); - v_it != mesh.vertices_end(); ++v_it) { + v_it != mesh.vertices_end(); + ++v_it) { ASSERT_FALSE(mesh.is_boundary(*v_it)) << "OpenMesh MCF only takes watertight/closed mesh without " "boundaries"; } // CG containers - RXMESH::RXMeshAttribute B, R, P, S; - - X.init(mesh.n_vertices(), 3u, RXMESH::HOST); - X.reset(0.0, RXMESH::HOST); - - S.init(mesh.n_vertices(), 3u, RXMESH::HOST); - S.reset(0.0, RXMESH::HOST); - - P.init(mesh.n_vertices(), 3u, RXMESH::HOST); - P.reset(0.0, RXMESH::HOST); - - R.init(mesh.n_vertices(), 3u, RXMESH::HOST); - R.reset(0.0, RXMESH::HOST); - - B.init(mesh.n_vertices(), 3u, RXMESH::HOST); - B.reset(0.0, RXMESH::HOST); + std::vector> B(X), R(X), P(X), S(X); #pragma omp parallel for for (uint32_t v_id = 0; v_id < mesh.n_vertices(); ++v_id) { TriMesh::VertexIter v_iter = mesh.vertices_begin() + v_id; // LHS - X(v_id, 0) = mesh.point(*v_iter)[0]; - X(v_id, 1) = mesh.point(*v_iter)[1]; - X(v_id, 2) = mesh.point(*v_iter)[2]; + X[v_id][0] = mesh.point(*v_iter)[0]; + X[v_id][1] = mesh.point(*v_iter)[1]; + X[v_id][2] = mesh.point(*v_iter)[2]; // RHS T v_weight = 1; @@ -363,9 +342,9 @@ void implicit_smoothing(TriMesh& mesh, } // will fix it later for cotan weight - B(v_id, 0) = X(v_id, 0) * v_weight; - B(v_id, 1) = X(v_id, 1) * 
v_weight; - B(v_id, 2) = X(v_id, 2) * v_weight; + B[v_id][0] = X[v_id][0] * v_weight; + B[v_id][1] = X[v_id][1] * v_weight; + B[v_id][2] = X[v_id][2] * v_weight; } if (!Arg.use_uniform_laplace) { @@ -381,30 +360,39 @@ void implicit_smoothing(TriMesh& mesh, assert(q_iter.is_valid()); for (TriMesh::VertexVertexIter vv_iter = mesh.vv_iter(*v_iter); - vv_iter.is_valid(); ++vv_iter) { + vv_iter.is_valid(); + ++vv_iter) { - T tri_area = partial_voronoi_area(v_id, (*q_iter).idx(), - (*vv_iter).idx(), mesh); + T tri_area = partial_voronoi_area( + v_id, (*q_iter).idx(), (*vv_iter).idx(), mesh); v_weight += (tri_area > 0) ? tri_area : 0; q_iter++; assert(q_iter == vv_iter); } - v_weight = 0.5 / v_weight; - B(v_id, 0) = X(v_id, 0) / v_weight; - B(v_id, 1) = X(v_id, 1) / v_weight; - B(v_id, 2) = X(v_id, 2) / v_weight; + v_weight = 0.5 / v_weight; + B[v_id][0] = X[v_id][0] / v_weight; + B[v_id][1] = X[v_id][1] / v_weight; + B[v_id][2] = X[v_id][2] / v_weight; } } num_cg_iter_taken = 0; // solve - RXMESH::CPUTimer timer; + rxmesh::CPUTimer timer; timer.start(); - cg(mesh, X, B, R, P, S, num_cg_iter_taken, start_residual, stop_residual, + cg(mesh, + X, + B, + R, + P, + S, + num_cg_iter_taken, + start_residual, + stop_residual, num_omp_threads); timer.stop(); @@ -412,9 +400,9 @@ void implicit_smoothing(TriMesh& mesh, } template -void mcf_openmesh(const int num_omp_threads, - TriMesh& input_mesh, - RXMESH::RXMeshAttribute& smoothed_coord) +void mcf_openmesh(const int num_omp_threads, + TriMesh& input_mesh, + std::vector>& smoothed_coord) { // Report OpenMeshReport report("MCF_OpenMesh"); @@ -424,13 +412,6 @@ void mcf_openmesh(const int num_omp_threads, std::string method = "OpenMesh " + std::to_string(num_omp_threads) + " Core"; report.add_member("method", method); - std::string order = "default"; - if (Arg.shuffle) { - order = "shuffle"; - } else if (Arg.sort) { - order = "sorted"; - } - report.add_member("input_order", order); report.add_member("time_step", Arg.time_step); 
report.add_member("cg_tolerance", Arg.cg_tolerance); report.add_member("use_uniform_laplace", Arg.use_uniform_laplace); @@ -438,26 +419,33 @@ void mcf_openmesh(const int num_omp_threads, // implicit smoothing - uint32_t num_cg_iter_taken = 0; - float time = 0; - RXMESH::Vector<3, T> start_residual; - RXMESH::Vector<3, T> stop_residual; - - implicit_smoothing(input_mesh, smoothed_coord, num_cg_iter_taken, time, - start_residual, stop_residual, num_omp_threads); + uint32_t num_cg_iter_taken = 0; + float time = 0; + T start_residual; + T stop_residual; + + implicit_smoothing(input_mesh, + smoothed_coord, + num_cg_iter_taken, + time, + start_residual, + stop_residual, + num_omp_threads); RXMESH_TRACE( "mcf_openmesh() took {} (ms) and {} iterations (i.e., {} ms/iter) ", - time, num_cg_iter_taken, time / float(num_cg_iter_taken)); + time, + num_cg_iter_taken, + time / float(num_cg_iter_taken)); // write output //#pragma omp parallel for // for (int v_id = 0; v_id < int(input_mesh.n_vertices()); ++v_id) { // TriMesh::VertexIter v_iter = input_mesh.vertices_begin() + v_id; - // input_mesh.point(*v_iter)[0] = smoothed_coord(v_id, 0); - // input_mesh.point(*v_iter)[1] = smoothed_coord(v_id, 1); - // input_mesh.point(*v_iter)[2] = smoothed_coord(v_id, 2); + // input_mesh.point(*v_iter)[0] = smoothed_coord[v_id][0]; + // input_mesh.point(*v_iter)[1] = smoothed_coord[v_id][1]; + // input_mesh.point(*v_iter)[2] = smoothed_coord[v_id][2]; // } // std::string fn = STRINGIFY(OUTPUT_DIR) "mcf_openmesh.obj"; // if (!OpenMesh::IO::write_mesh(input_mesh, fn)) { @@ -465,17 +453,17 @@ void mcf_openmesh(const int num_omp_threads, // } // Finalize report - report.add_member("start_residual", to_string(start_residual)); - report.add_member("end_residual", to_string(stop_residual)); + report.add_member("start_residual", start_residual); + report.add_member("end_residual", stop_residual); report.add_member("num_cg_iter_taken", num_cg_iter_taken); report.add_member("total_time (ms)", time); - 
RXMESH::TestData td; - td.test_name = "MCF"; + rxmesh::TestData td; + td.test_name = "MCF"; td.num_threads = num_omp_threads; td.time_ms.push_back(time / float(num_cg_iter_taken)); td.passed.push_back(true); report.add_test(td); report.write( Arg.output_folder + "/openmesh", - "MCF_OpenMesh_" + RXMESH::extract_file_name(Arg.obj_file_name)); + "MCF_OpenMesh_" + rxmesh::extract_file_name(Arg.obj_file_name)); } \ No newline at end of file diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index 37c23547..e3681c8b 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -2,19 +2,56 @@ #include #include "mcf_rxmesh_kernel.cuh" -#include "rxmesh/rxmesh_attribute.h" +#include "rxmesh/attribute.h" +#include "rxmesh/reduce_handle.h" #include "rxmesh/rxmesh_static.h" #include "rxmesh/util/report.h" #include "rxmesh/util/timer.h" #include "rxmesh/util/vector.h" - -template -void mcf_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, +template +void axpy(rxmesh::RXMeshStatic& rxmesh, + rxmesh::VertexAttribute& y, + const rxmesh::VertexAttribute& x, + const T alpha, + const T beta, + cudaStream_t stream = NULL) +{ + // Y = alpha*X + beta*Y + rxmesh.for_each_vertex( + rxmesh::DEVICE, + [y, x, alpha, beta] __device__(const rxmesh::VertexHandle vh) { + for (uint32_t i = 0; i < 3; ++i) { + y(vh, i) = alpha * x(vh, i) + beta * y(vh, i); + } + }); +} + +template +void init_PR(rxmesh::RXMeshStatic& rxmesh, + const rxmesh::VertexAttribute& B, + const rxmesh::VertexAttribute& S, + rxmesh::VertexAttribute& R, + rxmesh::VertexAttribute& P) +{ + rxmesh.for_each_vertex( + rxmesh::DEVICE, [B, S, R, P] __device__(const rxmesh::VertexHandle vh) { + R(vh, 0) = B(vh, 0) - S(vh, 0); + R(vh, 1) = B(vh, 1) - S(vh, 1); + R(vh, 2) = B(vh, 2) - S(vh, 2); + + P(vh, 0) = R(vh, 0); + P(vh, 1) = R(vh, 1); + P(vh, 2) = R(vh, 2); + }); +} + +template +void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, const std::vector>& Verts, - const RXMESH::RXMeshAttribute& ground_truth) + const std::vector>& 
ground_truth) { - using namespace RXMESH; + using namespace rxmesh; constexpr uint32_t blockThreads = 256; // Report @@ -22,122 +59,123 @@ void mcf_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, report.command_line(Arg.argc, Arg.argv); report.device(); report.system(); - report.model_data(Arg.obj_file_name, rxmesh_static); + report.model_data(Arg.obj_file_name, rxmesh); report.add_member("method", std::string("RXMesh")); - std::string order = "default"; - if (Arg.shuffle) { - order = "shuffle"; - } else if (Arg.sort) { - order = "sorted"; - } - report.add_member("input_order", order); report.add_member("time_step", Arg.time_step); report.add_member("cg_tolerance", Arg.cg_tolerance); report.add_member("use_uniform_laplace", Arg.use_uniform_laplace); report.add_member("max_num_cg_iter", Arg.max_num_cg_iter); report.add_member("blockThreads", blockThreads); - ASSERT_TRUE(rxmesh_static.is_closed()) + ASSERT_TRUE(rxmesh.is_closed()) << "mcf_rxmesh only takes watertight/closed mesh without boundaries"; // Different attributes used throughout the application - RXMeshAttribute input_coord; - input_coord.set_name("coord"); - input_coord.init(Verts.size(), 3u, RXMESH::LOCATION_ALL); - for (uint32_t i = 0; i < Verts.size(); ++i) { - for (uint32_t j = 0; j < Verts[i].size(); ++j) { - input_coord(i, j) = Verts[i][j]; - } - } - input_coord.change_layout(RXMESH::HOST); - input_coord.move(RXMESH::HOST, RXMESH::DEVICE); + auto input_coord = + rxmesh.add_vertex_attribute(Verts, "coord", rxmesh::LOCATION_ALL); // S in CG - RXMeshAttribute S; - S.set_name("S"); - S.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::DEVICE, RXMESH::SoA); - S.reset(0.0, RXMESH::DEVICE); + auto S = + rxmesh.add_vertex_attribute("S", 3, rxmesh::DEVICE, rxmesh::SoA); + S->reset(0.0, rxmesh::DEVICE); // P in CG - RXMeshAttribute P; - P.set_name("P"); - P.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::DEVICE, RXMESH::SoA); - P.reset(0.0, RXMESH::DEVICE); + auto P = + rxmesh.add_vertex_attribute("P", 3, 
rxmesh::DEVICE, rxmesh::SoA); + P->reset(0.0, rxmesh::DEVICE); // R in CG - RXMeshAttribute R; - R.set_name("P"); - R.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::DEVICE, RXMESH::SoA); - R.reset(0.0, RXMESH::DEVICE); + auto R = + rxmesh.add_vertex_attribute("R", 3, rxmesh::DEVICE, rxmesh::SoA); + R->reset(0.0, rxmesh::DEVICE); // B in CG - RXMeshAttribute B; - B.set_name("B"); - B.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::DEVICE, RXMESH::SoA); - B.reset(0.0, RXMESH::DEVICE); - - // X in CG - RXMeshAttribute X; - X.set_name("X"); - X.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::LOCATION_ALL, - RXMESH::SoA); - X.copy(input_coord, RXMESH::HOST, RXMESH::DEVICE); + auto B = + rxmesh.add_vertex_attribute("B", 3, rxmesh::DEVICE, rxmesh::SoA); + B->reset(0.0, rxmesh::DEVICE); + + // X in CG (the output) + auto X = rxmesh.add_vertex_attribute(Verts, "X", rxmesh::LOCATION_ALL); + + ReduceHandle reduce_handle(*X); // RXMesh launch box - LaunchBox launch_box; - rxmesh_static.prepare_launch_box(RXMESH::Op::VV, launch_box, false, true); + LaunchBox launch_box_init_B; + LaunchBox launch_box_matvec; + rxmesh.prepare_launch_box(rxmesh::Op::VV, + launch_box_init_B, + (void*)init_B, + true); + rxmesh.prepare_launch_box(rxmesh::Op::VV, + launch_box_matvec, + (void*)rxmesh_matvec, + true); // init kernel to initialize RHS (B) - init_B - <<>>( - rxmesh_static.get_context(), X, B, Arg.use_uniform_laplace); + init_B<<>>( + rxmesh.get_context(), *X, *B, Arg.use_uniform_laplace); // CG scalars - Vector<3, T> alpha(T(0)), beta(T(0)), delta_new(T(0)), delta_old(T(0)), - ones(T(1)); + T alpha(0), beta(0), delta_new(0), delta_old(0); GPUTimer timer; timer.start(); // s = Ax - mcf_matvec - <<>>( - rxmesh_static.get_context(), input_coord, X, S, - Arg.use_uniform_laplace, Arg.time_step); + rxmesh_matvec + <<>>(rxmesh.get_context(), + *input_coord, + *X, + *S, + Arg.use_uniform_laplace, + Arg.time_step); // r = b - s = b - Ax - // p=r - const uint32_t num_blocks = - 
DIVIDE_UP(rxmesh_static.get_num_vertices(), blockThreads); - init_PR<<>>(rxmesh_static.get_num_vertices(), - B, S, R, P); + // p=rk + init_PR(rxmesh, *B, *S, *R, *P); + // delta_new = - R.reduce(delta_new, RXMESH::NORM2); + delta_new = reduce_handle.norm2(*R); + delta_new *= delta_new; - const Vector<3, T> delta_0(delta_new); + const T delta_0(delta_new); uint32_t num_cg_iter_taken = 0; + GPUTimer matvec_timer; + float matvec_time = 0; + + while (num_cg_iter_taken < Arg.max_num_cg_iter) { // s = Ap - - mcf_matvec - <<>>( - rxmesh_static.get_context(), input_coord, P, S, - Arg.use_uniform_laplace, Arg.time_step); + matvec_timer.start(); + rxmesh_matvec + <<>>(rxmesh.get_context(), + *input_coord, + *P, + *S, + Arg.use_uniform_laplace, + Arg.time_step); + matvec_timer.stop(); + matvec_time += matvec_timer.elapsed_millis(); // alpha = delta_new / - S.reduce(alpha, RXMESH::DOT, &P); - + alpha = reduce_handle.dot(*S, *P); alpha = delta_new / alpha; - // x = x + alpha*p - X.axpy(P, alpha, ones); + // x = alpha*p + x + axpy(rxmesh, *X, *P, alpha, 1.f); - // r = r - alpha*s - R.axpy(S, -alpha, ones); + // r = - alpha*s + r + axpy(rxmesh, *R, *S, -alpha, 1.f); // delta_old = delta_new @@ -146,15 +184,14 @@ void mcf_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, // delta_new = - R.reduce(delta_new, RXMESH::NORM2); + delta_new = reduce_handle.norm2(*R); + delta_new *= delta_new; CUDA_ERROR(cudaStreamSynchronize(0)); // exit if error is getting too low across three coordinates - if (delta_new[0] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[0] && - delta_new[1] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[1] && - delta_new[2] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[2]) { + if (delta_new < Arg.cg_tolerance * Arg.cg_tolerance * delta_0) { break; } @@ -162,7 +199,7 @@ void mcf_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, beta = delta_new / delta_old; // p = beta*p + r - P.axpy(R, ones, beta); + axpy(rxmesh, *P, *R, 1.f, beta); ++num_cg_iter_taken; @@ -176,50 +213,52 @@ 
void mcf_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, RXMESH_TRACE( - "mcf_rxmesh() took {} (ms) and {} iterations (i.e., {} ms/iter) ", - timer.elapsed_millis(), num_cg_iter_taken, - timer.elapsed_millis() / float(num_cg_iter_taken)); + "mcf_rxmesh() took {} (ms) and {} iterations (i.e., {} ms/iter), " + "mat_vec time {} (ms) (i.e., {} ms/iter)", + timer.elapsed_millis(), + num_cg_iter_taken, + timer.elapsed_millis() / float(num_cg_iter_taken), + matvec_time, + matvec_time / float(num_cg_iter_taken)); // move output to host - X.move(RXMESH::DEVICE, RXMESH::HOST); + X->move(rxmesh::DEVICE, rxmesh::HOST); // output to obj - // rxmesh_static.exportOBJ("mcf_rxmesh.obj", - // [&X](uint32_t i, uint32_t j) { return X(i, j); }); + // rxmesh.export_obj("mcf_rxmesh.obj", *X); // Verify + const T tol = 0.001; bool passed = true; - const T tol = 0.001; - for (uint32_t v = 0; v < X.get_num_mesh_elements(); ++v) { - if (std::fabs(X(v, 0) - ground_truth(v, 0)) > - tol * std::fabs(ground_truth(v, 0)) || - std::fabs(X(v, 1) - ground_truth(v, 1)) > - tol * std::fabs(ground_truth(v, 1)) || - std::fabs(X(v, 2) - ground_truth(v, 2)) > - tol * std::fabs(ground_truth(v, 2))) { - passed = false; - break; + rxmesh.for_each_vertex(HOST, [&](const VertexHandle& vh) { + uint32_t v_id = rxmesh.map_to_global(vh); + + for (uint32_t i = 0; i < 3; ++i) { + if (std::abs(((*X)(vh, i) - ground_truth[v_id][i]) / + ground_truth[v_id][i]) > tol) { + passed = false; + break; + } } - } + }); EXPECT_TRUE(passed); - // Release allocation - X.release(); - B.release(); - S.release(); - R.release(); - P.release(); - input_coord.release(); // Finalize report - report.add_member("start_residual", to_string(delta_0)); - report.add_member("end_residual", to_string(delta_new)); + report.add_member("start_residual", delta_0); + report.add_member("end_residual", delta_new); report.add_member("num_cg_iter_taken", num_cg_iter_taken); report.add_member("total_time (ms)", timer.elapsed_millis()); + 
report.add_member("matvec_time (ms)", matvec_time); TestData td; - td.test_name = "MCF"; - td.time_ms.push_back(timer.elapsed_millis() / float(num_cg_iter_taken)); + td.test_name = "MCF"; + td.num_threads = launch_box_matvec.num_threads; + td.num_blocks = launch_box_matvec.blocks; + td.dyn_smem = launch_box_matvec.smem_bytes_dyn; + td.static_smem = launch_box_matvec.smem_bytes_static; + td.num_reg = launch_box_matvec.num_registers_per_thread; td.passed.push_back(passed); + td.time_ms.push_back(timer.elapsed_millis() / float(num_cg_iter_taken)); report.add_test(td); report.write(Arg.output_folder + "/rxmesh", "MCF_RXMesh_" + extract_file_name(Arg.obj_file_name)); diff --git a/apps/MCF/mcf_rxmesh_kernel.cuh b/apps/MCF/mcf_rxmesh_kernel.cuh index 6e8cab7f..7439be1e 100644 --- a/apps/MCF/mcf_rxmesh_kernel.cuh +++ b/apps/MCF/mcf_rxmesh_kernel.cuh @@ -1,50 +1,25 @@ #pragma once #include "mcf_util.h" -#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh" -#include "rxmesh/rxmesh_attribute.h" -#include "rxmesh/rxmesh_context.h" -#include "rxmesh/util/math.h" +#include "rxmesh/attribute.h" +#include "rxmesh/context.h" +#include "rxmesh/kernels/query_dispatcher.cuh" #include "rxmesh/util/vector.h" -/** - * init_PR() - */ -template -__global__ static void init_PR(const uint32_t num_vertices, - const RXMESH::RXMeshAttribute B, - const RXMESH::RXMeshAttribute S, - RXMESH::RXMeshAttribute R, - RXMESH::RXMeshAttribute P) -{ - // r = b-s = b - Ax - // p= r - uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < num_vertices) { - R(idx, 0) = B(idx, 0) - S(idx, 0); - R(idx, 1) = B(idx, 1) - S(idx, 1); - R(idx, 2) = B(idx, 2) - S(idx, 2); - - P(idx, 0) = R(idx, 0); - P(idx, 1) = R(idx, 1); - P(idx, 2) = R(idx, 2); - } -} - /** * edge_cotan_weight() */ template __device__ __forceinline__ T -edge_cotan_weight(const uint32_t p_id, - const uint32_t r_id, - const uint32_t q_id, - const uint32_t s_id, - const RXMESH::RXMeshAttribute& X) +edge_cotan_weight(const 
rxmesh::VertexHandle& p_id, + const rxmesh::VertexHandle& r_id, + const rxmesh::VertexHandle& q_id, + const rxmesh::VertexHandle& s_id, + const rxmesh::VertexAttribute& X) { // Get the edge weight between the two vertices p-r where // q and s composes the diamond around p-r - using namespace RXMESH; + using namespace rxmesh; const Vector<3, T> p(X(p_id, 0), X(p_id, 1), X(p_id, 2)); const Vector<3, T> r(X(r_id, 0), X(r_id, 1), X(r_id, 2)); @@ -59,14 +34,14 @@ edge_cotan_weight(const uint32_t p_id, */ template __device__ __forceinline__ T -partial_voronoi_area(const uint32_t p_id, // center - const uint32_t q_id, // before center - const uint32_t r_id, // after center - const RXMESH::RXMeshAttribute& X) +partial_voronoi_area(const rxmesh::VertexHandle& p_id, // center + const rxmesh::VertexHandle& q_id, // before center + const rxmesh::VertexHandle& r_id, // after center + const rxmesh::VertexAttribute& X) { // compute partial Voronoi area of the center vertex that is associated with // the triangle p->q->r (oriented ccw) - using namespace RXMESH; + using namespace rxmesh; const Vector<3, T> p(X(p_id, 0), X(p_id, 1), X(p_id, 2)); const Vector<3, T> q(X(q_id, 0), X(q_id, 1), X(q_id, 2)); @@ -79,31 +54,30 @@ partial_voronoi_area(const uint32_t p_id, // center * init_B() */ template -__launch_bounds__(blockThreads) __global__ - static void init_B(const RXMESH::RXMeshContext context, - const RXMESH::RXMeshAttribute X, - RXMESH::RXMeshAttribute B, - const bool use_uniform_laplace) +__global__ static void init_B(const rxmesh::Context context, + const rxmesh::VertexAttribute X, + rxmesh::VertexAttribute B, + const bool use_uniform_laplace) { - using namespace RXMESH; + using namespace rxmesh; - auto init_lambda = [&](uint32_t p_id, RXMeshIterator& iter) { + auto init_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) { if (use_uniform_laplace) { const T valence = static_cast(iter.size()); - B(p_id, 0) = X(p_id, 0) * valence; - B(p_id, 1) = X(p_id, 1) * valence; - 
B(p_id, 2) = X(p_id, 2) * valence; + B(p_id, 0) = X(p_id, 0) * valence; + B(p_id, 1) = X(p_id, 1) * valence; + B(p_id, 2) = X(p_id, 2) * valence; } else { // using Laplace weights T v_weight = 0; // this is the last vertex in the one-ring (before r_id) - uint32_t q_id = iter.back(); + VertexHandle q_id = iter.back(); for (uint32_t v = 0; v < iter.size(); ++v) { // the current one ring vertex - uint32_t r_id = iter[v]; + VertexHandle r_id = iter[v]; T tri_area = partial_voronoi_area(p_id, q_id, r_id, X); @@ -121,21 +95,20 @@ __launch_bounds__(blockThreads) __global__ // With uniform Laplacian, we just need the valence, thus we // call query_block_dispatcher and set oriented to false - query_block_dispatcher(context, init_lambda, - !use_uniform_laplace); + query_block_dispatcher( + context, init_lambda, !use_uniform_laplace); } /** * mcf_matvec() */ template -__launch_bounds__(blockThreads) __global__ - static void mcf_matvec(const RXMESH::RXMeshContext context, - const RXMESH::RXMeshAttribute coords, - const RXMESH::RXMeshAttribute in, - RXMESH::RXMeshAttribute out, - const bool use_uniform_laplace, - const T time_step) +__global__ static void rxmesh_matvec(const rxmesh::Context context, + const rxmesh::VertexAttribute coords, + const rxmesh::VertexAttribute in, + rxmesh::VertexAttribute out, + const bool use_uniform_laplace, + const T time_step) { // To compute the vertex cotan weight, we use the following configuration @@ -150,9 +123,9 @@ __launch_bounds__(blockThreads) __global__ \ | / p */ - using namespace RXMESH; + using namespace rxmesh; - auto matvec_lambda = [&](uint32_t p_id, RXMeshIterator& iter) { + auto matvec_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) { T sum_e_weight(0); Vector<3, T> x(T(0)); @@ -161,18 +134,19 @@ __launch_bounds__(blockThreads) __global__ T v_weight(0); // this is the last vertex in the one-ring (before r_id) - uint32_t q_id = iter.back(); + VertexHandle q_id = iter.back(); for (uint32_t v = 0; v < iter.size(); ++v) 
{ // the current one ring vertex - uint32_t r_id = iter[v]; + VertexHandle r_id = iter[v]; T e_weight = 0; if (use_uniform_laplace) { e_weight = 1; } else { // the second vertex in the one ring (after r_id) - uint32_t s_id = (v == iter.size() - 1) ? iter[0] : iter[v + 1]; + VertexHandle s_id = + (v == iter.size() - 1) ? iter[0] : iter[v + 1]; e_weight = edge_cotan_weight(p_id, r_id, q_id, s_id, coords); @@ -208,7 +182,7 @@ __launch_bounds__(blockThreads) __global__ assert(!isnan(v_weight)); assert(!isinf(v_weight)); - T diag = ((1.0 / v_weight) + sum_e_weight); + T diag = ((1.0 / v_weight) + sum_e_weight); out(p_id, 0) = x[0] + diag * in(p_id, 0); out(p_id, 1) = x[1] + diag * in(p_id, 1); out(p_id, 2) = x[2] + diag * in(p_id, 2); @@ -216,6 +190,6 @@ __launch_bounds__(blockThreads) __global__ // With uniform Laplacian, we just need the valence, thus we // call query_block_dispatcher and set oriented to false - query_block_dispatcher(context, matvec_lambda, - !use_uniform_laplace); + query_block_dispatcher( + context, matvec_lambda, !use_uniform_laplace); } \ No newline at end of file diff --git a/apps/MCF/mcf_util.h b/apps/MCF/mcf_util.h index 6481beff..90d23a26 100644 --- a/apps/MCF/mcf_util.h +++ b/apps/MCF/mcf_util.h @@ -10,7 +10,7 @@ __host__ __device__ __forceinline__ void clamp_cot(T& v) // clamp cotangent values as if angles are in[1, 179] const T bound = 19.1; // 3 degrees - v = (v < -bound) ? -bound : ((v > bound) ? bound : v); + v = (v < -bound) ? -bound : ((v > bound) ? 
bound : v); } /** @@ -18,14 +18,14 @@ __host__ __device__ __forceinline__ void clamp_cot(T& v) */ template __host__ __device__ __forceinline__ T -partial_voronoi_area(const RXMESH::Vector<3, T>& p, // center - const RXMESH::Vector<3, T>& q, // before center - const RXMESH::Vector<3, T>& r) // after center +partial_voronoi_area(const rxmesh::Vector<3, T>& p, // center + const rxmesh::Vector<3, T>& q, // before center + const rxmesh::Vector<3, T>& r) // after center { // compute partial Voronoi area of the center vertex that is associated with // the triangle p->q->r (oriented ccw) - using namespace RXMESH; + using namespace rxmesh; // Edge vector p->q const Vector<3, T> pq = q - p; @@ -78,14 +78,14 @@ partial_voronoi_area(const RXMESH::Vector<3, T>& p, // center */ template __host__ __device__ __forceinline__ T -edge_cotan_weight(const RXMESH::Vector<3, T>& p, - const RXMESH::Vector<3, T>& r, - const RXMESH::Vector<3, T>& q, - const RXMESH::Vector<3, T>& s) +edge_cotan_weight(const rxmesh::Vector<3, T>& p, + const rxmesh::Vector<3, T>& r, + const rxmesh::Vector<3, T>& q, + const rxmesh::Vector<3, T>& s) { // Get the edge weight between the two vertices p-r where // q and s composes the diamond around p-r - using namespace RXMESH; + using namespace rxmesh; auto partial_weight = [&](const Vector<3, T>& v) -> T { const Vector<3, T> d0 = p - v; diff --git a/apps/VertexNormal/CMakeLists.txt b/apps/VertexNormal/CMakeLists.txt index 597386fd..34f7eeb2 100644 --- a/apps/VertexNormal/CMakeLists.txt +++ b/apps/VertexNormal/CMakeLists.txt @@ -14,6 +14,8 @@ target_sources(VertexNormal set_target_properties(VertexNormal PROPERTIES FOLDER "apps") +set_property(TARGET VertexNormal PROPERTY CUDA_SEPARABLE_COMPILATION ON) + source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "VertexNormal" FILES ${SOURCE_LIST}) target_link_libraries( VertexNormal diff --git a/apps/VertexNormal/benchmark.sh b/apps/VertexNormal/benchmark.sh index afedd573..bb9d9264 100644 --- 
a/apps/VertexNormal/benchmark.sh +++ b/apps/VertexNormal/benchmark.sh @@ -1,5 +1,4 @@ #!/bin/bash -echo "This script re-generates RXMesh data in Figure 8(d) in the paper." echo "Please make sure to first compile the source code and then enter the input OBJ files directory." read -p "OBJ files directory (no trailing slash): " input_dir @@ -16,7 +15,7 @@ device_id=0 for file in $input_dir/*.obj; do if [ -f "$file" ]; then - echo $exe -p -input "$file" -num_run $num_run -device_id $device_id - $exe -p -input "$file" -num_run $num_run -device_id $device_id + echo $exe -input "$file" -num_run $num_run -device_id $device_id + $exe -input "$file" -num_run $num_run -device_id $device_id fi done \ No newline at end of file diff --git a/apps/VertexNormal/vertex_normal.cu b/apps/VertexNormal/vertex_normal.cu index bc27e538..3fe2da5b 100644 --- a/apps/VertexNormal/vertex_normal.cu +++ b/apps/VertexNormal/vertex_normal.cu @@ -4,7 +4,7 @@ #include #include "gtest/gtest.h" -#include "rxmesh/rxmesh_attribute.h" +#include "rxmesh/attribute.h" #include "rxmesh/rxmesh_static.h" #include "rxmesh/util/import_obj.h" #include "rxmesh/util/report.h" @@ -16,22 +16,20 @@ struct arg { std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj"; std::string output_folder = STRINGIFY(OUTPUT_DIR); - uint32_t num_run = 1; - uint32_t device_id = 0; + uint32_t num_run = 1; + uint32_t device_id = 0; char** argv; int argc; - bool shuffle = false; - bool sort = false; } Arg; #include "vertex_normal_hardwired.cuh" -template -void vertex_normal_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, +template +void vertex_normal_rxmesh(rxmesh::RXMeshStatic& rxmesh, const std::vector>& Verts, const std::vector& vertex_normal_gold) { - using namespace RXMESH; + using namespace rxmesh; constexpr uint32_t blockThreads = 256; // Report @@ -39,53 +37,41 @@ void vertex_normal_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, report.command_line(Arg.argc, Arg.argv); report.device(); report.system(); - 
report.model_data(Arg.obj_file_name, rxmesh_static); + report.model_data(Arg.obj_file_name, rxmesh); report.add_member("method", std::string("RXMesh")); - std::string order = "default"; - if (Arg.shuffle) { - order = "shuffle"; - } else if (Arg.sort) { - order = "sorted"; - } - report.add_member("input_order", order); report.add_member("blockThreads", blockThreads); - RXMeshAttribute coords; - coords.set_name("coord"); - coords.init(Verts.size(), 3u, RXMESH::LOCATION_ALL); - // fill in the coordinates - for (uint32_t i = 0; i < Verts.size(); ++i) { - for (uint32_t j = 0; j < Verts[i].size(); ++j) { - coords(i, j) = Verts[i][j]; - } - } - // move the coordinates to device - coords.move(RXMESH::HOST, RXMESH::DEVICE); + auto coords = rxmesh.add_vertex_attribute(Verts, "coordinates"); // normals - RXMeshAttribute rxmesh_normal; - rxmesh_normal.set_name("normal"); - rxmesh_normal.init(coords.get_num_mesh_elements(), 3u, - RXMESH::LOCATION_ALL); + auto v_normals = + rxmesh.add_vertex_attribute("v_normals", 3, rxmesh::LOCATION_ALL); // launch box LaunchBox launch_box; - rxmesh_static.prepare_launch_box(RXMESH::Op::FV, launch_box); + rxmesh.prepare_launch_box( + rxmesh::Op::FV, launch_box, (void*)compute_vertex_normal); TestData td; - td.test_name = "VertexNormal"; + td.test_name = "VertexNormal"; + td.num_threads = launch_box.num_threads; + td.num_blocks = launch_box.blocks; + td.dyn_smem = launch_box.smem_bytes_dyn; + td.static_smem = launch_box.smem_bytes_static; + td.num_reg = launch_box.num_registers_per_thread; float vn_time = 0; for (uint32_t itr = 0; itr < Arg.num_run; ++itr) { - rxmesh_normal.reset(0, RXMESH::DEVICE); + v_normals->reset(0, rxmesh::DEVICE); GPUTimer timer; timer.start(); - compute_vertex_normal - <<>>( - rxmesh_static.get_context(), coords, rxmesh_normal); + compute_vertex_normal<<>>( + rxmesh.get_context(), *coords, *v_normals); timer.stop(); CUDA_ERROR(cudaDeviceSynchronize()); @@ -99,17 +85,17 @@ void vertex_normal_rxmesh(RXMESH::RXMeshStatic& 
rxmesh_static, vn_time / Arg.num_run); // Verify - rxmesh_normal.move(RXMESH::DEVICE, RXMESH::HOST); + v_normals->move(rxmesh::DEVICE, rxmesh::HOST); - bool passed = compare(vertex_normal_gold.data(), - rxmesh_normal.get_pointer(RXMESH::HOST), - coords.get_num_mesh_elements() * 3, false); - td.passed.push_back(passed); - EXPECT_TRUE(passed) << " RXMesh Validation failed \n"; + rxmesh.for_each_vertex(HOST, [&](const VertexHandle& vh) { + uint32_t v_id = rxmesh.map_to_global(vh); - // Release allocation - rxmesh_normal.release(); - coords.release(); + for (uint32_t i = 0; i < 3; ++i) { + EXPECT_NEAR(std::abs(vertex_normal_gold[v_id * 3 + i]), + std::abs((*v_normals)(vh, i)), + 0.0001); + } + }); // Finalize report report.add_test(td); @@ -119,17 +105,9 @@ void vertex_normal_rxmesh(RXMESH::RXMeshStatic& rxmesh_static, TEST(Apps, VertexNormal) { - using namespace RXMESH; + using namespace rxmesh; using dataT = float; - if (Arg.shuffle) { - ASSERT_FALSE(Arg.sort) << " cannot shuffle and sort at the same time!"; - } - if (Arg.sort) { - ASSERT_FALSE(Arg.shuffle) - << " cannot shuffle and sort at the same time!"; - } - // Select device cuda_query(Arg.device_id); @@ -139,28 +117,23 @@ TEST(Apps, VertexNormal) ASSERT_TRUE(import_obj(Arg.obj_file_name, Verts, Faces)); - if (Arg.shuffle) { - shuffle_obj(Faces, Verts); - } - // Create RXMeshStatic instance. 
If Arg.sort is true, Faces and Verts will - // be sorted based on the patching happening inside RXMesh - RXMeshStatic rxmesh_static(Faces, Verts, Arg.sort, false); + RXMeshStatic rxmesh(Faces, false); - //*** Serial reference + // Serial reference std::vector vertex_normal_gold(3 * Verts.size()); vertex_normal_ref(Faces, Verts, vertex_normal_gold); - //*** RXMesh Impl - vertex_normal_rxmesh(rxmesh_static, Verts, vertex_normal_gold); + // RXMesh Impl + vertex_normal_rxmesh(rxmesh, Verts, vertex_normal_gold); - //*** Hardwired Impl + // Hardwired Impl vertex_normal_hardwired(Faces, Verts, vertex_normal_gold); } int main(int argc, char** argv) { - using namespace RXMESH; + using namespace rxmesh; Log::init(); ::testing::InitGoogleTest(&argc, argv); @@ -177,8 +150,6 @@ int main(int argc, char** argv) " Hint: Only accepts OBJ files\n" " -o: JSON file output folder. Default is {} \n" " -num_run: Number of iterations for performance testing. Default is {} \n" - " -s: Shuffle input. Default is false.\n" - " -p: Sort input using patching output. Default is false.\n" " -device_id: GPU device ID. 
Default is {}", Arg.obj_file_name, Arg.output_folder, Arg.num_run, Arg.device_id); // clang-format on @@ -201,12 +172,6 @@ int main(int argc, char** argv) Arg.device_id = atoi(get_cmd_option(argv, argv + argc, "-device_id")); } - if (cmd_option_exists(argv, argc + argv, "-s")) { - Arg.shuffle = true; - } - if (cmd_option_exists(argv, argc + argv, "-p")) { - Arg.sort = true; - } } RXMESH_TRACE("input= {}", Arg.obj_file_name); diff --git a/apps/VertexNormal/vertex_normal_hardwired.cuh b/apps/VertexNormal/vertex_normal_hardwired.cuh index c11bf837..7ab1a700 100644 --- a/apps/VertexNormal/vertex_normal_hardwired.cuh +++ b/apps/VertexNormal/vertex_normal_hardwired.cuh @@ -1,7 +1,6 @@ #pragma once #include #include "rxmesh/util/log.h" -#include "rxmesh/util/math.h" #include "rxmesh/util/report.h" template @@ -13,6 +12,28 @@ vertex_normal_hardwired_kernel(const uint32_t num_faces, T* d_vertex_normal) { uint32_t f_id = threadIdx.x + blockIdx.x * blockDim.x; + + auto l2_norm_sq = [](const T ax0, + const T ax1, + const T ax2, + const T bx0, + const T bx1, + const T bx2) { + // compute (xa0-xb0)*(xa0-xb0) + (xa1-xb1)*(xa1-xb1) + + // (xa2-xb2)*(xa2-xb2) + T x0 = ax0 - bx0; + T x1 = ax1 - bx1; + T x2 = ax2 - bx2; + return x0 * x0 + x1 * x1 + x2 * x2; + }; + + auto cross_product = + [](T xv1, T yv1, T zv1, T xv2, T yv2, T zv2, T& xx, T& yy, T& zz) { + xx = yv1 * zv2 - zv1 * yv2; + yy = zv1 * xv2 - xv1 * zv2; + zz = xv1 * yv2 - yv1 * xv2; + }; + if (f_id < num_faces) { uint32_t v0 = d_faces[f_id * 3]; uint32_t v1 = d_faces[f_id * 3 + 1]; @@ -29,11 +50,18 @@ vertex_normal_hardwired_kernel(const uint32_t num_faces, T nx, ny, nz; - RXMESH::cross_product(v1x - v0x, v1y - v0y, v1z - v0z, v2x - v0x, - v2y - v0y, v2z - v0z, nx, ny, nz); - T l0 = RXMESH::l2_norm_sq(v0x, v0y, v0z, v1x, v1y, v1z); // v0-v1 - T l1 = RXMESH::l2_norm_sq(v1x, v1y, v1z, v2x, v2y, v2z); // v1-v2 - T l2 = RXMESH::l2_norm_sq(v2x, v2y, v2z, v0x, v0y, v0z); // v2-v0 + cross_product(v1x - v0x, + v1y - v0y, + v1z - 
v0z, + v2x - v0x, + v2y - v0y, + v2z - v0z, + nx, + ny, + nz); + T l0 = l2_norm_sq(v0x, v0y, v0z, v1x, v1y, v1z); // v0-v1 + T l1 = l2_norm_sq(v1x, v1y, v1z, v2x, v2y, v2z); // v1-v2 + T l2 = l2_norm_sq(v2x, v2y, v2z, v0x, v0y, v0z); // v2-v0 atomicAdd(&d_vertex_normal[v0 * 3 + 0], nx / (l0 + l2)); atomicAdd(&d_vertex_normal[v0 * 3 + 1], ny / (l0 + l2)); @@ -55,9 +83,9 @@ inline void vertex_normal_hardwired( const std::vector>& Verts, const std::vector& vertex_normal_gold) { - using namespace RXMESH; + using namespace rxmesh; uint32_t num_vertices = Verts.size(); - uint32_t num_faces = Faces.size(); + uint32_t num_faces = Faces.size(); CustomReport report("VertexNormal_Hardwired"); report.command_line(Arg.argc, Arg.argv); @@ -65,13 +93,6 @@ inline void vertex_normal_hardwired( report.system(); report.model_data(Arg.obj_file_name, num_vertices, num_faces); report.add_member("method", std::string("Hardwired")); - std::string order = "default"; - if (Arg.shuffle) { - order = "shuffle"; - } else if (Arg.sort) { - order = "sorted"; - } - report.add_member("input_order", order); std::vector h_face(num_faces * 3); std::vector h_verts(num_vertices * 3); @@ -94,17 +115,20 @@ inline void vertex_normal_hardwired( CUDA_ERROR(cudaMalloc((void**)&d_face, 3 * num_faces * sizeof(uint32_t))); CUDA_ERROR(cudaMalloc((void**)&d_verts, 3 * num_vertices * sizeof(T))); CUDA_ERROR(cudaMalloc((void**)&d_normals, 3 * num_vertices * sizeof(T))); - CUDA_ERROR(cudaMemcpy(d_face, h_face.data(), + CUDA_ERROR(cudaMemcpy(d_face, + h_face.data(), h_face.size() * sizeof(uint32_t), cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy(d_verts, h_verts.data(), h_verts.size() * sizeof(T), + CUDA_ERROR(cudaMemcpy(d_verts, + h_verts.data(), + h_verts.size() * sizeof(T), cudaMemcpyHostToDevice)); const uint32_t threads = 256; - const uint32_t blocks = DIVIDE_UP(num_faces, threads); + const uint32_t blocks = DIVIDE_UP(num_faces, threads); TestData td; - td.test_name = "VertexNormal"; + td.test_name = 
"VertexNormal"; float vn_time = 0; for (uint32_t itr = 0; itr < Arg.num_run; ++itr) { CUDA_ERROR(cudaMemset(d_normals, 0, 3 * num_vertices * sizeof(T))); @@ -126,7 +150,8 @@ inline void vertex_normal_hardwired( T* verts_normal_hardwired; verts_normal_hardwired = (T*)malloc(num_vertices * 3 * sizeof(T)); - CUDA_ERROR(cudaMemcpy(verts_normal_hardwired, d_normals, + CUDA_ERROR(cudaMemcpy(verts_normal_hardwired, + d_normals, 3 * num_vertices * sizeof(T), cudaMemcpyDeviceToHost)); @@ -138,8 +163,10 @@ inline void vertex_normal_hardwired( RXMESH_TRACE("vertex_normal_hardwired() vertex normal kernel took {} (ms)", vn_time); - bool passed = compare(vertex_normal_gold.data(), verts_normal_hardwired, - Verts.size() * 3, false); + bool passed = compare(vertex_normal_gold.data(), + verts_normal_hardwired, + Verts.size() * 3, + false); td.passed.push_back(passed); EXPECT_TRUE(passed) << " Hardwired Validation failed \n"; @@ -148,6 +175,6 @@ inline void vertex_normal_hardwired( report.add_test(td); report.write( - Arg.output_folder + "/hardwired/" + order, + Arg.output_folder + "/hardwired", "VertexNormal_Hardwired_" + extract_file_name(Arg.obj_file_name)); } \ No newline at end of file diff --git a/apps/VertexNormal/vertex_normal_kernel.cuh b/apps/VertexNormal/vertex_normal_kernel.cuh index fe775d26..67ccb727 100644 --- a/apps/VertexNormal/vertex_normal_kernel.cuh +++ b/apps/VertexNormal/vertex_normal_kernel.cuh @@ -1,21 +1,19 @@ #pragma once -#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh" -#include "rxmesh/rxmesh_attribute.h" -#include "rxmesh/rxmesh_context.h" -#include "rxmesh/util/math.h" +#include "rxmesh/attribute.h" +#include "rxmesh/context.h" +#include "rxmesh/kernels/query_dispatcher.cuh" #include "rxmesh/util/vector.h" /** * vertex_normal() */ template -__launch_bounds__(blockThreads, 6) __global__ - static void compute_vertex_normal(const RXMESH::RXMeshContext context, - RXMESH::RXMeshAttribute coords, - RXMESH::RXMeshAttribute normals) +__global__ static 
void compute_vertex_normal(const rxmesh::Context context, + rxmesh::VertexAttribute coords, + rxmesh::VertexAttribute normals) { - using namespace RXMESH; - auto vn_lambda = [&](uint32_t face_id, RXMeshIterator& fv) { + using namespace rxmesh; + auto vn_lambda = [&](FaceHandle face_id, VertexIterator& fv) { // get the face's three vertices coordinates Vector<3, T> c0(coords(fv[0], 0), coords(fv[0], 1), coords(fv[0], 2)); Vector<3, T> c1(coords(fv[1], 0), coords(fv[1], 1), coords(fv[1], 2)); diff --git a/apps/VertexNormal/vertex_normal_ref.h b/apps/VertexNormal/vertex_normal_ref.h index 486b7b2e..4ba55f8d 100644 --- a/apps/VertexNormal/vertex_normal_ref.h +++ b/apps/VertexNormal/vertex_normal_ref.h @@ -1,6 +1,5 @@ #pragma once #include -#include "rxmesh/util/math.h" #include "rxmesh/util/report.h" template @@ -9,7 +8,7 @@ inline void vertex_normal_ref(const std::vector>& Faces, std::vector& vertex_normal) { uint32_t num_vertices = Verts.size(); - uint32_t num_faces = Faces.size(); + uint32_t num_faces = Faces.size(); memset((void*)vertex_normal.data(), 0, vertex_normal.size() * sizeof(T)); @@ -17,46 +16,71 @@ inline void vertex_normal_ref(const std::vector>& Faces, uint32_t v[3]; T fn[3]; + auto l2_norm_sq = [](const T ax0, + const T ax1, + const T ax2, + const T bx0, + const T bx1, + const T bx2) { + // compute (xa0-xb0)*(xa0-xb0) + (xa1-xb1)*(xa1-xb1) + + // (xa2-xb2)*(xa2-xb2) + T x0 = ax0 - bx0; + T x1 = ax1 - bx1; + T x2 = ax2 - bx2; + return x0 * x0 + x1 * x1 + x2 * x2; + }; + + auto cross_product = + [](T xv1, T yv1, T zv1, T xv2, T yv2, T zv2, T& xx, T& yy, T& zz) { + xx = yv1 * zv2 - zv1 * yv2; + yy = zv1 * xv2 - xv1 * zv2; + zz = xv1 * yv2 - yv1 * xv2; + }; + for (uint32_t f = 0; f < num_faces; ++f) { v[0] = Faces[f][0]; v[1] = Faces[f][1]; v[2] = Faces[f][2]; - RXMESH::cross_product( - Verts[v[1]][0] - Verts[v[0]][0], Verts[v[1]][1] - Verts[v[0]][1], - Verts[v[1]][2] - Verts[v[0]][2], Verts[v[2]][0] - Verts[v[0]][0], - Verts[v[2]][1] - Verts[v[0]][1], 
Verts[v[2]][2] - Verts[v[0]][2], - fn[0], fn[1], fn[2]); + cross_product(Verts[v[1]][0] - Verts[v[0]][0], + Verts[v[1]][1] - Verts[v[0]][1], + Verts[v[1]][2] - Verts[v[0]][2], + Verts[v[2]][0] - Verts[v[0]][0], + Verts[v[2]][1] - Verts[v[0]][1], + Verts[v[2]][2] - Verts[v[0]][2], + fn[0], + fn[1], + fn[2]); - edge_len[0] = - RXMESH::l2_norm_sq(Verts[v[0]][0], Verts[v[0]][1], Verts[v[0]][2], - Verts[v[1]][0], Verts[v[1]][1], - Verts[v[1]][2]); // v0-v1 + edge_len[0] = l2_norm_sq(Verts[v[0]][0], + Verts[v[0]][1], + Verts[v[0]][2], + Verts[v[1]][0], + Verts[v[1]][1], + Verts[v[1]][2]); // v0-v1 - edge_len[1] = - RXMESH::l2_norm_sq(Verts[v[1]][0], Verts[v[1]][1], Verts[v[1]][2], - Verts[v[2]][0], Verts[v[2]][1], - Verts[v[2]][2]); // v1-v2 + edge_len[1] = l2_norm_sq(Verts[v[1]][0], + Verts[v[1]][1], + Verts[v[1]][2], + Verts[v[2]][0], + Verts[v[2]][1], + Verts[v[2]][2]); // v1-v2 - edge_len[2] = - RXMESH::l2_norm_sq(Verts[v[2]][0], Verts[v[2]][1], Verts[v[2]][2], - Verts[v[0]][0], Verts[v[0]][1], - Verts[v[0]][2]); // v2-v0 + edge_len[2] = l2_norm_sq(Verts[v[2]][0], + Verts[v[2]][1], + Verts[v[2]][2], + Verts[v[0]][0], + Verts[v[0]][1], + Verts[v[0]][2]); // v2-v0 for (uint32_t i = 0; i < 3; ++i) { - uint32_t k = (i + 2) % 3; + uint32_t k = (i + 2) % 3; uint32_t base = 3 * v[i]; for (uint32_t l = 0; l < 3; ++l) { vertex_normal[base + l] += fn[l] / (edge_len[i] + edge_len[k]); } } - } - - /*for (T v = 0; v < num_vertices; ++v) { - T base = 3 * v; - normalize_vector(vertex_normal[base], vertex_normal[base + 1], - vertex_normal[base + 2]); - }*/ + } } \ No newline at end of file diff --git a/apps/common/openmesh_report.h b/apps/common/openmesh_report.h index 446d41f5..60de6b92 100644 --- a/apps/common/openmesh_report.h +++ b/apps/common/openmesh_report.h @@ -1,12 +1,12 @@ #include "rxmesh/util/report.h" -class OpenMeshReport : public RXMESH::Report +class OpenMeshReport : public rxmesh::Report { public: - OpenMeshReport() : RXMESH::Report() + OpenMeshReport() : 
rxmesh::Report() { } - OpenMeshReport(const std::string& record_name) : RXMESH::Report(record_name) + OpenMeshReport(const std::string& record_name) : rxmesh::Report(record_name) { } @@ -16,8 +16,8 @@ class OpenMeshReport : public RXMESH::Report subdoc.SetObject(); add_member("model_name", model_name, subdoc); - add_member("num_vertices", static_cast(mesh.n_vertices()), - subdoc); + add_member( + "num_vertices", static_cast(mesh.n_vertices()), subdoc); add_member("num_edges", static_cast(mesh.n_edges()), subdoc); add_member("num_faces", static_cast(mesh.n_faces()), subdoc); diff --git a/apps/common/openmesh_trimesh.h b/apps/common/openmesh_trimesh.h index dc9fb700..049b7ddd 100644 --- a/apps/common/openmesh_trimesh.h +++ b/apps/common/openmesh_trimesh.h @@ -5,7 +5,7 @@ #include struct MyTraits : public OpenMesh::DefaultTraits - //DefaultTraitsDouble +// DefaultTraitsDouble { VertexAttributes(OpenMesh::Attributes::Normal); diff --git a/cmake/AutoDetectCudaArch.cmake b/cmake/AutoDetectCudaArch.cmake index 4165d93a..a9dbf914 100644 --- a/cmake/AutoDetectCudaArch.cmake +++ b/cmake/AutoDetectCudaArch.cmake @@ -48,20 +48,24 @@ int main() { if(CUDA_RETURN_CODE EQUAL 0) set(CUDA_ARCHS ${fprintf_output} CACHE STRING "CUDA Arch") else() - message(STATUS "GPU architectures auto-detect failed. Will build for all possible architectures.") - set(CUDA_ARCHS "--generate-code=arch=compute_35,code=sm_35;" - "--generate-code=arch=compute_37,code=sm_37;" - "--generate-code=arch=compute_50,code=sm_50;" - "--generate-code=arch=compute_52,code=sm_52;" - "--generate-code=arch=compute_60,code=sm_60;" - "--generate-code=arch=compute_61,code=sm_61;" - "--generate-code=arch=compute_70,code=sm_70;" - "--generate-code=arch=compute_72,code=sm_72;" - "--generate-code=arch=compute_75,code=sm_75;" + message(STATUS "GPU architectures auto-detect failed. 
Will build for sm_70.") + set(CUDA_ARCHS #"--generate-code=arch=compute_35,code=sm_35;" + #"--generate-code=arch=compute_37,code=sm_37;" + #"--generate-code=arch=compute_50,code=sm_50;" + #"--generate-code=arch=compute_52,code=sm_52;" + #"--generate-code=arch=compute_60,code=sm_60;" + #"--generate-code=arch=compute_61,code=sm_61;" + --generate-code=arch=compute_70,code=sm_70; + #"--generate-code=arch=compute_72,code=sm_72;" + #"--generate-code=arch=compute_75,code=sm_75;" CACHE STRING "CUDA Arch") endif() endif() message(STATUS "CUDA_ARCHS= " ${CUDA_ARCHS}) + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + #https://gitlab.kitware.com/cmake/cmake/-/issues/18265 + list(APPEND CMAKE_CUDA_FLAGS "${CUDA_ARCHS}") + endif () endif() ################################################################################### diff --git a/include/rxmesh/attribute.h b/include/rxmesh/attribute.h new file mode 100644 index 00000000..22d413a2 --- /dev/null +++ b/include/rxmesh/attribute.h @@ -0,0 +1,884 @@ +#pragma once + +#include +#include + +#include "rxmesh/handle.h" +#include "rxmesh/kernels/attribute.cuh" +#include "rxmesh/kernels/collective.cuh" +#include "rxmesh/kernels/util.cuh" +#include "rxmesh/patch_info.h" +#include "rxmesh/types.h" +#include "rxmesh/util/cuda_query.h" +#include "rxmesh/util/log.h" +#include "rxmesh/util/util.h" +#include "rxmesh/util/vector.h" + +class RXMeshTest; + + +namespace rxmesh { + + +/** + * @brief Base untyped attributes used as an interface for attribute container + */ +class AttributeBase +{ + // our friend tester class + friend class ::RXMeshTest; + + public: + AttributeBase() = default; + + virtual const char* get_name() const = 0; + + virtual void release(locationT location = LOCATION_ALL) = 0; + + virtual ~AttributeBase() = default; +}; + +/** + * @brief Here we manage the attributes on top of the mesh. An attributes is + * attached to mesh element (e.g., vertices, edges, or faces). 
+ * largely inspired by + * https://github.com/gunrock/gunrock/blob/master/gunrock/util/array_utils.cuh + * It is discouraged to use Attribute directly in favor of using + * add_X_attributes() from RXMeshStatic where X is vertex, edge, or face. This + * way, the user does not have to specify the number of mesh elements or + * deallocate/release the Attribute (attribute garbage collection is managed by + * RXMeshStatic) + * @tparam T type of the attribute + */ +template +class Attribute : public AttributeBase +{ + template + friend class ReduceHandle; + + public: + /** + * @brief Default constructor which initializes all pointers to nullptr + */ + Attribute() + : AttributeBase(), + m_name(nullptr), + m_num_attributes(0), + m_allocated(LOCATION_NONE), + m_h_attr(nullptr), + m_h_ptr_on_device(nullptr), + m_d_attr(nullptr), + m_num_patches(0), + m_d_element_per_patch(nullptr), + m_h_element_per_patch(nullptr), + m_layout(AoS) + { + + this->m_name = (char*)malloc(sizeof(char) * 1); + this->m_name[0] = '\0'; + } + + /** + * @brief Main constructor + * @param name attribute name + */ + Attribute(const char* name) + : AttributeBase(), + m_name(nullptr), + m_num_attributes(0), + m_allocated(LOCATION_NONE), + m_h_attr(nullptr), + m_h_ptr_on_device(nullptr), + m_d_attr(nullptr), + m_num_patches(0), + m_d_element_per_patch(nullptr), + m_h_element_per_patch(nullptr), + m_layout(AoS) + { + if (name != nullptr) { + this->m_name = (char*)malloc(sizeof(char) * (strlen(name) + 1)); + strcpy(this->m_name, name); + } + } + + Attribute(const Attribute& rhs) = default; + + virtual ~Attribute() = default; + + /** + * @brief Get the name of the attribute + */ + const char* get_name() const + { + return m_name; + } + + /** + * @brief get the number of attributes per mesh element + */ + __host__ __device__ __forceinline__ uint32_t get_num_attributes() const + { + return this->m_num_attributes; + } + + /** + * @brief Flag that indicates where the memory is allocated + */ + __host__ 
__device__ __forceinline__ locationT get_allocated() const + { + return this->m_allocated; + } + + /** + * @brief Check if attribute is allocated on device + */ + __host__ __device__ __forceinline__ bool is_device_allocated() const + { + return ((m_allocated & DEVICE) == DEVICE); + } + + /** + * @brief Check if attribute is allocated on host + */ + __host__ __device__ __forceinline__ bool is_host_allocated() const + { + return ((m_allocated & HOST) == HOST); + } + + /** + * @brief Reset attribute to certain value + * @param value to be set + * @param location which location (device, host, or both) where attribute + * will be set + * @param stream in case of DEVICE, this is the stream that will be used to + * launch the reset kernel + */ + void reset(const T value, locationT location, cudaStream_t stream = NULL) + { + if ((location & DEVICE) == DEVICE) { + + assert((m_allocated & DEVICE) == DEVICE); + + const int threads = 256; + detail::template memset_attribute + <<>>(*this, + value, + m_d_element_per_patch, + m_num_patches, + m_num_attributes); + } + + + if ((location & HOST) == HOST) { + assert((m_allocated & HOST) == HOST); +#pragma omp parallel for + for (int p = 0; p < static_cast(m_num_patches); ++p) { + for (int e = 0; e < m_h_element_per_patch[p]; ++e) { + m_h_attr[p][e] = value; + } + } + } + } + + /** + * @brief Allocate memory for attribute. 
This is meant to be used by + * RXMeshStatic + * @param element_per_patch indicate the number of mesh element owned by + * each patch + * @param num_attributes number of attribute per mesh element + * @param location where the memory should reside (host, device, or both) + * @param layout memory layout in case num_attributes>1 + */ + void init(const std::vector& element_per_patch, + const uint32_t num_attributes, + locationT location = LOCATION_ALL, + const layoutT layout = AoS) + { + release(); + m_num_patches = element_per_patch.size(); + m_num_attributes = num_attributes; + m_layout = layout; + + if (m_num_patches == 0) { + return; + } + + allocate(element_per_patch.data(), location); + } + + /** + * @brief Copy memory from one location to another. If target is not + * allocated, it will be allocated first before copying the memory. + * @param source the source location + * @param target the destination location + * @param stream to be used to launch the kernel + * TODO it is better to launch a kernel that do the memcpy than relying on + * the host API from CUDA since all these small memcpy will be enqueued in + * the same stream and so serialized + */ + void move(locationT source, locationT target, cudaStream_t stream = NULL) + { + if (source == target) { + RXMESH_WARN( + "Attribute::move() source ({}) and target ({}) " + "are the same.", + location_to_string(source), + location_to_string(target)); + return; + } + + if ((source == HOST || source == DEVICE) && + ((source & m_allocated) != source)) { + RXMESH_ERROR( + "Attribute::move() moving source is not valid" + " because it was not allocated on source i.e., {}", + location_to_string(source)); + } + + if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && + ((target & m_allocated) != target)) { + RXMESH_WARN( + "Attribute::move() allocating target before moving to {}", + location_to_string(target)); + allocate(m_h_element_per_patch, target); + } + + if (this->m_num_patches == 0) { + return; + } + + 
if (source == HOST && target == DEVICE) {
            for (uint32_t p = 0; p < m_num_patches; ++p) {
                CUDA_ERROR(cudaMemcpyAsync(
                    m_h_ptr_on_device[p],
                    m_h_attr[p],
                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes,
                    cudaMemcpyHostToDevice,
                    stream));
            }
        } else if (source == DEVICE && target == HOST) {
            for (uint32_t p = 0; p < m_num_patches; ++p) {
                CUDA_ERROR(cudaMemcpyAsync(
                    m_h_attr[p],
                    m_h_ptr_on_device[p],
                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes,
                    cudaMemcpyDeviceToHost,
                    stream));
            }
        }
    }

    /**
     * @brief Release allocated memory in certain location
     * @param location where memory will be released
     */
    void release(locationT location = LOCATION_ALL)
    {
        if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) {
            for (uint32_t p = 0; p < m_num_patches; ++p) {
                free(m_h_attr[p]);
            }
            free(m_h_attr);
            m_h_attr = nullptr;
            free(m_h_element_per_patch);
            m_h_element_per_patch = nullptr;
            m_allocated = m_allocated & (~HOST);
        }

        if (((location & DEVICE) == DEVICE) &&
            ((m_allocated & DEVICE) == DEVICE)) {
            for (uint32_t p = 0; p < m_num_patches; ++p) {
                GPU_FREE(m_h_ptr_on_device[p]);
            }
            // BUGFIX: the host-side array holding the per-patch device
            // pointers was never freed here, leaking sizeof(T*) *
            // m_num_patches bytes on every device release
            free(m_h_ptr_on_device);
            m_h_ptr_on_device = nullptr;
            GPU_FREE(m_d_attr);
            GPU_FREE(m_d_element_per_patch);
            m_allocated = m_allocated & (~DEVICE);
        }
    }

    /**
     * @brief Deep copy from a source attribute. If source_flag and dst_flag
     * are both set to LOCATION_ALL, then we copy what is on host to host, and
     * what is on device to device. If source_flag is set to HOST (or DEVICE)
     * and dst_flag is set to LOCATION_ALL, then we copy source's HOST (or
     * DEVICE) to both HOST and DEVICE. Setting source_flag to
     * LOCATION_ALL while dst_flag is NOT set to LOCATION_ALL is invalid
     * because we don't know which source to copy from
     * @param source attribute to copy from
     * @param source_flag defines where we will copy from
     * @param dst_flag defines where we will copy to
     * @param stream used to launch kernel/memcpy
     */
    void copy_from(Attribute<T>& source,
                   locationT     source_flag,
                   locationT     dst_flag,
                   cudaStream_t  stream = NULL)
    {


        if (source.m_layout != m_layout) {
            RXMESH_ERROR(
                "Attribute::copy_from() does not support copy from "
                "source of different layout!");
        }

        if ((source_flag & LOCATION_ALL) == LOCATION_ALL &&
            (dst_flag & LOCATION_ALL) != LOCATION_ALL) {
            RXMESH_ERROR("Attribute::copy_from() Invalid configuration!");
        }

        if (m_num_attributes != source.get_num_attributes()) {
            RXMESH_ERROR(
                "Attribute::copy_from() number of attributes is "
                "different!");
        }

        if (this->is_empty() || this->m_num_patches == 0) {
            return;
        }

        // 1) copy from HOST to HOST
        if ((source_flag & HOST) == HOST && (dst_flag & HOST) == HOST) {
            if ((source_flag & source.m_allocated) != source_flag) {
                RXMESH_ERROR(
                    "Attribute::copy() copying source is not valid"
                    " because it was not allocated on host");
            }
            if ((dst_flag & m_allocated) != dst_flag) {
                RXMESH_ERROR(
                    "Attribute::copy() copying source is not valid"
                    " because location (this) was not allocated on host");
            }

            for (uint32_t p = 0; p < m_num_patches; ++p) {
                assert(m_h_element_per_patch[p] ==
                       source.m_h_element_per_patch[p]);
                // BUGFIX: a host-to-host copy must move the *host* buffers
                // (m_h_attr). The original memcpy'd m_h_ptr_on_device, i.e.,
                // it treated the arrays of raw device pointers as the
                // payload, corrupting nothing on device but never copying
                // the actual host data (and reading a possibly-null array
                // when DEVICE was never allocated)
                std::memcpy(
                    m_h_attr[p],
                    source.m_h_attr[p],
                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes);
            }
        }


        // 2) copy from DEVICE to DEVICE
        if ((source_flag & DEVICE) == DEVICE && (dst_flag & DEVICE) == DEVICE) {
            if ((source_flag & source.m_allocated) != source_flag) {
                RXMESH_ERROR(
                    "Attribute::copy() copying source is not valid"
                    " because it was not allocated on device");
            }
if ((dst_flag & m_allocated) != dst_flag) {
                RXMESH_ERROR(
                    "Attribute::copy() copying source is not valid"
                    " because location (this) was not allocated on device");
            }

            for (uint32_t p = 0; p < m_num_patches; ++p) {
                assert(m_h_element_per_patch[p] ==
                       source.m_h_element_per_patch[p]);
                CUDA_ERROR(cudaMemcpyAsync(
                    m_h_ptr_on_device[p],
                    source.m_h_ptr_on_device[p],
                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes,
                    cudaMemcpyDeviceToDevice,
                    stream));
            }
        }


        // 3) copy from DEVICE to HOST
        if ((source_flag & DEVICE) == DEVICE && (dst_flag & HOST) == HOST) {
            if ((source_flag & source.m_allocated) != source_flag) {
                // BUGFIX: the source of this case is the DEVICE side; the
                // message previously said "host"
                RXMESH_ERROR(
                    "Attribute::copy() copying source is not valid"
                    " because it was not allocated on device");
            }
            if ((dst_flag & m_allocated) != dst_flag) {
                // BUGFIX: the destination of this case is the HOST side; the
                // message previously said "device"
                RXMESH_ERROR(
                    "Attribute::copy() copying source is not valid"
                    " because location (this) was not allocated on host");
            }


            for (uint32_t p = 0; p < m_num_patches; ++p) {
                assert(m_h_element_per_patch[p] ==
                       source.m_h_element_per_patch[p]);
                CUDA_ERROR(cudaMemcpyAsync(
                    m_h_attr[p],
                    source.m_h_ptr_on_device[p],
                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes,
                    cudaMemcpyDeviceToHost,
                    stream));
            }
        }


        // 4) copy from HOST to DEVICE
        if ((source_flag & HOST) == HOST && (dst_flag & DEVICE) == DEVICE) {
            if ((source_flag & source.m_allocated) != source_flag) {
                // BUGFIX: the source of this case is the HOST side; the
                // message previously said "device"
                RXMESH_ERROR(
                    "Attribute::copy() copying source is not valid"
                    " because it was not allocated on host");
            }
            if ((dst_flag & m_allocated) != dst_flag) {
                // BUGFIX: the destination of this case is the DEVICE side;
                // the message previously said "host"
                RXMESH_ERROR(
                    "Attribute::copy() copying source is not valid"
                    " because location (this) was not allocated on device");
            }


            for (uint32_t p = 0; p < m_num_patches; ++p) {
                assert(m_h_element_per_patch[p] ==
                       source.m_h_element_per_patch[p]);
                CUDA_ERROR(cudaMemcpyAsync(
                    m_h_ptr_on_device[p],
                    source.m_h_attr[p],
                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes,
                    cudaMemcpyHostToDevice,
stream)); + } + } + } + + /** + * @brief Access the attribute value using patch and local index in the + * patch. This is meant to be used by XXAttribute not directly by the user + * @param patch_id patch to be accessed + * @param local_id the local id in the patch + * @param attr the attribute id + * @return const reference to the attribute + */ + __host__ __device__ __forceinline__ T& operator()(const uint32_t patch_id, + const uint16_t local_id, + const uint32_t attr) const + { + assert(patch_id < m_num_patches); + assert(attr < m_num_attributes); + + const uint32_t pitch_x = (m_layout == AoS) ? m_num_attributes : 1; +#ifdef __CUDA_ARCH__ + const uint32_t pitch_y = + (m_layout == AoS) ? 1 : m_d_element_per_patch[patch_id]; + return m_d_attr[patch_id][local_id * pitch_x + attr * pitch_y]; +#else + const uint32_t pitch_y = + (m_layout == AoS) ? 1 : m_h_element_per_patch[patch_id]; + return m_h_attr[patch_id][local_id * pitch_x + attr * pitch_y]; +#endif + } + + /** + * @brief Access the attribute value using patch and local index in the + * patch. This is meant to be used by XXAttribute not directly by the user + * @param patch_id patch to be accessed + * @param local_id the local id in the patch + * @param attr the attribute id + * @return non-const reference to the attribute + */ + __host__ __device__ __forceinline__ T& operator()(const uint32_t patch_id, + const uint16_t local_id, + const uint32_t attr) + { + assert(patch_id < m_num_patches); + assert(attr < m_num_attributes); + + const uint32_t pitch_x = (m_layout == AoS) ? m_num_attributes : 1; +#ifdef __CUDA_ARCH__ + const uint32_t pitch_y = + (m_layout == AoS) ? 1 : m_d_element_per_patch[patch_id]; + return m_d_attr[patch_id][local_id * pitch_x + attr * pitch_y]; +#else + const uint32_t pitch_y = + (m_layout == AoS) ? 
1 : m_h_element_per_patch[patch_id];
        return m_h_attr[patch_id][local_id * pitch_x + attr * pitch_y];
#endif
    }

    /**
     * @brief Check if the attribute is empty (i.e., attached to zero patches)
     */
    __host__ __device__ __forceinline__ bool is_empty() const
    {
        return m_num_patches == 0;
    }


   private:
    /**
     * @brief allocate internal memory on the requested location(s)
     * @param element_per_patch number of mesh elements owned by each patch
     * (host array of size m_num_patches)
     * @param location HOST, DEVICE, or both (bitwise OR)
     */
    void allocate(const uint16_t* element_per_patch, locationT location)
    {

        if (m_num_patches != 0) {

            if ((location & HOST) == HOST) {
                release(HOST);
                m_h_element_per_patch = static_cast<uint16_t*>(
                    malloc(sizeof(uint16_t) * m_num_patches));

                m_h_attr = static_cast<T**>(malloc(sizeof(T*) * m_num_patches));

                std::memcpy(m_h_element_per_patch,
                            element_per_patch,
                            sizeof(uint16_t) * m_num_patches);

                for (uint32_t p = 0; p < m_num_patches; ++p) {
                    m_h_attr[p] = static_cast<T*>(malloc(
                        sizeof(T) * element_per_patch[p] * m_num_attributes));
                }

                m_allocated = m_allocated | HOST;
            }

            if ((location & DEVICE) == DEVICE) {
                release(DEVICE);

                // BUGFIX: when location == LOCATION_ALL, the HOST branch
                // above has just malloc'ed m_h_element_per_patch and
                // release(DEVICE) does not free it, so the unconditional
                // malloc here used to leak that buffer. Free it first
                // (free(nullptr) is a no-op, so this is safe when only
                // DEVICE is requested)
                free(m_h_element_per_patch);
                m_h_element_per_patch = static_cast<uint16_t*>(
                    malloc(sizeof(uint16_t) * m_num_patches));

                std::memcpy(m_h_element_per_patch,
                            element_per_patch,
                            sizeof(uint16_t) * m_num_patches);

                CUDA_ERROR(cudaMalloc((void**)&(m_d_element_per_patch),
                                      sizeof(uint16_t) * m_num_patches));


                CUDA_ERROR(cudaMalloc((void**)&(m_d_attr),
                                      sizeof(T*) * m_num_patches));
                m_h_ptr_on_device =
                    static_cast<T**>(malloc(sizeof(T*) * m_num_patches));

                CUDA_ERROR(cudaMemcpy(m_d_element_per_patch,
                                      element_per_patch,
                                      sizeof(uint16_t) * m_num_patches,
                                      cudaMemcpyHostToDevice));

                for (uint32_t p = 0; p < m_num_patches; ++p) {
                    CUDA_ERROR(cudaMalloc((void**)&(m_h_ptr_on_device[p]),
                                          sizeof(T) * m_h_element_per_patch[p] *
                                              m_num_attributes));
                }
                // publish the per-patch device pointers to the device-side
                // array so kernels can index m_d_attr[patch]
                CUDA_ERROR(cudaMemcpy(m_d_attr,
                                      m_h_ptr_on_device,
                                      sizeof(T*) * m_num_patches,
                                      cudaMemcpyHostToDevice));
                m_allocated = m_allocated | DEVICE;
            }
        }
    }


    char*     m_name;                 // attribute name (owned, malloc'ed)
    uint32_t  m_num_attributes;       // number of attributes per mesh element
    locationT m_allocated;            // bitmask of where memory currently lives
    T**       m_h_attr;               // per-patch host buffers
    T**
m_h_ptr_on_device; + T** m_d_attr; + uint32_t m_num_patches; + uint16_t* m_d_element_per_patch; + uint16_t* m_h_element_per_patch; + layoutT m_layout; + + constexpr static uint32_t m_block_size = 256; +}; + +/** + * @brief Attributes for faces + * @tparam T the attribute type + */ +template +class FaceAttribute : public Attribute +{ + public: + /** + * @brief Default constructor + */ + FaceAttribute() = default; + + /** + * @brief Main constructor to be used by RXMeshStatic not directly by the + * user + * @param name of the attribute + * @param face_per_patch number of faces owned per patch + * @param num_attributes number of attribute per face + * @param location where the attribute to be allocated + * @param layout memory layout in case of num_attributes>1 + */ + FaceAttribute(const char* name, + const std::vector& face_per_patch, + const uint32_t num_attributes, + locationT location, + const layoutT layout) + : Attribute(name) + { + this->init(face_per_patch, num_attributes, location, layout); + } + + /** + * @brief Accessing face attribute using FaceHandle + * @param f_handle input face handle + * @param attr the attribute id + * @return const reference to the attribute + */ + __host__ __device__ __forceinline__ T& operator()( + const FaceHandle f_handle, + const uint32_t attr = 0) const + { + auto pl = f_handle.unpack(); + return Attribute::operator()(pl.first, pl.second, attr); + } + + + /** + * @brief Accessing face attribute using FaceHandle + * @param f_handle input face handle + * @param attr the attribute id + * @return non-const reference to the attribute + */ + __host__ __device__ __forceinline__ T& operator()(const FaceHandle f_handle, + const uint32_t attr = 0) + { + auto pl = f_handle.unpack(); + return Attribute::operator()(pl.first, pl.second, attr); + } +}; + + +/** + * @brief Attributes for edges + * @tparam T the attribute type + */ +template +class EdgeAttribute : public Attribute +{ + public: + /** + * @brief Default constructor + */ + 
EdgeAttribute() = default; + + /** + * @brief Main constructor to be used by RXMeshStatic not directly by the + * user + * @param name of the attribute + * @param edge_per_patch number of edges owned per patch + * @param num_attributes number of attribute per edge + * @param location where the attribute to be allocated + * @param layout memory layout in case of num_attributes>1 + */ + EdgeAttribute(const char* name, + const std::vector& edge_per_patch, + const uint32_t num_attributes, + locationT location, + const layoutT layout) + : Attribute(name) + { + this->init(edge_per_patch, num_attributes, location, layout); + } + + /** + * @brief Accessing edge attribute using EdgeHandle + * @param e_handle input edge handle + * @param attr the attribute id + * @return const reference to the attribute + */ + __host__ __device__ __forceinline__ T& operator()( + const EdgeHandle e_handle, + const uint32_t attr = 0) const + { + auto pl = e_handle.unpack(); + return Attribute::operator()(pl.first, pl.second, attr); + } + + /** + * @brief Accessing edge attribute using EdgeHandle + * @param e_handle input edge handle + * @param attr the attribute id + * @return non-const reference to the attribute + */ + __host__ __device__ __forceinline__ T& operator()(const EdgeHandle e_handle, + const uint32_t attr = 0) + { + auto pl = e_handle.unpack(); + return Attribute::operator()(pl.first, pl.second, attr); + } +}; + + +/** + * @brief Attributes for vertices + * @tparam T the attribute type + */ +template +class VertexAttribute : public Attribute +{ + public: + /** + * @brief Default constructor + */ + VertexAttribute() = default; + + /** + * @brief Main constructor to be used by RXMeshStatic not directly by the + * user + * @param name of the attribute + * @param vertex_per_patch number of vertices owned per patch + * @param num_attributes number of attribute per vertex + * @param location where the attribute to be allocated + * @param layout memory layout in case of num_attributes > 1 
*/
    VertexAttribute(const char*                  name,
                    const std::vector<uint16_t>& vertex_per_patch,
                    const uint32_t               num_attributes,
                    locationT                    location,
                    const layoutT                layout)
        : Attribute<T>(name)
    {
        this->init(vertex_per_patch, num_attributes, location, layout);
    }


    /**
     * @brief Accessing vertex attribute using VertexHandle
     * @param v_handle input vertex handle
     * @param attr the attribute id
     * @return const reference to the attribute
     */
    __host__ __device__ __forceinline__ T& operator()(
        const VertexHandle v_handle,
        const uint32_t     attr = 0) const
    {
        auto pl = v_handle.unpack();
        return Attribute<T>::operator()(pl.first, pl.second, attr);
    }

    /**
     * @brief Accessing vertex attribute using VertexHandle
     * @param v_handle input vertex handle
     * @param attr the attribute id
     * @return non-const reference to the attribute
     */
    __host__ __device__ __forceinline__ T& operator()(
        const VertexHandle v_handle,
        const uint32_t     attr = 0)
    {
        auto pl = v_handle.unpack();
        return Attribute<T>::operator()(pl.first, pl.second, attr);
    }
};

/**
 * @brief Attribute container used to manage a collection of attributes by
 * RXMeshStatic
 */
class AttributeContainer
{
   public:
    /**
     * @brief Default constructor
     */
    AttributeContainer() = default;

    /**
     * @brief Destructor which releases all attributes managed by this
     * container
     */
    virtual ~AttributeContainer()
    {
        while (!m_attr_container.empty()) {
            m_attr_container.back()->release();
            m_attr_container.pop_back();
        }
    }

    /**
     * @brief Number of attributes managed by this container
     */
    size_t size()
    {
        return m_attr_container.size();
    }

    /**
     * @brief get a list of names of the attributes managed by this container
     * @return
     */
    std::vector<std::string> get_attribute_names() const
    {
        std::vector<std::string> names;
        for (size_t i = 0; i < m_attr_container.size(); ++i) {
            names.push_back(m_attr_container[i]->get_name());
        }
        return names;
    }

    /**
     * @brief add a new attribute to be managed
by this container + * @tparam AttrT attribute type + * @param name unique name given to the attribute + * @param element_per_patch number of mesh element owned by each patch + * @param num_attributes number of attributes per mesh element + * @param location where the attributes will be allocated + * @param layout memory layout in case of num_attributes > 1 + * @return a shared pointer to the attribute + */ + template + std::shared_ptr add(const char* name, + std::vector& element_per_patch, + uint32_t num_attributes, + locationT location, + layoutT layout) + { + if (does_exist(name)) { + RXMESH_WARN( + "AttributeContainer::add() adding an attribute with " + "name {} already exists!", + std::string(name)); + } + + auto new_attr = std::make_shared( + name, element_per_patch, num_attributes, location, layout); + m_attr_container.push_back( + std::dynamic_pointer_cast(new_attr)); + + return new_attr; + } + + /** + * @brief Check if an attribute exists + * @param name of the attribute + */ + bool does_exist(const char* name) + { + for (size_t i = 0; i < m_attr_container.size(); ++i) { + if (!strcmp(m_attr_container[i]->get_name(), name)) { + return true; + } + } + return false; + } + + /** + * @brief remove an attribute and release its memory + * @param name of the attribute + */ + void remove(const char* name) + { + for (auto it = m_attr_container.begin(); it != m_attr_container.end(); + ++it) { + + if (!strcmp((*it)->get_name(), name)) { + (*it)->release(LOCATION_ALL); + m_attr_container.erase(it); + break; + } + } + } + + private: + std::vector> m_attr_container; +}; + +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/context.h b/include/rxmesh/context.h new file mode 100644 index 00000000..d93a2d72 --- /dev/null +++ b/include/rxmesh/context.h @@ -0,0 +1,111 @@ +#pragma once + +#include +#include "rxmesh/patch_info.h" +#include "rxmesh/util/macros.h" + +namespace rxmesh { + +/** + * @brief context for the mesh parameters and pointers. 
Everything is allocated + * on and managed by RXMesh. This class is meant to be a vehicle to copy various + * parameters to the device kernels. + */ +class Context +{ + public: + /** + * @brief Default constructor + */ + Context() + : m_num_edges(0), + m_num_faces(0), + m_num_vertices(0), + m_num_patches(0), + m_patches_info(nullptr) + { + } + + /** + * @brief initialize various members + * @param num_edges total number of edges in the mesh + * @param num_faces total number of faces in the mesh + * @param num_vertices total number of vertices in the mesh + * @param num_patches number of patches + * @param patches pointer to PatchInfo that contains different info about + * the patches + */ + void init(const uint32_t num_edges, + const uint32_t num_faces, + const uint32_t num_vertices, + const uint32_t num_patches, + PatchInfo* patches) + { + + m_num_edges = num_edges; + m_num_faces = num_faces; + m_num_vertices = num_vertices; + m_num_patches = num_patches; + m_patches_info = patches; + } + + /** + * @brief Total number of edges in mesh + */ + __device__ __forceinline__ uint32_t get_num_edges() const + { + return m_num_edges; + } + + /** + * @brief Total number of faces in mesh + */ + __device__ __forceinline__ uint32_t get_num_faces() const + { + return m_num_faces; + } + + /** + * @brief Total number of vertices in mesh + */ + __device__ __forceinline__ uint32_t get_num_vertices() const + { + return m_num_vertices; + } + + /** + * @brief Total number of patches in mesh + */ + __device__ __forceinline__ uint32_t get_num_patches() const + { + return m_num_patches; + } + + /** + * @brief A pointer to device PatchInfo used to store various information + * about the patches + */ + __device__ __forceinline__ PatchInfo* get_patches_info() const + { + return m_patches_info; + } + + /** + * @brief Unpack an edge to its edge ID and direction + * @param edge_dir The input packed edge as stored in PatchInfo and + * internally in RXMesh + * @param edge The unpacked edge ID + * 
@param dir The unpacked edge direction + */ + static __device__ __host__ __forceinline__ void + unpack_edge_dir(const uint16_t edge_dir, uint16_t& edge, flag_t& dir) + { + dir = (edge_dir & 1) != 0; + edge = edge_dir >> 1; + } + + private: + uint32_t m_num_edges, m_num_faces, m_num_vertices, m_num_patches; + PatchInfo* m_patches_info; +}; +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/handle.h b/include/rxmesh/handle.h new file mode 100644 index 00000000..19f0b7de --- /dev/null +++ b/include/rxmesh/handle.h @@ -0,0 +1,284 @@ +#pragma once +#include +#include +#include "rxmesh/local.h" +#include "rxmesh/patch_info.h" +#include "rxmesh/util/macros.h" + +namespace rxmesh { + +namespace detail { +/** + * @brief Return unique index of the local mesh element composed by the + * patch id and the local index + * + * @param local_id the local within-patch mesh element id + * @param patch_id the patch owning the mesh element + * @return + */ +uint64_t __device__ __host__ __forceinline__ unique_id(const uint16_t local_id, + const uint32_t patch_id) +{ + uint64_t ret = patch_id; + ret = (ret << 32); + ret |= local_id; + return ret; +} + +/** + * @brief unpack a 64 uint to its high and low 32 bits. The low 32 bit are + * casted to 16 bit. This is used to convert the unique id to its local id (16 + * low bit) and patch id (high 32 bit) + * @param uid unique id + * @return a std::pair storing the patch id and local id + */ +std::pair __device__ __host__ __forceinline__ +unpack(uint64_t uid) +{ + uint16_t local_id = uid & ((1 << 16) - 1); + uint32_t patch_id = uid >> 32; + return std::make_pair(patch_id, local_id); +} + +} // namespace detail + +/** + * @brief vertex identifier. It is a unique handle for each vertex equipped with + * operator==. 
It can be used to access mesh (vertex) attributes + */ +struct VertexHandle +{ + using LocalT = LocalVertexT; + + /** + * @brief Default constructor + */ + __device__ __host__ VertexHandle() : m_handle(INVALID64) + { + } + + /** + * @brief Constructor meant to be used internally by RXMesh and + * query_dispatcher + * @param patch_id the patch where the vertex belongs + * @param vertex_local_id the vertex local index within the patch + */ + __device__ __host__ VertexHandle(uint32_t patch_id, + LocalVertexT vertex_local_id) + : m_handle(detail::unique_id(vertex_local_id.id, patch_id)) + { + } + + /** + * @brief Operator == + */ + bool __device__ __host__ __inline__ operator==( + const VertexHandle& rhs) const + { + return m_handle == rhs.m_handle; + } + + /** + * @brief Operator != + */ + bool __device__ __host__ __inline__ operator!=( + const VertexHandle& rhs) const + { + return !(*this == rhs); + } + + /** + * @brief Check if the vertex is valid i.e., has been initialized by RXMesh + */ + bool __device__ __host__ __inline__ is_valid() const + { + return m_handle != INVALID64; + } + + /** + * @brief The unique identifier that represents the vertex + */ + uint64_t __device__ __host__ __inline__ unique_id() const + { + return m_handle; + } + + /** + * @brief Unpack the handle to its patch id and vertex local index within + * the patch + */ + std::pair __device__ __host__ __inline__ unpack() const + { + return detail::unpack(m_handle); + } + + private: + uint64_t m_handle; +}; + +/** + * @brief print vertex unique_id to ostream + */ +inline std::ostream& operator<<(std::ostream& os, VertexHandle v_handle) +{ + return (os << 'v' << v_handle.unique_id()); +} + +/** + * @brief edge identifier. It is a unique handle for each edge equipped with + * operator==. 
It can be used to access mesh (edge) attributes + */ +struct EdgeHandle +{ + using LocalT = LocalEdgeT; + + /** + * @brief Default constructor + */ + __device__ __host__ EdgeHandle() : m_handle(INVALID64) + { + } + + /** + * @brief Constructor meant to be used internally by RXMesh and + * query_dispatcher + * @param patch_id the patch where the edge belongs + * @param edge_local_id the edge local index within the patch + */ + __device__ __host__ EdgeHandle(uint32_t patch_id, LocalEdgeT edge_local_id) + : m_handle(detail::unique_id(edge_local_id.id, patch_id)) + { + } + + /** + * @brief Operator == + */ + bool __device__ __host__ __inline__ operator==(const EdgeHandle& rhs) const + { + return m_handle == rhs.m_handle; + } + + + /** + * @brief Operator != + */ + bool __device__ __host__ __inline__ operator!=(const EdgeHandle& rhs) const + { + return !(*this == rhs); + } + + /** + * @brief Check if the edge is valid i.e., has been initialized by RXMesh + */ + bool __device__ __host__ __inline__ is_valid() const + { + return m_handle != INVALID64; + } + + /** + * @brief The unique identifier that represents the edge + */ + uint64_t __device__ __host__ __inline__ unique_id() const + { + return m_handle; + } + + /** + * @brief Unpack the handle to its patch id and edge local index within + * the patch + */ + std::pair __device__ __host__ __inline__ unpack() const + { + return detail::unpack(m_handle); + } + + private: + uint64_t m_handle; +}; +/** + * @brief print edge unique_id to ostream + */ +inline std::ostream& operator<<(std::ostream& os, EdgeHandle e_handle) +{ + return (os << 'e' << e_handle.unique_id()); +} + +/** + * @brief face identifier. It is a unique handle for each face equipped with + * operator==. 
It can be used to access mesh (face) attributes
 */
struct FaceHandle
{
    using LocalT = LocalFaceT;

    /**
     * @brief Default constructor
     */
    __device__ __host__ FaceHandle() : m_handle(INVALID64)
    {
    }

    /**
     * @brief Constructor meant to be used internally by RXMesh and
     * query_dispatcher
     * @param patch_id the patch where the face belongs
     * @param face_local_id the face local index within the patch
     */
    __device__ __host__ FaceHandle(uint32_t patch_id, LocalFaceT face_local_id)
        : m_handle(detail::unique_id(face_local_id.id, patch_id))
    {
    }

    /**
     * @brief Operator ==
     */
    bool __device__ __host__ __inline__ operator==(const FaceHandle& rhs) const
    {
        return m_handle == rhs.m_handle;
    }

    /**
     * @brief Operator !=
     */
    bool __device__ __host__ __inline__ operator!=(const FaceHandle& rhs) const
    {
        return !(*this == rhs);
    }

    /**
     * @brief Check if the face is valid i.e., has been initialized by RXMesh
     */
    bool __device__ __host__ __inline__ is_valid() const
    {
        return m_handle != INVALID64;
    }

    /**
     * @brief The unique identifier that represents the face
     */
    uint64_t __device__ __host__ __inline__ unique_id() const
    {
        return m_handle;
    }

    /**
     * @brief Unpack the handle to its patch id and face local index within
     * the patch
     */
    std::pair<uint32_t, uint16_t> __device__ __host__ __inline__ unpack() const
    {
        return detail::unpack(m_handle);
    }

   private:
    uint64_t m_handle;
};

/**
 * @brief print face unique_id to ostream
 */
inline std::ostream& operator<<(std::ostream& os, FaceHandle f_handle)
{
    return (os << 'f' << f_handle.unique_id());
}
}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/iterator.cuh b/include/rxmesh/iterator.cuh
new file mode 100644
index 00000000..5eda0450
--- /dev/null
+++ b/include/rxmesh/iterator.cuh
@@ -0,0 +1,142 @@
#pragma once
#include <stdint.h>
#include "rxmesh/handle.h"

namespace rxmesh {

template <typename HandleT>
struct Iterator
{
    using LocalT =
typename HandleT::LocalT; + + __device__ Iterator(const uint16_t local_id, + const LocalT* patch_output, + const uint16_t* patch_offset, + const uint32_t offset_size, + const uint32_t patch_id, + const uint32_t num_owned, + const uint32_t* not_owned_patch, + const uint16_t* not_owned_local_id, + int shift = 0) + : m_patch_output(patch_output), + m_patch_offset(patch_offset), + m_patch_id(patch_id), + m_num_owned(num_owned), + m_not_owned_patch(not_owned_patch), + m_not_owned_local_id(not_owned_local_id), + m_shift(shift) + { + set(local_id, offset_size); + } + + Iterator(const Iterator& orig) = default; + + + __device__ uint16_t size() const + { + return m_end - m_begin; + } + + __device__ HandleT operator[](const uint16_t i) const + { + assert(m_patch_output); + assert(i + m_begin < m_end); + uint16_t lid = (m_patch_output[m_begin + i].id) >> m_shift; + if (lid < m_num_owned) { + return {m_patch_id, lid}; + } else { + lid -= m_num_owned; + return {m_not_owned_patch[lid], m_not_owned_local_id[lid]}; + } + } + + __device__ HandleT operator*() const + { + assert(m_patch_output); + return ((*this)[m_current]); + } + + __device__ HandleT back() const + { + return ((*this)[size() - 1]); + } + + __device__ HandleT front() const + { + return ((*this)[0]); + } + + __device__ Iterator& operator++() + { + // pre + m_current = (m_current + 1) % size(); + return *this; + } + __device__ Iterator operator++(int) + { + // post + Iterator pre(*this); + m_current = (m_current + 1) % size(); + return pre; + } + + __device__ Iterator& operator--() + { + // pre + m_current = (m_current == 0) ? size() - 1 : m_current - 1; + return *this; + } + + __device__ Iterator operator--(int) + { + // post + Iterator pre(*this); + m_current = (m_current == 0) ? 
size() - 1 : m_current - 1; + return pre; + } + + __device__ bool operator==(const Iterator& rhs) const + { + return rhs.m_local_id == m_local_id && rhs.m_patch_id == m_patch_id && + rhs.m_current == m_current; + } + + __device__ bool operator!=(const Iterator& rhs) const + { + return !(*this == rhs); + } + + + private: + const LocalT* m_patch_output; + const uint16_t* m_patch_offset; + const uint32_t m_patch_id; + const uint32_t* m_not_owned_patch; + const uint16_t* m_not_owned_local_id; + uint16_t m_num_owned; + uint16_t m_local_id; + uint16_t m_begin; + uint16_t m_end; + uint16_t m_current; + int m_shift; + + __device__ void set(const uint16_t local_id, const uint32_t offset_size) + { + m_current = 0; + m_local_id = local_id; + if (offset_size == 0) { + m_begin = m_patch_offset[m_local_id]; + m_end = m_patch_offset[m_local_id + 1]; + } else { + m_begin = m_local_id * offset_size; + m_end = (m_local_id + 1) * offset_size; + } + assert(m_end > m_begin); + } +}; + +using VertexIterator = Iterator; +using EdgeIterator = Iterator; +using FaceIterator = Iterator; + +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/kernels/attribute.cuh b/include/rxmesh/kernels/attribute.cuh new file mode 100644 index 00000000..6a4b3e90 --- /dev/null +++ b/include/rxmesh/kernels/attribute.cuh @@ -0,0 +1,93 @@ +#pragma once +#include +#include "rxmesh/util/macros.h" + + +namespace rxmesh { + +template +class Attribute; + +namespace detail { + +template +__device__ __forceinline__ void cub_block_sum(const T thread_val, + T* d_block_output) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T block_sum = BlockReduce(temp_storage).Sum(thread_val); + if (threadIdx.x == 0) { + d_block_output[blockIdx.x] = block_sum; + } +} + +template +__launch_bounds__(blockSize) __global__ + void norm2_kernel(const Attribute X, + const uint16_t* d_element_per_patch, + const uint32_t num_patches, + const uint32_t 
num_attributes, + T* d_block_output) +{ + uint32_t p_id = blockIdx.x; + if (p_id < num_patches) { + const uint16_t element_per_patch = d_element_per_patch[p_id]; + T thread_val = 0; + for (uint16_t i = threadIdx.x; i < element_per_patch; i += blockSize) { + for (uint32_t j = 0; j < num_attributes; ++j) { + const T val = X(p_id, i, j); + thread_val += val * val; + } + } + + cub_block_sum(thread_val, d_block_output); + } +} + + +template +__launch_bounds__(blockSize) __global__ + void dot_kernel(const Attribute X, + const Attribute Y, + const uint16_t* d_element_per_patch, + const uint32_t num_patches, + const uint32_t num_attributes, + T* d_block_output) +{ + assert(X.get_num_attributes() == Y.get_num_attributes()); + + uint32_t p_id = blockIdx.x; + if (p_id < num_patches) { + const uint16_t element_per_patch = d_element_per_patch[p_id]; + T thread_val = 0; + for (uint16_t i = threadIdx.x; i < element_per_patch; i += blockSize) { + for (uint32_t j = 0; j < num_attributes; ++j) { + thread_val += X(p_id, i, j) * Y(p_id, i, j); + } + } + + cub_block_sum(thread_val, d_block_output); + } +} + +template +__global__ void memset_attribute(const Attribute attr, + const T value, + const uint16_t* d_element_per_patch, + const uint32_t num_patches, + const uint32_t num_attributes) +{ + uint32_t p_id = blockIdx.x; + if (p_id < num_patches) { + const uint16_t element_per_patch = d_element_per_patch[p_id]; + for (uint16_t i = threadIdx.x; i < element_per_patch; i += blockDim.x) { + for (uint32_t j = 0; j < num_attributes; ++j) { + attr(p_id, i, j) = value; + } + } + } +} + +} // namespace detail +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/kernels/collective.cuh b/include/rxmesh/kernels/collective.cuh index 1b152e93..43baec8c 100644 --- a/include/rxmesh/kernels/collective.cuh +++ b/include/rxmesh/kernels/collective.cuh @@ -4,9 +4,9 @@ #include #include "rxmesh/util/macros.h" -namespace RXMESH { +namespace rxmesh { /** - * cub_block_exclusive_sum() 
+ * @brief Compute block-wide exclusive sum using CUB */ template __device__ __forceinline__ void cub_block_exclusive_sum(T* data, @@ -94,4 +94,4 @@ __device__ __forceinline__ void cub_block_exclusive_sum(T* data, }*/ } -} // namespace RXMESH \ No newline at end of file +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/kernels/debug.cuh b/include/rxmesh/kernels/debug.cuh index 6224702d..2cc4754a 100644 --- a/include/rxmesh/kernels/debug.cuh +++ b/include/rxmesh/kernels/debug.cuh @@ -2,7 +2,7 @@ #include #include "cuda_runtime.h" -namespace RXMESH { +namespace rxmesh { /** * print_arr_uint() @@ -11,7 +11,7 @@ template __device__ void print_arr_uint(char msg[], uint32_t size, T* arr, - uint32_t block_id = 0, + uint32_t block_id = 0, uint32_t thread_id = 0) { if (blockIdx.x == block_id && threadIdx.x == thread_id) { @@ -59,4 +59,4 @@ __device__ __forceinline__ unsigned total_smem_size() asm volatile("mov.u32 %0, %total_smem_size;" : "=r"(ret)); return ret; } -} // namespace RXMESH \ No newline at end of file +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/kernels/for_each.cuh b/include/rxmesh/kernels/for_each.cuh new file mode 100644 index 00000000..3606eac7 --- /dev/null +++ b/include/rxmesh/kernels/for_each.cuh @@ -0,0 +1,54 @@ +#pragma once +#include "rxmesh/patch_info.h" + +namespace rxmesh { +namespace detail { +template +__global__ void for_each_vertex(const uint32_t num_patches, + const PatchInfo* patch_info, + LambdaT apply) +{ + const uint32_t p_id = blockIdx.x; + if (p_id < num_patches) { + const uint16_t num_v = patch_info[p_id].num_owned_vertices; + for (uint16_t v = threadIdx.x; v < num_v; v += blockDim.x) { + const VertexHandle v_handle(p_id, v); + apply(v_handle); + } + } +} + + +template +__global__ void for_each_edge(const uint32_t num_patches, + const PatchInfo* patch_info, + LambdaT apply) +{ + const uint32_t p_id = blockIdx.x; + if (p_id < num_patches) { + const uint16_t num_e = 
patch_info[p_id].num_owned_edges; + for (uint16_t e = threadIdx.x; e < num_e; e += blockDim.x) { + const EdgeHandle e_handle(p_id, e); + apply(e_handle); + } + } +} + + +template +__global__ void for_each_face(const uint32_t num_patches, + const PatchInfo* patch_info, + LambdaT apply) +{ + const uint32_t p_id = blockIdx.x; + if (p_id < num_patches) { + const uint16_t num_f = patch_info[p_id].num_owned_faces; + for (uint16_t f = threadIdx.x; f < num_f; f += blockDim.x) { + const FaceHandle f_handle(p_id, f); + apply(f_handle); + } + } +} + +} // namespace detail +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/kernels/get_arch.cuh b/include/rxmesh/kernels/get_arch.cuh index 769ab88f..9dd323f8 100644 --- a/include/rxmesh/kernels/get_arch.cuh +++ b/include/rxmesh/kernels/get_arch.cuh @@ -2,7 +2,7 @@ #include #include "rxmesh/util/macros.h" -namespace RXMESH { +namespace rxmesh { __global__ static void get_cude_arch_k(int* d_arch) { @@ -24,4 +24,4 @@ inline int cuda_arch() cudaFree(d_arch); return h_arch; } -} // namespace RXMESH \ No newline at end of file +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/kernels/loader.cuh b/include/rxmesh/kernels/loader.cuh new file mode 100644 index 00000000..9f4fb62c --- /dev/null +++ b/include/rxmesh/kernels/loader.cuh @@ -0,0 +1,285 @@ +#pragma once + +#include +#include +#include "rxmesh/context.h" +#include "rxmesh/local.h" +#include "rxmesh/types.h" + +namespace rxmesh { + +template +__device__ __forceinline__ void load_uint16(const uint16_t* in, + const uint16_t size, + uint16_t* out) +{ + const uint32_t size32 = size / 2; + const uint32_t reminder = size % 2; + const uint32_t* in32 = reinterpret_cast(in); + uint32_t* out32 = reinterpret_cast(out); + + for (uint32_t i = threadIdx.x; i < size32; i += blockThreads) { + uint32_t a = in32[i]; + out32[i] = a; + } + + if (reminder != 0) { + if (threadIdx.x == 0) { + out[size - 1] = in[size - 1]; + } + } +} + + +/** + * 
@brief load the patch FE + * @param patch_info input patch info + * @param patch_faces output FE + * @return + */ +template +__device__ __forceinline__ void load_patch_FE(const PatchInfo& patch_info, + LocalEdgeT* fe) +{ + load_uint16(reinterpret_cast(patch_info.fe), + patch_info.num_faces * 3, + reinterpret_cast(fe)); +} + +/** + * @brief load the patch EV + * @param patch_info input patch info + * @param ev output EV + * @return + */ +template +__device__ __forceinline__ void load_patch_EV(const PatchInfo& patch_info, + LocalVertexT* ev) +{ + const uint32_t num_edges = patch_info.num_edges; + const uint32_t* input_ev32 = + reinterpret_cast(patch_info.ev); + uint32_t* output_ev32 = reinterpret_cast(ev); +#pragma unroll 2 + for (uint32_t i = threadIdx.x; i < num_edges; i += blockThreads) { + uint32_t a = input_ev32[i]; + output_ev32[i] = a; + } +} + +/** + * @brief load the patch topology i.e., EV and FE + * @param patch_info input patch info + * @param load_ev input indicates if we should load EV + * @param load_fe input indicates if we should load FE + * @param s_ev where EV will be loaded + * @param s_fe where FE will be loaded + * @return + */ +template +__device__ __forceinline__ void load_mesh(const PatchInfo& patch_info, + const bool load_ev, + const bool load_fe, + LocalVertexT*& s_ev, + LocalEdgeT*& s_fe) +{ + + if (load_ev) { + load_patch_EV(patch_info, s_ev); + } + // load patch faces + if (load_fe) { + if (load_ev) { + // if we loaded the edges, then we need to move where + // s_fe is pointing at to avoid overwrite + s_fe = + reinterpret_cast(&s_ev[patch_info.num_edges * 2]); + } + load_patch_FE(patch_info, s_fe); + } +} + +template +__device__ __forceinline__ void load_not_owned_local_id( + const uint16_t num_not_owned, + uint16_t* output_not_owned_local_id, + const uint16_t* input_not_owned_local_id) +{ + load_uint16( + input_not_owned_local_id, num_not_owned, output_not_owned_local_id); +} + +template +__device__ __forceinline__ void 
load_not_owned_patch( + const uint16_t num_not_owned, + uint32_t* output_not_owned_patch, + const uint32_t* input_not_owned_patch) + { + for (uint32_t i = threadIdx.x; i < num_not_owned; i += blockThreads) { + output_not_owned_patch[i] = input_not_owned_patch[i]; + } + } + +/** + * @brief Load local id and patch of the not-owned vertices, edges, or faces + * based on query op. + * @param patch_info input patch info + * @param not_owned_local_id output local id + * @param not_owned_patch output patch id + * @param num_not_owned number of not-owned mesh elements + */ +template +__device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info, + uint16_t*& not_owned_local_id, + uint32_t*& not_owned_patch, + uint16_t& num_owned) +{ + uint32_t num_not_owned = 0; + switch (op) { + case Op::VV: { + num_owned = patch_info.num_owned_vertices; + num_not_owned = patch_info.num_vertices - num_owned; + + // should be 4*patch_info.num_edges but VV (offset and values) are + // stored as uint16_t and not_owned_patch is uint32_t* so we need to + // shift the pointer only by half this amount + not_owned_patch = not_owned_patch + 2 * patch_info.num_edges; + not_owned_local_id = + reinterpret_cast(not_owned_patch + num_not_owned); + load_not_owned_patch( + num_not_owned, not_owned_patch, patch_info.not_owned_patch_v); + load_not_owned_local_id( + num_not_owned, + not_owned_local_id, + reinterpret_cast(patch_info.not_owned_id_v)); + break; + } + case Op::VE: { + num_owned = patch_info.num_owned_edges; + num_not_owned = patch_info.num_edges - num_owned; + + // should be 4*patch_info.num_edges but VE (offset and values) are + // stored as uint16_t and not_owned_patch is uint32_t* so we need to + // shift the pointer only by half this amount + not_owned_patch = not_owned_patch + 2 * patch_info.num_edges; + not_owned_local_id = + reinterpret_cast(not_owned_patch + num_not_owned); + load_not_owned_patch( + num_not_owned, not_owned_patch, patch_info.not_owned_patch_e); + 
load_not_owned_local_id( + num_not_owned, + not_owned_local_id, + reinterpret_cast(patch_info.not_owned_id_e)); + break; + } + case Op::VF: { + num_owned = patch_info.num_owned_faces; + num_not_owned = patch_info.num_faces - num_owned; + + uint32_t shift = DIVIDE_UP( + 3 * patch_info.num_faces + std::max(3 * patch_info.num_faces, + 2 * patch_info.num_edges), + 2); + not_owned_patch = not_owned_patch + shift; + not_owned_local_id = + reinterpret_cast(not_owned_patch + num_not_owned); + load_not_owned_patch( + num_not_owned, not_owned_patch, patch_info.not_owned_patch_f); + load_not_owned_local_id( + num_not_owned, + not_owned_local_id, + reinterpret_cast(patch_info.not_owned_id_f)); + break; + } + case Op::FV: { + num_owned = patch_info.num_owned_vertices; + num_not_owned = patch_info.num_vertices - num_owned; + + assert(2 * patch_info.num_edges >= (1 + 2) * num_not_owned); + not_owned_local_id = + reinterpret_cast(not_owned_patch + num_not_owned); + load_not_owned_patch( + num_not_owned, not_owned_patch, patch_info.not_owned_patch_v); + load_not_owned_local_id( + num_not_owned, + not_owned_local_id, + reinterpret_cast(patch_info.not_owned_id_v)); + break; + } + case Op::FE: { + num_owned = patch_info.num_owned_edges; + num_not_owned = patch_info.num_edges - num_owned; + + // should be 3*patch_info.num_faces but FE is stored as uint16_t and + // not_owned_patch is uint32_t* so we need to shift the pointer only + // by half this amount + not_owned_patch = + not_owned_patch + DIVIDE_UP(3 * patch_info.num_faces, 2); + not_owned_local_id = + reinterpret_cast(not_owned_patch + num_not_owned); + load_not_owned_patch( + num_not_owned, not_owned_patch, patch_info.not_owned_patch_e); + load_not_owned_local_id( + num_not_owned, + not_owned_local_id, + reinterpret_cast(patch_info.not_owned_id_e)); + break; + } + case Op::FF: { + num_owned = patch_info.num_owned_faces; + num_not_owned = patch_info.num_faces - num_owned; + + not_owned_local_id = + 
reinterpret_cast(not_owned_patch + num_not_owned); + load_not_owned_patch( + num_not_owned, not_owned_patch, patch_info.not_owned_patch_f); + load_not_owned_local_id( + num_not_owned, + not_owned_local_id, + reinterpret_cast(patch_info.not_owned_id_f)); + break; + } + case Op::EV: { + num_owned = patch_info.num_owned_vertices; + num_not_owned = patch_info.num_vertices - num_owned; + + // should be 2*patch_info.num_edges but EV is stored as uint16_t and + // not_owned_patch is uint32_t* so we need to shift the pointer only + // by num_edges + not_owned_patch = not_owned_patch + patch_info.num_edges; + not_owned_local_id = + reinterpret_cast(not_owned_patch + num_not_owned); + load_not_owned_patch( + num_not_owned, not_owned_patch, patch_info.not_owned_patch_v); + load_not_owned_local_id( + num_not_owned, + not_owned_local_id, + reinterpret_cast(patch_info.not_owned_id_v)); + break; + } + case Op::EF: { + num_owned = patch_info.num_owned_faces; + num_not_owned = patch_info.num_faces - num_owned; + + // should be 6*patch_info.num_faces but EF (offset and values) are + // stored as uint16_t and not_owned_patch is uint32_t* so we need to + // shift the pointer only by half this amount + not_owned_patch = not_owned_patch + 3 * patch_info.num_faces; + not_owned_local_id = + reinterpret_cast(not_owned_patch + num_not_owned); + load_not_owned_patch( + num_not_owned, not_owned_patch, patch_info.not_owned_patch_f); + load_not_owned_local_id( + num_not_owned, + not_owned_local_id, + reinterpret_cast(patch_info.not_owned_id_f)); + break; + } + default: { + assert(1 != 1); + break; + } + } +} + +} // namespace rxmesh diff --git a/include/rxmesh/kernels/prototype.cuh b/include/rxmesh/kernels/prototype.cuh deleted file mode 100644 index 73ab7c24..00000000 --- a/include/rxmesh/kernels/prototype.cuh +++ /dev/null @@ -1,63 +0,0 @@ -#pragma once -#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh" -namespace RXMESH { -namespace detail { - -/** - * query_prototype() represents the 
minimal user function for op query. - * This function is only used in order to calculate the static shared memory and - * registers used - */ -template -__launch_bounds__(blockThreads) __global__ - static void query_prototype(const RXMeshContext context, - const bool oriented = false) -{ - static_assert(op != Op::EE, "Op::EE is not supported!"); - - auto user_lambda = [&](uint32_t id, RXMeshIterator& iter) { - printf("\n iter.size() = %u", iter.size()); - for (uint32_t i = 0; i < iter.size(); ++i) { - printf("\n iter[%u] = %u", i, iter[i]); - } - }; - - query_block_dispatcher(context, user_lambda, oriented); -} - -/** - * higher_query_prototype() represents the minimal user function for higeher - * queries. Higher we assume that all query of similar type. This function is - * only used in order to calculate the static shared memory and registers used/ - */ -template -__launch_bounds__(blockThreads) __global__ - static void higher_query_prototype(const RXMeshContext context, - const bool oriented = false) -{ - static_assert(op != Op::EE, "Op::EE is not supported!"); - - uint32_t thread_element; - auto first_ring = [&](uint32_t id, RXMeshIterator& iter) { - thread_element = id; - printf("\n iter.size() = %u", iter.size()); - for (uint32_t i = 0; i < iter.size(); ++i) { - printf("\n iter[%u] = %u", i, iter[i]); - } - }; - - query_block_dispatcher(context, first_ring, oriented); - - auto n_ring = [&](uint32_t id, RXMeshIterator& iter) { - printf("\n iter.size() = %u", iter.size()); - for (uint32_t i = 0; i < iter.size(); ++i) { - printf("\n iter[%u] = %u", i, iter[i]); - } - }; - - query_block_dispatcher(context, thread_element, n_ring, - oriented); -} - -} // namespace detail -} // namespace RXMESH \ No newline at end of file diff --git a/include/rxmesh/kernels/query_dispatcher.cuh b/include/rxmesh/kernels/query_dispatcher.cuh new file mode 100644 index 00000000..ae3c1387 --- /dev/null +++ b/include/rxmesh/kernels/query_dispatcher.cuh @@ -0,0 +1,415 @@ +#pragma once 
+#include +#include +#include + +#include "rxmesh/context.h" +#include "rxmesh/handle.h" +#include "rxmesh/iterator.cuh" +#include "rxmesh/kernels/collective.cuh" +#include "rxmesh/kernels/debug.cuh" +#include "rxmesh/kernels/loader.cuh" +#include "rxmesh/kernels/rxmesh_queries.cuh" +#include "rxmesh/types.h" +#include "rxmesh/util/meta.h" + +namespace rxmesh { + +namespace detail { + +/** + * query_block_dispatcher() + */ +template +__device__ __inline__ void query_block_dispatcher(const PatchInfo& patch_info, + activeSetT compute_active_set, + const bool oriented, + uint32_t& num_src_in_patch, + uint16_t*& s_output_offset, + uint16_t*& s_output_value, + uint16_t& num_owned, + uint32_t*& not_owned_patch, + uint16_t*& not_owned_local_id) +{ + static_assert(op != Op::EE, "Op::EE is not supported!"); + + constexpr bool load_fe = (op == Op::VF || op == Op::EE || op == Op::EF || + op == Op::FV || op == Op::FE || op == Op::FF); + constexpr bool loead_ev = (op == Op::VV || op == Op::VE || op == Op::VF || + op == Op::EV || op == Op::FV); + static_assert(loead_ev || load_fe, + "At least faces or edges needs to be loaded"); + + // Check if any of the mesh elements are in the active set + // input mapping does not need to be stored in shared memory since it will + // be read coalesced, we can rely on L1 cache here + num_src_in_patch = 0; + if constexpr (op == Op::VV || op == Op::VE || op == Op::VF) { + num_src_in_patch = patch_info.num_owned_vertices; + } + if constexpr (op == Op::EV || op == Op::EF) { + num_src_in_patch = patch_info.num_owned_edges; + } + if constexpr (op == Op::FV || op == Op::FE || op == Op::FF) { + num_src_in_patch = patch_info.num_owned_faces; + } + + bool is_active = false; + uint16_t local_id = threadIdx.x; + while (local_id < num_src_in_patch) { + is_active = + is_active || compute_active_set({patch_info.patch_id, local_id}); + local_id += blockThreads; + } + + if (__syncthreads_or(is_active) == 0) { + // reset num_src_in_patch to zero to indicate 
that this block/patch has + // no work to do + num_src_in_patch = 0; + return; + } + + // 2) Load the patch info + extern __shared__ uint16_t shrd_mem[]; + LocalVertexT* s_ev = reinterpret_cast(shrd_mem); + LocalEdgeT* s_fe = reinterpret_cast(shrd_mem); + load_mesh(patch_info, loead_ev, load_fe, s_ev, s_fe); + + not_owned_patch = reinterpret_cast(shrd_mem); + not_owned_local_id = shrd_mem; + num_owned = 0; + // 3)Perform the query operation + if (oriented) { + assert(op == Op::VV); + if constexpr (op == Op::VV) { + __syncthreads(); + v_v_oreinted(patch_info, + s_output_offset, + s_output_value, + reinterpret_cast(s_ev)); + } + } else { + if constexpr (!(op == Op::VV || op == Op::FV || op == Op::FF)) { + load_not_owned( + patch_info, not_owned_local_id, not_owned_patch, num_owned); + } + __syncthreads(); + query(s_output_offset, + s_output_value, + reinterpret_cast(s_ev), + reinterpret_cast(s_fe), + patch_info.num_vertices, + patch_info.num_edges, + patch_info.num_faces); + } + + // load not-owned local and patch id + if constexpr (op == Op::VV || op == Op::FV || op == Op::FF) { + // need to sync since we will overwrite things that are used in + // query + __syncthreads(); + load_not_owned( + patch_info, not_owned_local_id, not_owned_patch, num_owned); + } + + + __syncthreads(); +} + + +/** + * query_block_dispatcher() + */ +template +__device__ __inline__ void query_block_dispatcher(const Context& context, + const uint32_t patch_id, + computeT compute_op, + activeSetT compute_active_set, + const bool oriented = false) +{ + // Extract the type of the input parameters of the compute lambda function. 
+ // The first parameter should be Vertex/Edge/FaceHandle and second parameter + // should be RXMeshVertex/Edge/FaceIterator + + using ComputeTraits = detail::FunctionTraits; + using ComputeHandleT = typename ComputeTraits::template arg<0>::type; + using ComputeIteratorT = typename ComputeTraits::template arg<1>::type; + using LocalT = typename ComputeIteratorT::LocalT; + + // Extract the type of the single input parameter of the active_set lambda + // function. It should be Vertex/Edge/FaceHandle and it should match the + // first parameter of the compute lambda function + using ActiveSetTraits = detail::FunctionTraits; + using ActiveSetHandleT = typename ActiveSetTraits::template arg<0>::type; + static_assert( + std::is_same_v, + "First argument of compute_op lambda function should match the first " + "argument of active_set lambda function "); + + static_assert(op != Op::EE, "Op::EE is not supported!"); + + + assert(patch_id < context.get_num_patches()); + + uint32_t num_src_in_patch = 0; + uint16_t* s_output_offset(nullptr); + uint16_t* s_output_value(nullptr); + uint16_t num_owned; + uint32_t* not_owned_patch(nullptr); + uint16_t* not_owned_local_id(nullptr); + + detail::template query_block_dispatcher( + context.get_patches_info()[patch_id], + compute_active_set, + oriented, + num_src_in_patch, + s_output_offset, + s_output_value, + num_owned, + not_owned_patch, + not_owned_local_id); + + // Call compute on the output in shared memory by looping over all + // source elements in this patch. + + uint16_t local_id = threadIdx.x; + while (local_id < num_src_in_patch) { + + assert(s_output_value); + + if (compute_active_set({patch_id, local_id})) { + constexpr uint32_t fixed_offset = + ((op == Op::EV) ? 2 : + (op == Op::FV || op == Op::FE) ? 
3 : + 0); + + + ComputeHandleT handle(patch_id, local_id); + ComputeIteratorT iter(local_id, + reinterpret_cast(s_output_value), + s_output_offset, + fixed_offset, + patch_id, + num_owned, + not_owned_patch, + not_owned_local_id, + int(op == Op::FE)); + + compute_op(handle, iter); + } + + local_id += blockThreads; + } +} + +} // namespace detail +/** + * @brief The main query function to be called by the whole block. In this + * function, threads will be assigned to mesh elements which will be accessible + * through the input computation lambda function (compute_op). This function + * also provides a predicate to specify the active set i.e., the set on which + * the query operations should be done. This is mainly used to skip query on + * a subset of the input mesh elements which may lead to better performance + * @tparam Op the type of query operation + * @tparam blockThreads the number of CUDA threads in the block + * @tparam computeT the type of compute lambda function (inferred) + * @tparam activeSetT the type of active set lambda function (inferred) + * @param context which store various parameters needed for the query + * operation. The context can be obtained from RXMeshStatic + * @param compute_op the computation lambda function that will be executed by + * each thread in the block. This lambda function takes two input parameters: + * 1. Handle to the mesh element assigned to the thread. The handle type matches + * the source of the query (e.g., VertexHandle for VE query) 2. an iterator to + * the query output. The iterator type matches the type of the mesh element + * "iterated" on (e.g., EdgeIterator for VE query) + * @param compute_active_set a predicate used to specify the active set. This + * lambda function take a single parameter which is a handle of the type similar + * to the input of the query operation (e.g., VertexHandle for VE query) + * @param oriented specifies if the query are oriented. 
Currently only VV query + * is supported for oriented queries. FV, FE and EV is oriented by default + */ +template +__device__ __inline__ void query_block_dispatcher(const Context& context, + computeT compute_op, + activeSetT compute_active_set, + const bool oriented = false) +{ + if (blockIdx.x >= context.get_num_patches()) { + return; + } + + detail::query_block_dispatcher( + context, blockIdx.x, compute_op, compute_active_set, oriented); +} + + +/** + * @brief The main query function to be called by the whole block. In this + * function, threads will be assigned to mesh elements which will be accessible + * through the input computation lambda function (compute_op). + * @tparam Op the type of query operation + * @tparam blockThreads the number of CUDA threads in the block + * @tparam computeT the type of compute lambda function (inferred) + * @param context which store various parameters needed for the query + * operation. The context can be obtained from RXMeshStatic + * @param compute_op the computation lambda function that will be executed by + * each thread in the block. This lambda function takes two input parameters: + * 1. Handle to the mesh element assigned to the thread. The handle type matches + * the source of the query (e.g., VertexHandle for VE query) 2. an iterator to + * the query output. The iterator type matches the type of the mesh element + * "iterated" on (e.g., EdgeIterator for VE query) + * @param oriented specifies if the query are oriented. Currently only VV query + * is supported for oriented queries. FV, FE and EV is oriented by default + */ +template +__device__ __inline__ void query_block_dispatcher(const Context& context, + computeT compute_op, + const bool oriented = false) +{ + // Extract the type of the first input parameters of the compute lambda + // function. 
It should be Vertex/Edge/FaceHandle + using ComputeTraits = detail::FunctionTraits; + using ComputeHandleT = typename ComputeTraits::template arg<0>::type; + + query_block_dispatcher( + context, compute_op, [](ComputeHandleT) { return true; }, oriented); +} + + +/** + * @brief This function is used to perform a query operation on a specific mesh + * element. This is only needed for higher query (e.g., 2-ring query) where the + * first query is done using query_block_dispatcher in which each thread is + * assigned to a mesh element. Subsequent queries should be handled by this + * function. This function should be called by the whole CUDA block. + * @tparam Op the type of query operation + * @tparam blockThreads the number of CUDA threads in the block + * @tparam computeT the type of compute lambda function (inferred) + * @tparam HandleT the type of input handle (inferred) which should match the + * input of the query operations (e.g., VertexHandle for VE query) + * @param context which store various parameters needed for the query + * operation. The context can be obtained from RXMeshStatic + * @param src_id the input mesh element to the query. Inactive threads can + * simply pass HandleT() in which case they are skipped + * @param compute_op the computation lambda function that will be executed by + * the thread. This lambda function takes two input parameters: + * 1. HandleT which is the same as src_id 2. an iterator to the query output. + * The iterator type matches the type of the mesh element "iterated" on (e.g., + * EdgeIterator for VE query) + * @param oriented specifies if the query are oriented. Currently only VV query + * is supported for oriented queries. 
FV, FE and EV is oriented by default + */ +template +__device__ __inline__ void higher_query_block_dispatcher( + const Context& context, + const HandleT src_id, + computeT compute_op, + const bool oriented = false) +{ + using ComputeTraits = detail::FunctionTraits; + using ComputeIteratorT = typename ComputeTraits::template arg<1>::type; + + // The whole block should be calling this function. If one thread is not + // participating, its src_id should be INVALID32 + + auto compute_active_set = [](HandleT) { return true; }; + + // the source and local id of the source mesh element + std::pair pl = src_id.unpack(); + + // Here, we want to identify the set of unique patches for this thread + // block. We do this by first sorting the patches, compute discontinuity + // head flag, then threads with head flag =1 can add their patches to the + // shared memory buffer that will contain the unique patches + + __shared__ uint32_t s_block_patches[blockThreads]; + __shared__ uint32_t s_num_patches; + if (threadIdx.x == 0) { + s_num_patches = 0; + } + typedef cub::BlockRadixSort BlockRadixSort; + typedef cub::BlockDiscontinuity BlockDiscontinuity; + union TempStorage + { + typename BlockRadixSort::TempStorage sort_storage; + typename BlockDiscontinuity::TempStorage discont_storage; + }; + __shared__ TempStorage all_temp_storage; + uint32_t thread_data[1], thread_head_flags[1]; + thread_data[0] = pl.first; + thread_head_flags[0] = 0; + BlockRadixSort(all_temp_storage.sort_storage).Sort(thread_data); + BlockDiscontinuity(all_temp_storage.discont_storage) + .FlagHeads(thread_head_flags, thread_data, cub::Inequality()); + + if (thread_head_flags[0] == 1 && thread_data[0] != INVALID32) { + uint32_t id = ::atomicAdd(&s_num_patches, uint32_t(1)); + s_block_patches[id] = thread_data[0]; + } + + // We could eliminate the discontinuity operation and atomicAdd and instead + // use thrust::unique. 
However, this method causes illegal memory access + // and it looks like a bug in thrust + /*__syncthreads(); + // uniquify + uint32_t* new_end = thrust::unique(thrust::device, s_block_patches, + s_block_patches + blockThreads); + __syncthreads(); + + if (threadIdx.x == 0) { + s_num_patches = new_end - s_block_patches - 1; + }*/ + __syncthreads(); + + + for (uint32_t p = 0; p < s_num_patches; ++p) { + + uint32_t patch_id = s_block_patches[p]; + + assert(patch_id < context.get_num_patches()); + + uint32_t num_src_in_patch = 0; + uint16_t *s_output_offset(nullptr), *s_output_value(nullptr); + uint16_t num_owned = 0; + uint16_t* not_owned_local_id(nullptr); + uint32_t* not_owned_patch(nullptr); + + detail::template query_block_dispatcher( + context.get_patches_info()[patch_id], + compute_active_set, + oriented, + num_src_in_patch, + s_output_offset, + s_output_value, + num_owned, + not_owned_patch, + not_owned_local_id); + + + if (pl.first == patch_id) { + + constexpr uint32_t fixed_offset = + ((op == Op::EV) ? 2 : + (op == Op::FV || op == Op::FE) ? 
3 : + 0); + + ComputeIteratorT iter( + pl.second, + reinterpret_cast( + s_output_value), + s_output_offset, + fixed_offset, + patch_id, + num_owned, + not_owned_patch, + not_owned_local_id, + int(op == Op::FE)); + + compute_op(src_id, iter); + } + __syncthreads(); + } +} + + +} // namespace rxmesh diff --git a/include/rxmesh/kernels/rxmesh_attribute.cuh b/include/rxmesh/kernels/rxmesh_attribute.cuh deleted file mode 100644 index 4e6b319f..00000000 --- a/include/rxmesh/kernels/rxmesh_attribute.cuh +++ /dev/null @@ -1,87 +0,0 @@ -#pragma once -#include -#include "rxmesh/util/macros.h" -namespace RXMESH { - -template -class RXMeshAttribute; - -template -__global__ void rxmesh_attribute_axpy(const RXMeshAttribute X, - const T* alpha, - RXMeshAttribute Y, - const T* beta, - const uint32_t attribute_id = INVALID32) -{ - // Y = alpha*X + beta*Y - // if attribute is INVALID32, then the operation is applied to all - // attribute (one thread per mesh element on all attribute) - // otherwise, the operation is applied on only that attribute - - // alpha and beta should be of size attributes per element if attribute == - // INVALID32. 
Otherwise, they should point to a single variable - - assert(X.get_num_mesh_elements() == Y.get_num_mesh_elements()); - assert(X.get_num_attribute_per_element() == - Y.get_num_attribute_per_element()); - - uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x; - - if (idx < X.get_num_mesh_elements()) { - - if (attribute_id == INVALID32) { - for (uint32_t attr = 0; attr < X.get_num_attribute_per_element(); - ++attr) { - Y(idx, attr) = - alpha[attr] * X(idx, attr) + beta[attr] * Y(idx, attr); - } - } else { - Y(idx, attribute_id) = alpha[0] * X(idx, attribute_id) + - beta[0] * Y(idx, attribute_id); - } - } -} - - -template -__global__ void rxmesh_attribute_norm2(const RXMeshAttribute X, - const uint32_t attribute_id, - T* d_block_output) -{ - uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x; - T threa_val = 0; - if (idx < X.get_num_mesh_elements()) { - threa_val = X(idx, attribute_id); - } - threa_val *= threa_val; - - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T block_sum = BlockReduce(temp_storage).Sum(threa_val); - if (threadIdx.x == 0) { - d_block_output[blockIdx.x] = block_sum; - } -} - - -template -__global__ void rxmesh_attribute_dot(const RXMeshAttribute X, - const RXMeshAttribute Y, - const uint32_t attribute_id, - T* d_block_output) -{ - uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x; - T threa_val = 0; - if (idx < X.get_num_mesh_elements()) { - threa_val = X(idx, attribute_id) * Y(idx, attribute_id); - } - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T block_sum = BlockReduce(temp_storage).Sum(threa_val); - if (threadIdx.x == 0) { - d_block_output[blockIdx.x] = block_sum; - } -} -} // namespace RXMESH \ No newline at end of file diff --git a/include/rxmesh/kernels/rxmesh_iterator.cuh b/include/rxmesh/kernels/rxmesh_iterator.cuh deleted file mode 100644 index 486f7136..00000000 --- a/include/rxmesh/kernels/rxmesh_iterator.cuh +++ 
/dev/null @@ -1,128 +0,0 @@ -#pragma once -#include -namespace RXMESH { - -struct RXMeshIterator -{ - __device__ RXMeshIterator(const uint16_t local_id, - const uint16_t* patch_output, - const uint16_t* patch_offset, - const uint32_t* output_ltog_map, - const uint32_t offset_size, - const uint32_t num_src_in_patch, - int shift = 0) - : m_patch_output(patch_output), m_patch_offset(patch_offset), - m_output_ltog_map(output_ltog_map), - m_num_src_in_patch(num_src_in_patch), m_shift(shift) - { - set(local_id, offset_size); - } - - RXMeshIterator(const RXMeshIterator& orig) = default; - - __device__ uint16_t local_id() const - { - return m_local_id; - } - - __device__ uint16_t size() const - { - return m_end - m_begin; - } - - __device__ uint16_t neighbour_local_id(uint32_t i) const - { - return m_patch_output[m_begin + i]; - } - - __device__ uint32_t operator[](const uint32_t i) const - { - assert(m_patch_output); - assert(m_output_ltog_map); - assert(i + m_begin < m_end); - return m_output_ltog_map[((m_patch_output[m_begin + i]) >> m_shift)]; - } - - __device__ uint32_t operator*() const - { - assert(m_patch_output); - assert(m_output_ltog_map); - return ((*this)[m_current]); - } - - __device__ uint32_t back() const - { - return ((*this)[size() - 1]); - } - - __device__ uint32_t front() const - { - return ((*this)[0]); - } - - __device__ RXMeshIterator& operator++() - { - // pre - m_current = (m_current + 1) % size(); - return *this; - } - __device__ const RXMeshIterator operator++(int) - { - // post - RXMeshIterator pre(*this); - m_current = (m_current + 1) % size(); - return pre; - } - - __device__ RXMeshIterator& operator--() - { - // pre - m_current = (m_current == 0) ? size() - 1 : m_current - 1; - return *this; - } - - __device__ const RXMeshIterator operator--(int) - { - // post - RXMeshIterator pre(*this); - m_current = (m_current == 0) ? 
size() - 1 : m_current - 1; - return pre; - } - - __device__ bool operator==(const RXMeshIterator& rhs) const - { - return rhs.m_local_id == m_local_id && rhs.m_current == m_current; - } - - __device__ bool operator!=(const RXMeshIterator& rhs) const - { - return !(*this == rhs); - } - - - // private: - const uint16_t* m_patch_output; - const uint16_t* m_patch_offset; - const uint32_t* m_output_ltog_map; - uint16_t m_local_id; - uint16_t m_begin; - uint16_t m_end; - uint16_t m_current; - int m_shift; - uint32_t m_num_src_in_patch; - - __device__ void set(const uint16_t local_id, const uint32_t offset_size) - { - m_current = 0; - m_local_id = local_id; - if (offset_size == 0) { - m_begin = m_patch_offset[m_local_id]; - m_end = m_patch_offset[m_local_id + 1]; - } else { - m_begin = m_local_id * offset_size; - m_end = (m_local_id + 1) * offset_size; - } - assert(m_end > m_begin); - } -}; -} // namespace RXMESH \ No newline at end of file diff --git a/include/rxmesh/kernels/rxmesh_loader.cuh b/include/rxmesh/kernels/rxmesh_loader.cuh deleted file mode 100644 index a93e912c..00000000 --- a/include/rxmesh/kernels/rxmesh_loader.cuh +++ /dev/null @@ -1,171 +0,0 @@ -#pragma once - -#include -#include - -#include "rxmesh/rxmesh.h" -#include "rxmesh/rxmesh_context.h" - -namespace RXMESH { - -/** -* load_patch_ad_size() -*/ -__device__ __forceinline__ void load_patch_ad_size(const RXMeshContext& context, - const uint32_t p_id, - uint4& ad_size, - uint2& ad_size_ltog_v, - uint2& ad_size_ltog_e, - uint2& ad_size_ltog_f) -{ - - ad_size.x = context.get_ad_size()[p_id].x; - ad_size.y = context.get_ad_size()[p_id].y; - ad_size.z = context.get_ad_size()[p_id].z; - ad_size.w = context.get_ad_size()[p_id].w; - - ad_size_ltog_v = context.get_ad_size_ltog_v()[p_id]; - ad_size_ltog_e = context.get_ad_size_ltog_e()[p_id]; - ad_size_ltog_f = context.get_ad_size_ltog_f()[p_id]; - assert(ad_size.y % 2 == 0); - assert(ad_size.w % context.get_face_degree() == 0); - - /*if (threadIdx.x == 0) { - 
printf("\n blockIdx.x= %u, p_id = %u \n" - " edges_add= %u, edges_size= %u \n" - " faces_add= %u, faces_size= %u \n" - " s_ad_size_ltog_v.x= %u, s_ad_size_ltog_v.y= %u \n" - " s_ad_size_ltog_e.x= %u, s_ad_size_ltog_e.y= %u \n" - " s_ad_size_ltog_f.x= %u, s_ad_size_ltog_f.y= %u \n", - blockIdx.x, p_id, - s_ad_size.x, s_ad_size.y, s_ad_size.z, s_ad_size.w, - s_ad_size_ltog_v.x, s_ad_size_ltog_v.y, - s_ad_size_ltog_e.x, s_ad_size_ltog_e.y, - s_ad_size_ltog_f.x, s_ad_size_ltog_f.y); - }*/ -} - -/** - * load_patch_edges() - */ -__device__ __forceinline__ void load_patch_edges(const RXMeshContext& context, - uint16_t* patch_edges, - const uint4& ad_sz) -{ - - // whole block should be calling this - - // load edges - assert(ad_sz.y % 2 == 0); - uint32_t size32 = ad_sz.y / 2; - const uint32_t* edges_ptr32 = - (const uint32_t*)(context.get_patches_edges() + ad_sz.x); - uint32_t* patch_edges32 = (uint32_t*)(patch_edges); -#pragma unroll 2 - for (uint32_t i = threadIdx.x; i < size32; i += blockDim.x) { - uint32_t a = edges_ptr32[i]; - patch_edges32[i] = a; - } -} - -/** - * load_patch_faces() - */ -__device__ __forceinline__ void load_patch_faces(const RXMeshContext& context, - uint16_t* patch_faces, - const uint4& ad_sz) -{ - - // whole block should be calling this - - // load faces - assert(ad_sz.w % 3 == 0); - - uint32_t size32 = ad_sz.w / 2; - uint32_t reminder = ad_sz.w % 2; - const uint32_t* faces_ptr32 = - (const uint32_t*)(context.get_patches_faces() + ad_sz.z); - uint32_t* patch_faces32 = (uint32_t*)(patch_faces); - //#pragma unroll 3 - for (uint32_t i = threadIdx.x; i < size32; i += blockDim.x) { - uint32_t a = faces_ptr32[i]; - patch_faces32[i] = a; - } - - if (reminder != 0) { - if (threadIdx.x == 0) { - patch_faces[ad_sz.w - 1] = - context.get_patches_faces()[ad_sz.z + ad_sz.w - 1]; - } - } -} - -/** - * load_mapping() - */ -__device__ __forceinline__ void load_mapping(const RXMeshContext& context, - const ELEMENT ele, - const uint2& s_ad_size_ltog, - uint32_t* 
mapping, - const bool keep_patch_bit) -{ - // whole block should be calling this - for (uint32_t i = threadIdx.x, start = s_ad_size_ltog.x; - i < s_ad_size_ltog.y; i += blockDim.x) { - - switch (ele) { - case ELEMENT::VERTEX: - if (keep_patch_bit) { - mapping[i] = context.get_patches_ltog_v()[i + start]; - } else { - mapping[i] = (context.get_patches_ltog_v()[i + start] >> 1); - } - - break; - case ELEMENT::EDGE: - if (keep_patch_bit) { - mapping[i] = context.get_patches_ltog_e()[i + start]; - } else { - mapping[i] = (context.get_patches_ltog_e()[i + start] >> 1); - } - break; - case ELEMENT::FACE: - if (keep_patch_bit) { - mapping[i] = context.get_patches_ltog_f()[i + start]; - } else { - mapping[i] = (context.get_patches_ltog_f()[i + start] >> 1); - } - break; - default: - assert(1 != 1); - break; - } - } -} - -/** - * load_mesh() - */ -__device__ __forceinline__ void load_mesh(const RXMeshContext& context, - const bool load_edges, - const bool load_faces, - uint16_t*& s_patch_edges, - uint16_t*& s_patch_faces, - const uint4& ad_size) -{ - - if (load_edges) { - load_patch_edges(context, s_patch_edges, ad_size); - } - // load patch faces - if (load_faces) { - if (load_edges) { - // if we loaded the edges, then we need to move where - // s_patch_faces is pointing at to avoid overwrite - s_patch_faces = &s_patch_edges[ad_size.y]; - } - load_patch_faces(context, s_patch_faces, ad_size); - } -} - -} // namespace RXMESH diff --git a/include/rxmesh/kernels/rxmesh_queries.cuh b/include/rxmesh/kernels/rxmesh_queries.cuh index cf6c8d18..b77459c8 100644 --- a/include/rxmesh/kernels/rxmesh_queries.cuh +++ b/include/rxmesh/kernels/rxmesh_queries.cuh @@ -3,14 +3,13 @@ #include #include +#include "rxmesh/context.h" #include "rxmesh/kernels/collective.cuh" -#include "rxmesh/kernels/rxmesh_loader.cuh" +#include "rxmesh/kernels/loader.cuh" #include "rxmesh/kernels/util.cuh" -#include "rxmesh/rxmesh.h" -#include "rxmesh/rxmesh_context.h" +#include "rxmesh/types.h" -namespace RXMESH 
{ -//********************** Tools +namespace rxmesh { template @@ -33,7 +32,7 @@ __device__ __forceinline__ void block_mat_transpose(const uint32_t num_rows, // INVALID16; if (index < nnz) { thread_data[i] = mat[index] >> shift; - mat[index] = 0; + mat[index] = 0; } else { thread_data[i] = INVALID16; } @@ -63,7 +62,7 @@ __device__ __forceinline__ void block_mat_transpose(const uint32_t num_rows, __syncthreads(); for (uint32_t i = threadIdx.x; i < num_cols; i += blockThreads) { uint16_t val = uint16_t(mat_half[i]); - mat[i] = val; + mat[i] = val; } #else for (uint32_t i = 0; i < itemPerThread; ++i) { @@ -85,43 +84,43 @@ __device__ __forceinline__ void block_mat_transpose(const uint32_t num_rows, uint16_t item = thread_data[i]; if (item != INVALID16) { uint16_t offset = mat[item] + local_offset[i]; - uint16_t row = (itemPerThread * threadIdx.x + i) / rowOffset; - output[offset] = row; + uint16_t row = (itemPerThread * threadIdx.x + i) / rowOffset; + output[offset] = row; } else { break; } } } -//************************************************************************* template -__device__ __forceinline__ void v_v_oreinted(uint16_t*& s_offset_all_patches, - uint16_t*& s_output_all_patches, - uint16_t* s_patch_edges, - const RXMeshContext& context, - const uint4& ad_size, - const uint16_t num_vertices, - const uint16_t num_owned_vertices) +__device__ __forceinline__ void v_v_oreinted(const PatchInfo& patch_info, + uint16_t*& s_output_offset, + uint16_t*& s_output_value, + uint16_t* s_ev) { - const uint32_t num_faces = ad_size.w / 3; - const uint32_t num_edges = ad_size.y / 2; - s_offset_all_patches = &s_patch_edges[0]; - s_output_all_patches = - &s_patch_edges[num_vertices + 1 + (num_vertices + 1) % 2]; + const uint16_t num_edges = patch_info.num_edges; + const uint16_t num_faces = patch_info.num_faces; + const uint16_t num_vertices = patch_info.num_vertices; + const uint16_t num_owned_vertices = patch_info.num_owned_vertices; + + s_output_offset = &s_ev[0]; + 
s_output_value = &s_ev[num_vertices + 1 + (num_vertices + 1) % 2]; // start by loading the faces while also doing transposing EV (might // increase ILP) - uint16_t* s_patch_FE = &s_output_all_patches[2 * num_edges]; - uint16_t* s_patch_EF = &s_patch_FE[3 * num_faces + (3 * num_faces) % 2]; - load_patch_faces(context, s_patch_FE, ad_size); + uint16_t* s_fe = &s_output_value[2 * num_edges]; + uint16_t* s_ef = &s_fe[3 * num_faces + (3 * num_faces) % 2]; + LocalEdgeT* temp_fe = reinterpret_cast(s_fe); + load_patch_FE(patch_info, temp_fe); + for (uint32_t i = threadIdx.x; i < num_edges * 2; i += blockThreads) { - s_patch_EF[i] = INVALID16; + s_ef[i] = INVALID16; } block_mat_transpose<2u, blockThreads>( - num_edges, num_vertices, s_offset_all_patches, s_output_all_patches); + num_edges, num_vertices, s_output_offset, s_output_value); // block_mat_transpose<2u, blockThreads>( // num_faces, num_edges, s_patch_EF_offset, s_patch_EF_output); @@ -131,17 +130,17 @@ __device__ __forceinline__ void v_v_oreinted(uint16_t*& s_offset_all_patches, // that we are working on manifold so it is only two edges per face. We // also wanna keep FE for quick look up on a face's three edges. 
- // We need to sync here to make sure that s_patch_FE is loaded but there is + // We need to sync here to make sure that s_fe is loaded but there is // a sync in block_mat_transpose that takes care of this for (uint16_t e = threadIdx.x; e < 3 * num_faces; e += blockThreads) { - uint16_t edge = s_patch_FE[e] >> 1; + uint16_t edge = s_fe[e] >> 1; uint16_t face_id = e / 3; - auto ret = atomicCAS(s_patch_EF + 2 * edge, INVALID16, face_id); + auto ret = atomicCAS(s_ef + 2 * edge, INVALID16, face_id); if (ret != INVALID16) { - ret = atomicCAS(s_patch_EF + 2 * edge + 1, INVALID16, face_id); + ret = atomicCAS(s_ef + 2 * edge + 1, INVALID16, face_id); assert(ret == INVALID16); } } @@ -156,13 +155,13 @@ __device__ __forceinline__ void v_v_oreinted(uint16_t*& s_offset_all_patches, // if the vertex is not owned by this patch, then there is no reason // to orient its edges because no serious computation is done on it - uint16_t start = s_offset_all_patches[v]; - uint16_t end = s_offset_all_patches[v + 1]; + uint16_t start = s_output_offset[v]; + uint16_t end = s_output_offset[v + 1]; for (uint16_t e_id = start; e_id < end - 1; ++e_id) { - uint16_t e_0 = s_output_all_patches[e_id]; - uint16_t f0(s_patch_EF[2 * e_0]), f1(s_patch_EF[2 * e_0 + 1]); + uint16_t e_0 = s_output_value[e_id]; + uint16_t f0(s_ef[2 * e_0]), f1(s_ef[2 * e_0 + 1]); // we don't do it for boundary faces assert(f0 != INVALID16 && f1 != INVALID16 && f0 < num_faces && @@ -172,33 +171,33 @@ __device__ __forceinline__ void v_v_oreinted(uint16_t*& s_offset_all_patches, // candidate next edge (only one of them will win) uint16_t e_candid_0, e_candid_1; - if ((s_patch_FE[3 * f0 + 0] >> 1) == e_0) { - e_candid_0 = s_patch_FE[3 * f0 + 2] >> 1; + if ((s_fe[3 * f0 + 0] >> 1) == e_0) { + e_candid_0 = s_fe[3 * f0 + 2] >> 1; } - if ((s_patch_FE[3 * f0 + 1] >> 1) == e_0) { - e_candid_0 = s_patch_FE[3 * f0 + 0] >> 1; + if ((s_fe[3 * f0 + 1] >> 1) == e_0) { + e_candid_0 = s_fe[3 * f0 + 0] >> 1; } - if ((s_patch_FE[3 * f0 + 2] 
>> 1) == e_0) { - e_candid_0 = s_patch_FE[3 * f0 + 1] >> 1; + if ((s_fe[3 * f0 + 2] >> 1) == e_0) { + e_candid_0 = s_fe[3 * f0 + 1] >> 1; } - if ((s_patch_FE[3 * f1 + 0] >> 1) == e_0) { - e_candid_1 = s_patch_FE[3 * f1 + 2] >> 1; + if ((s_fe[3 * f1 + 0] >> 1) == e_0) { + e_candid_1 = s_fe[3 * f1 + 2] >> 1; } - if ((s_patch_FE[3 * f1 + 1] >> 1) == e_0) { - e_candid_1 = s_patch_FE[3 * f1 + 0] >> 1; + if ((s_fe[3 * f1 + 1] >> 1) == e_0) { + e_candid_1 = s_fe[3 * f1 + 0] >> 1; } - if ((s_patch_FE[3 * f1 + 2] >> 1) == e_0) { - e_candid_1 = s_patch_FE[3 * f1 + 1] >> 1; + if ((s_fe[3 * f1 + 2] >> 1) == e_0) { + e_candid_1 = s_fe[3 * f1 + 1] >> 1; } for (uint16_t vn = e_id + 1; vn < end; ++vn) { - uint16_t e_winning_candid = s_output_all_patches[vn]; + uint16_t e_winning_candid = s_output_value[vn]; if (e_candid_0 == e_winning_candid || e_candid_1 == e_winning_candid) { - uint16_t temp = s_output_all_patches[e_id + 1]; - s_output_all_patches[e_id + 1] = e_winning_candid; - s_output_all_patches[vn] = temp; + uint16_t temp = s_output_value[e_id + 1]; + s_output_value[e_id + 1] = e_winning_candid; + s_output_value[vn] = temp; break; } } @@ -207,31 +206,33 @@ __device__ __forceinline__ void v_v_oreinted(uint16_t*& s_offset_all_patches, __syncthreads(); - // Load EV into s_patch_EF since both has the same size (2*#E) - s_patch_edges = &s_patch_EF[0]; - load_patch_edges(context, s_patch_edges, ad_size); + // Load EV into s_ef since both has the same size (2*#E) + s_ev = s_ef; + LocalVertexT* temp_ev = reinterpret_cast(s_ef); + load_patch_EV(patch_info, temp_ev); + __syncthreads(); for (uint32_t v = threadIdx.x; v < num_vertices; v += blockThreads) { - uint32_t start = s_offset_all_patches[v]; - uint32_t end = s_offset_all_patches[v + 1]; + uint32_t start = s_output_offset[v]; + uint32_t end = s_output_offset[v + 1]; for (uint32_t e = start; e < end; ++e) { - uint16_t edge = s_output_all_patches[e]; - uint16_t v0 = s_patch_edges[2 * edge]; - uint16_t v1 = s_patch_edges[2 * edge + 
1]; + uint16_t edge = s_output_value[e]; + uint16_t v0 = s_ev[2 * edge]; + uint16_t v1 = s_ev[2 * edge + 1]; assert(v0 == v || v1 == v); // d_output[e] = (v0 == v) ? v1 : v0; - s_output_all_patches[e] = (v0 == v) * v1 + (v1 == v) * v0; + s_output_value[e] = (v0 == v) * v1 + (v1 == v) * v0; } } } -//********************** 1) vertex incident edges + template -__device__ __forceinline__ void v_e(const uint32_t num_vertices, - const uint32_t num_edges, +__device__ __forceinline__ void v_e(const uint16_t num_vertices, + const uint16_t num_edges, uint16_t* d_edges, uint16_t* d_output) { @@ -243,15 +244,13 @@ __device__ __forceinline__ void v_e(const uint32_t num_vertices, // num_edges*2 (zero is stored and the end can be inferred). Thus, // d_output should be allocated to size = num_edges*2 - block_mat_transpose<2u, blockThreads>(num_edges, num_vertices, d_edges, - d_output); + block_mat_transpose<2u, blockThreads>( + num_edges, num_vertices, d_edges, d_output); } -//************************************************************************* -//********************** 0) Vertex adjacent vertices template -__device__ __forceinline__ void v_v(const uint32_t num_vertices, - const uint32_t num_edges, +__device__ __forceinline__ void v_v(const uint16_t num_vertices, + const uint16_t num_edges, uint16_t* d_edges, uint16_t* d_output) { @@ -280,12 +279,12 @@ __device__ __forceinline__ void v_v(const uint32_t num_vertices, for (uint32_t v = threadIdx.x; v < num_vertices; v += blockThreads) { uint32_t start = d_edges[v]; - uint32_t end = d_edges[v + 1]; + uint32_t end = d_edges[v + 1]; for (uint32_t e = start; e < end; ++e) { uint16_t edge = d_output[e]; - uint16_t v0 = s_edges_duplicate[2 * edge]; - uint16_t v1 = s_edges_duplicate[2 * edge + 1]; + uint16_t v0 = s_edges_duplicate[2 * edge]; + uint16_t v1 = s_edges_duplicate[2 * edge + 1]; assert(v0 == v || v1 == v); // d_output[e] = (v0 == v) ? 
v1 : v0; @@ -293,13 +292,10 @@ __device__ __forceinline__ void v_v(const uint32_t num_vertices, } } } -//************************************************************************* - -//********************** 3) Face incident vertices -__device__ __forceinline__ void f_v(const uint32_t num_edges, +__device__ __forceinline__ void f_v(const uint16_t num_edges, const uint16_t* d_edges, - const uint32_t num_faces, + const uint16_t num_faces, uint16_t* d_faces) { // M_FV = M_FE \dot M_EV @@ -315,7 +311,7 @@ __device__ __forceinline__ void f_v(const uint32_t num_edges, for (uint32_t i = 0; i < 3; i++) { uint16_t e = d_faces[f_id + i]; flag_t e_dir(0); - RXMeshContext::unpack_edge_dir(e, e, e_dir); + Context::unpack_edge_dir(e, e, e_dir); // if the direction is flipped, we take the second vertex uint16_t e_id = (2 * e) + (1 * e_dir); assert(e_id < 2 * num_edges); @@ -326,14 +322,11 @@ __device__ __forceinline__ void f_v(const uint32_t num_edges, } } } -//************************************************************************* - -//********************** 2) Vertex incident faces template -__device__ __forceinline__ void v_f(const uint32_t num_faces, - const uint32_t num_edges, - const uint32_t num_vertices, +__device__ __forceinline__ void v_f(const uint16_t num_faces, + const uint16_t num_edges, + const uint16_t num_vertices, uint16_t* d_edges, uint16_t* d_faces) { @@ -350,15 +343,13 @@ __device__ __forceinline__ void v_f(const uint32_t num_faces, f_v(num_edges, d_edges, num_faces, d_faces); __syncthreads(); - block_mat_transpose<3u, blockThreads>(num_faces, num_vertices, d_faces, - d_edges); + block_mat_transpose<3u, blockThreads>( + num_faces, num_vertices, d_faces, d_edges); } -//************************************************************************* -//********************** 8) Edge incident faces template -__device__ __forceinline__ void e_f(const uint32_t num_edges, - const uint32_t num_faces, +__device__ __forceinline__ void e_f(const uint16_t num_edges, + const 
uint16_t num_faces, uint16_t* d_faces, uint16_t* d_output, int shift = 1) @@ -372,16 +363,13 @@ __device__ __forceinline__ void e_f(const uint32_t num_edges, // num_faces*3 (zero is stored and the end can be inferred). Thus, // d_output should be allocated to size = num_faces*3 - block_mat_transpose<3u, blockThreads>(num_faces, num_edges, d_faces, - d_output, shift); + block_mat_transpose<3u, blockThreads>( + num_faces, num_edges, d_faces, d_output, shift); } -//************************************************************************* - -//********************** 5) Face adjacent faces template -__device__ __forceinline__ void f_f(const uint32_t num_edges, - const uint32_t num_faces, +__device__ __forceinline__ void f_f(const uint16_t num_edges, + const uint16_t num_faces, uint16_t* s_FE, uint16_t* s_FF_offset, uint16_t* s_FF_output) @@ -395,9 +383,9 @@ __device__ __forceinline__ void f_f(const uint32_t num_edges, // losing FE for (uint16_t i = threadIdx.x; i < num_faces * 3; i += blockThreads) { flag_t dir(0); - uint16_t e = s_FE[i] >> 1; + uint16_t e = s_FE[i] >> 1; s_EF_offset[i] = e; - s_FE[i] = e; + s_FE[i] = e; } __syncthreads(); @@ -458,83 +446,74 @@ __device__ __forceinline__ void f_f(const uint32_t num_edges, } }*/ } -//************************************************************************* - -//********************** template -__device__ __forceinline__ void query(uint16_t*& s_offset_all_patches, - uint16_t*& s_output_all_patches, - uint16_t* s_patch_edges, - uint16_t* s_patch_faces, - const uint32_t num_vertices, - const uint32_t num_edges, - const uint32_t num_faces) +__device__ __forceinline__ void query(uint16_t*& s_output_offset, + uint16_t*& s_output_value, + uint16_t* s_ev, + uint16_t* s_fe, + const uint16_t num_vertices, + const uint16_t num_edges, + const uint16_t num_faces) { switch (op) { case Op::VV: { assert(num_vertices <= 2 * num_edges); - s_offset_all_patches = &s_patch_edges[0]; - s_output_all_patches = &s_patch_edges[num_vertices + 1]; 
- v_v(num_vertices, num_edges, s_patch_edges, - s_output_all_patches); + s_output_offset = &s_ev[0]; + s_output_value = &s_ev[num_vertices + 1]; + v_v(num_vertices, num_edges, s_ev, s_output_value); break; } case Op::VE: { assert(num_vertices <= 2 * num_edges); - s_offset_all_patches = &s_patch_edges[0]; - s_output_all_patches = &s_patch_edges[num_vertices + 1]; - v_e(num_vertices, num_edges, s_patch_edges, - s_output_all_patches); + s_output_offset = &s_ev[0]; + s_output_value = &s_ev[num_vertices + 1]; + v_e(num_vertices, num_edges, s_ev, s_output_value); break; } case Op::VF: { assert(num_vertices <= 2 * num_edges); - s_output_all_patches = &s_patch_edges[0]; - s_offset_all_patches = &s_patch_faces[0]; - v_f(num_faces, num_edges, num_vertices, s_patch_edges, - s_patch_faces); + s_output_offset = &s_fe[0]; + s_output_value = &s_ev[0]; + v_f(num_faces, num_edges, num_vertices, s_ev, s_fe); break; } case Op::EV: { - s_output_all_patches = s_patch_edges; + s_output_value = s_ev; break; } case Op::EF: { assert(num_edges <= 3 * num_faces); - s_offset_all_patches = &s_patch_faces[0]; - s_output_all_patches = &s_patch_faces[num_edges + 1]; - e_f(num_edges, num_faces, s_patch_faces, - s_output_all_patches); + s_output_offset = &s_fe[0]; + s_output_value = &s_fe[num_edges + 1]; + e_f(num_edges, num_faces, s_fe, s_output_value); break; } case Op::FV: { - s_output_all_patches = s_patch_faces; - f_v(num_edges, s_patch_edges, num_faces, s_patch_faces); + s_output_value = s_fe; + f_v(num_edges, s_ev, num_faces, s_fe); break; } case Op::FE: { - s_output_all_patches = s_patch_faces; + s_output_value = s_fe; break; } case Op::FF: { assert(num_edges <= 3 * num_faces); - s_offset_all_patches = - &s_patch_faces[3 * num_faces + 2 * 3 * num_faces]; - // ^^^^FE ^^^^^EF - s_output_all_patches = &s_offset_all_patches[num_faces + 1]; - f_f(num_edges, num_faces, s_patch_faces, - s_offset_all_patches, s_output_all_patches); + s_output_offset = &s_fe[3 * num_faces + 2 * 3 * num_faces]; + // 
^^^^FE ^^^^^EF + s_output_value = &s_output_offset[num_faces + 1]; + f_f( + num_edges, num_faces, s_fe, s_output_offset, s_output_value); break; } default: assert(1 != 1); break; - } // namespace RXMESH + } } -//************************************************************************* -} // namespace RXMESH +} // namespace rxmesh diff --git a/include/rxmesh/kernels/rxmesh_query_dispatcher.cuh b/include/rxmesh/kernels/rxmesh_query_dispatcher.cuh deleted file mode 100644 index d37e9325..00000000 --- a/include/rxmesh/kernels/rxmesh_query_dispatcher.cuh +++ /dev/null @@ -1,404 +0,0 @@ -#pragma once -#include -#include -#include - -#include "rxmesh/kernels/collective.cuh" -#include "rxmesh/kernels/rxmesh_iterator.cuh" -#include "rxmesh/kernels/rxmesh_loader.cuh" -#include "rxmesh/kernels/rxmesh_queries.cuh" -#include "rxmesh/rxmesh.h" -#include "rxmesh/rxmesh_context.h" -#include "rxmesh/rxmesh_util.h" - - -namespace RXMESH { - -namespace detail { - -/** - * query_block_dispatcher() - */ -template -__device__ __inline__ void query_block_dispatcher( - const RXMeshContext& context, - const uint32_t current_patch_id, - activeSetT compute_active_set, - const bool oriented, - const bool output_needs_mapping, - uint32_t& num_src_in_patch, - uint32_t*& input_mapping, - uint32_t*& s_output_mapping, - uint16_t*& s_offset_all_patches, - uint16_t*& s_output_all_patches) -{ - static_assert(op != Op::EE, "Op::EE is not supported!"); - assert(current_patch_id < context.get_num_patches()); - - - ELEMENT src_element, output_element; - io_elements(op, src_element, output_element); - - extern __shared__ uint16_t shrd_mem[]; - - - s_offset_all_patches = shrd_mem; - s_output_all_patches = shrd_mem; - uint16_t *s_patch_edges(shrd_mem), *s_patch_faces(shrd_mem); - - constexpr bool load_faces = (op == Op::VF || op == Op::EE || op == Op::EF || - op == Op::FV || op == Op::FE || op == Op::FF); - constexpr bool load_edges = (op == Op::VV || op == Op::VE || op == Op::VF || - op == Op::EV || op == 
Op::FV); - static_assert(load_edges || load_faces, - "At least faces or edges needs to be loaded"); - - constexpr bool is_fixed_offset = - (op == Op::EV || op == Op::FV || op == Op::FE); - - __syncthreads(); - - // 1) load the patch addressed and size - uint4 ad_size; - uint2 ad_size_ltog_v, ad_size_ltog_e, ad_size_ltog_f; - const uint2& output_ele_ad_size = - ((output_element == ELEMENT::EDGE) ? - ad_size_ltog_e : - ((output_element == ELEMENT::FACE) ? ad_size_ltog_f : - ad_size_ltog_v)); - const uint2& src_element_ad_size = - ((src_element == ELEMENT::EDGE) ? - ad_size_ltog_e : - ((src_element == ELEMENT::FACE) ? ad_size_ltog_f : - ad_size_ltog_v)); - load_patch_ad_size(context, current_patch_id, ad_size, ad_size_ltog_v, - ad_size_ltog_e, ad_size_ltog_f); - - // Check if any of the vertices are in the active set - // input mapping does not need to be stored in shared memory since it will - // be read coalesced, we can rely on L1 cache here - input_mapping = nullptr; - num_src_in_patch = 0; - switch (src_element) { - case RXMESH::ELEMENT::VERTEX: { - input_mapping = - context.get_patches_ltog_v() + src_element_ad_size.x; - num_src_in_patch = context.get_size_owned()[current_patch_id].z; - break; - } - case RXMESH::ELEMENT::EDGE: { - input_mapping = - context.get_patches_ltog_e() + src_element_ad_size.x; - num_src_in_patch = context.get_size_owned()[current_patch_id].y; - break; - } - case RXMESH::ELEMENT::FACE: { - input_mapping = - context.get_patches_ltog_f() + src_element_ad_size.x; - num_src_in_patch = context.get_size_owned()[current_patch_id].x; - break; - } - } - - - bool is_active = false; - uint16_t local_id = threadIdx.x; - while (local_id < num_src_in_patch) { - is_active = - local_id || compute_active_set(input_mapping[local_id] >> 1); - local_id += blockThreads; - } - - - if (__syncthreads_or(is_active) == 0) { - return; - } - - assert(ad_size.y == ad_size_ltog_e.y * 2); - assert(ad_size.w == ad_size_ltog_f.y * 3); - - - // 2) Load the patch info - 
load_mesh(context, load_edges, load_faces, s_patch_edges, s_patch_faces, - ad_size); - __syncthreads(); - - // 3)Perform the query operation - if (oriented) { - assert(op == Op::VV); - if constexpr (op == Op::VV) { - v_v_oreinted( - s_offset_all_patches, s_output_all_patches, s_patch_edges, - context, ad_size, ad_size_ltog_v.y, num_src_in_patch); - } - } else { - query(s_offset_all_patches, s_output_all_patches, - s_patch_edges, s_patch_faces, ad_size_ltog_v.y, - ad_size_ltog_e.y, ad_size_ltog_f.y); - } - - - // 4) load output mapping - s_output_mapping = nullptr; - if (output_needs_mapping) { - // Read comments in calc_shared_memory() to understand how we calculate - // s_output_mapping pointer location in shared memory such that it does - // not overwrite the results - - // We add ad_size.w % 2 for padding in case ad_size.w is not - // dividable by 2 in which case memory misalignment happens - if constexpr (op == Op::FE) { - s_output_mapping = - (uint32_t*)&shrd_mem[ad_size.w + (ad_size.w % 2)]; - } - if constexpr (op == Op::EV) { - s_output_mapping = (uint32_t*)&shrd_mem[ad_size.y]; - } - if constexpr (op == Op::FV) { - s_output_mapping = - (uint32_t*)&shrd_mem[ad_size.w + (ad_size.w % 2) + ad_size.y]; - } - if constexpr (op == Op::VE) { - s_output_mapping = (uint32_t*)&shrd_mem[2 * ad_size.y]; - } - if constexpr (op == Op::EF || op == Op::VF) { - s_output_mapping = (uint32_t*)&shrd_mem[2 * ad_size.w]; - } - if constexpr (op == Op::FF) { - // FF uses a lot of shared memory and some of it can be overridden - // but we need to wait for the query to be done. - __syncthreads(); - s_output_mapping = (uint32_t*)&shrd_mem[0]; - } - - if constexpr (op == Op::VV) { - // We use extra shared memory that is read only for VV which we can - // just use for loading ltog. 
The drawback is that we need to wait - // for the query to finish first before overwriting it with ltog - __syncthreads(); - uint16_t last_vv = ad_size_ltog_v.y + 1 + 2 * ad_size_ltog_e.y; - s_output_mapping = (uint32_t*)&shrd_mem[last_vv + last_vv % 2]; - } - - load_mapping(context, output_element, output_ele_ad_size, - s_output_mapping, false); - } - __syncthreads(); -} -} // namespace detail -/** - * query_block_dispatcher() - */ -template -__device__ __inline__ void query_block_dispatcher( - const RXMeshContext& context, - const uint32_t current_patch_id, - computeT compute_op, - activeSetT compute_active_set, - const bool oriented = false, - const bool output_needs_mapping = true) -{ - static_assert(op != Op::EE, "Op::EE is not supported!"); - assert(current_patch_id < context.get_num_patches()); - - uint32_t num_src_in_patch = 0; - uint32_t *input_mapping(nullptr), *s_output_mapping(nullptr); - uint16_t *s_offset_all_patches(nullptr), *s_output_all_patches(nullptr); - - detail::template query_block_dispatcher( - context, current_patch_id, compute_active_set, oriented, - output_needs_mapping, num_src_in_patch, input_mapping, s_output_mapping, - s_offset_all_patches, s_output_all_patches); - - assert(input_mapping); - assert(s_output_all_patches); - - // 5) Call compute on the output in shared memory by looping over all - // source elements in this patch. - - uint16_t local_id = threadIdx.x; - while (local_id < num_src_in_patch) { - - uint32_t global_id = input_mapping[local_id] >> 1; - - if (compute_active_set(global_id)) { - constexpr uint32_t fixed_offset = - ((op == Op::EV) ? 2 : - (op == Op::FV || op == Op::FE) ? 
3 : - 0); - RXMeshIterator iter(local_id, s_output_all_patches, - s_offset_all_patches, s_output_mapping, - fixed_offset, num_src_in_patch, - int(op == Op::FE)); - - compute_op(global_id, iter); - } - - local_id += blockThreads; - } -} - -/** - * query_block_dispatcher() - */ -template -__device__ __inline__ void query_block_dispatcher( - const RXMeshContext& context, - computeT compute_op, - activeSetT compute_active_set, - const bool oriented = false, - const bool output_needs_mapping = true) -{ - if (blockIdx.x >= context.get_num_patches()) { - return; - } - query_block_dispatcher(context, blockIdx.x, compute_op, - compute_active_set, oriented, - output_needs_mapping); -} - -/** - * query_block_dispatcher() - */ -template -__device__ __inline__ void query_block_dispatcher( - const RXMeshContext& context, - computeT compute_op, - const bool oriented = false, - const bool output_needs_mapping = true) -{ - if (blockIdx.x >= context.get_num_patches()) { - return; - } - query_block_dispatcher( - context, blockIdx.x, compute_op, [](uint32_t) { return true; }, - oriented, output_needs_mapping); -} - - -/** - * query_block_dispatcher() - */ -template -__device__ __inline__ void query_block_dispatcher(const RXMeshContext& context, - const uint32_t element_id, - computeT compute_op, - const bool oriented = false) -{ - // The whole block should be calling this function. 
If one thread is not - // participating, its element_id should be INVALID32 - - auto compute_active_set = [](uint32_t) { return true; }; - - uint32_t element_patch = INVALID32; - if (element_id != INVALID32) { - switch (op) { - case RXMESH::Op::VV: - case RXMESH::Op::VE: - case RXMESH::Op::VF: - element_patch = context.get_vertex_patch()[element_id]; - break; - case RXMESH::Op::FV: - case RXMESH::Op::FE: - case RXMESH::Op::FF: - element_patch = context.get_face_patch()[element_id]; - break; - case RXMESH::Op::EV: - case RXMESH::Op::EE: - case RXMESH::Op::EF: - element_patch = context.get_edge_patch()[element_id]; - break; - } - } - - // Here, we want to identify the set of unique patches for this thread - // block. We do this by first sorting the patches, compute discontinuity - // head flag, then threads with head flag =1 can add their patches to the - // shared memory buffer that will contain the unique patches - - __shared__ uint32_t s_block_patches[blockThreads]; - __shared__ uint32_t s_num_patches; - if (threadIdx.x == 0) { - s_num_patches = 0; - } - typedef cub::BlockRadixSort BlockRadixSort; - typedef cub::BlockDiscontinuity BlockDiscontinuity; - union TempStorage - { - typename BlockRadixSort::TempStorage sort_storage; - typename BlockDiscontinuity::TempStorage discont_storage; - }; - __shared__ TempStorage all_temp_storage; - uint32_t thread_data[1], thread_head_flags[1]; - thread_data[0] = element_patch; - thread_head_flags[0] = 0; - BlockRadixSort(all_temp_storage.sort_storage).Sort(thread_data); - BlockDiscontinuity(all_temp_storage.discont_storage) - .FlagHeads(thread_head_flags, thread_data, cub::Inequality()); - - if (thread_head_flags[0] == 1 && thread_data[0] != INVALID32) { - uint32_t id = ::atomicAdd(&s_num_patches, uint32_t(1)); - s_block_patches[id] = thread_data[0]; - } - - // We could eliminate the discontinuity operation and atomicAdd and instead - // use thrust::unique. 
However, this method causes illegal memory access - // and it looks like a bug in thrust - /*__syncthreads(); - // uniquify - uint32_t* new_end = thrust::unique(thrust::device, s_block_patches, - s_block_patches + blockThreads); - __syncthreads(); - - if (threadIdx.x == 0) { - s_num_patches = new_end - s_block_patches - 1; - }*/ - __syncthreads(); - - - for (uint32_t p = 0; p < s_num_patches; ++p) { - - uint32_t patch_id = s_block_patches[p]; - - assert(patch_id < context.get_num_patches()); - - uint32_t num_src_in_patch = 0; - uint32_t *input_mapping(nullptr), *s_output_mapping(nullptr); - uint16_t *s_offset_all_patches(nullptr), *s_output_all_patches(nullptr); - - detail::template query_block_dispatcher( - context, patch_id, compute_active_set, oriented, true, - num_src_in_patch, input_mapping, s_output_mapping, - s_offset_all_patches, s_output_all_patches); - - assert(input_mapping); - assert(s_output_all_patches); - - - if (element_patch == patch_id) { - - uint16_t local_id = INVALID16; - - for (uint16_t j = 0; j < num_src_in_patch; ++j) { - if (element_id == s_output_mapping[j]) { - local_id = j; - break; - } - } - - constexpr uint32_t fixed_offset = - ((op == Op::EV) ? 2 : - (op == Op::FV || op == Op::FE) ? 
3 : - 0); - - RXMeshIterator iter(local_id, s_output_all_patches, - s_offset_all_patches, s_output_mapping, - fixed_offset, num_src_in_patch, - int(op == Op::FE)); - - compute_op(element_id, iter); - } - } -} - -} // namespace RXMESH diff --git a/include/rxmesh/kernels/util.cuh b/include/rxmesh/kernels/util.cuh index 9ccd97ce..7c2c88ce 100644 --- a/include/rxmesh/kernels/util.cuh +++ b/include/rxmesh/kernels/util.cuh @@ -2,87 +2,77 @@ #include #include -namespace RXMESH { +namespace rxmesh { -/** - * memcpy() - */ template __global__ void memcpy(attrT* d_dest, const attrT* d_src, const uint32_t length) { const uint32_t stride = blockDim.x * gridDim.x; - uint32_t i = blockDim.x * blockIdx.x + threadIdx.x; + uint32_t i = blockDim.x * blockIdx.x + threadIdx.x; while (i < length) { d_dest[i] = d_src[i]; i += stride; } } -/** - * memset() - */ + template __global__ void memset(attrT* d_dest, const attrT val, const uint32_t length) { const uint32_t stride = blockDim.x * gridDim.x; - uint32_t i = blockDim.x * blockIdx.x + threadIdx.x; + uint32_t i = blockDim.x * blockIdx.x + threadIdx.x; while (i < length) { d_dest[i] = val; i += stride; } } -/** - * atomicAdd() on uint16_t - */ __device__ __forceinline__ uint16_t atomicAdd(uint16_t* address, uint16_t val) { // Taken from // https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCAtomics.cuh#L36 - size_t offset = (size_t)address & 2; + size_t offset = (size_t)address & 2; uint32_t* address_as_ui = (uint32_t*)((char*)address - offset); - bool is_32_align = offset; - uint32_t old = *address_as_ui; + bool is_32_align = offset; + uint32_t old = *address_as_ui; uint32_t old_bytes; uint32_t newval; uint32_t assumed; do { - assumed = old; + assumed = old; old_bytes = is_32_align ? old >> 16 : old & 0xffff; // preserve size in initial cast. Casting directly to uint32_t pads // negative signed values with 1's (e.g. signed -1 = unsigned ~0). newval = static_cast(val + old_bytes); newval = is_32_align ? 
(old & 0xffff) | (newval << 16) : (old & 0xffff0000) | newval; - old = atomicCAS(address_as_ui, assumed, newval); + old = atomicCAS(address_as_ui, assumed, newval); } while (assumed != old); return (is_32_align) ? uint16_t(old >> 16) : uint16_t(old & 0xffff); } -/** - * atomicAdd() on uint8_t - */ + __device__ __forceinline__ uint8_t atomicAdd(uint8_t* address, uint8_t val) { // Taken from // https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCAtomics.cuh#L14 - size_t offset = (size_t)address & 3; + size_t offset = (size_t)address & 3; uint32_t* address_as_ui = (uint32_t*)((char*)address - offset); - uint32_t old = *address_as_ui; - uint32_t shift = offset * 8; + uint32_t old = *address_as_ui; + uint32_t shift = offset * 8; uint32_t old_byte; uint32_t newval; uint32_t assumed; do { - assumed = old; + assumed = old; old_byte = (old >> shift) & 0xff; // preserve size in initial cast. Casting directly to uint32_t pads // negative signed values with 1's (e.g. signed -1 = unsigned ~0). newval = static_cast(val + old_byte); newval = (old & ~(0x000000ff << shift)) | (newval << shift); - old = atomicCAS(address_as_ui, assumed, newval); + old = atomicCAS(address_as_ui, assumed, newval); } while (assumed != old); return uint8_t((old >> shift) & 0xff); @@ -111,7 +101,7 @@ __device__ __forceinline__ unsigned short int atomicCAS( #else // Taken from // https://github.com/rapidsai/cudf/blob/89b802e6cecffe2425048f1f70cd682b865730b8/cpp/include/cudf/detail/utilities/device_atomics.cuh - using T_int = unsigned int; + using T_int = unsigned int; using T_int_short = unsigned short int; bool is_32_align = (reinterpret_cast(address) & 2) ? false : true; @@ -132,7 +122,7 @@ __device__ __forceinline__ unsigned short int atomicCAS( T_int new_value = (is_32_align) ? 
(old & 0xffff0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16); - old = ::atomicCAS(address_uint32, assumed, new_value); + old = ::atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return target_value; @@ -140,9 +130,7 @@ __device__ __forceinline__ unsigned short int atomicCAS( #endif } -/** - * dynamic_smem_size() - */ + __device__ __forceinline__ unsigned dynamic_smem_size() { unsigned ret; @@ -151,4 +139,4 @@ __device__ __forceinline__ unsigned dynamic_smem_size() } -} // namespace RXMESH \ No newline at end of file +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/launch_box.h b/include/rxmesh/launch_box.h index b99198c1..70423d2e 100644 --- a/include/rxmesh/launch_box.h +++ b/include/rxmesh/launch_box.h @@ -1,12 +1,18 @@ +#pragma once #include -namespace RXMESH { +namespace rxmesh { +/** + * @brief Stores different parameters needed to launch kernels i.e., number of + * CUDA blocks and threads, dynamic shared memory. These parameters are meant to + * be calculated by RXMeshStatic and then used by the user to launch kernels + */ template struct LaunchBox { - uint32_t blocks, smem_bytes_dyn, smem_bytes_static, - expected_output_per_block; + uint32_t blocks, num_registers_per_thread; + size_t smem_bytes_dyn, smem_bytes_static; const uint32_t num_threads = blockThreads; }; -} // namespace RXMESH \ No newline at end of file +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/local.h b/include/rxmesh/local.h new file mode 100644 index 00000000..d8b0fadb --- /dev/null +++ b/include/rxmesh/local.h @@ -0,0 +1,77 @@ +#pragma once +#include +#include +#include "rxmesh/util/macros.h" + +namespace rxmesh { + +/** + * @brief Local vertex type (wrapped around uint16_t) + */ +struct LocalVertexT +{ + /** + * @brief Default constructor + */ + __device__ __host__ LocalVertexT() : id(INVALID16) + { + } + + /** + * @brief Constructor using local index + * @param id vertex local index in the owner 
patch + * @return + */ + __device__ __host__ LocalVertexT(uint16_t id) : id(id) + { + } + uint16_t id; +}; + +/** + * @brief Local edge type (wrapped around uint16_t) + */ +struct LocalEdgeT +{ + /** + * @brief Default constructor + */ + __device__ __host__ LocalEdgeT() : id(INVALID16) + { + } + + /** + * @brief Constructor using local index + * @param id edge local index in the owner patch + * @return + */ + __device__ __host__ LocalEdgeT(uint16_t id) : id(id) + { + } + uint16_t id; +}; + +/** + * @brief Local face type (wrapped around uint16_t) + */ +struct LocalFaceT +{ + /** + * @brief Default constructor + */ + __device__ __host__ LocalFaceT() : id(INVALID16) + { + } + + /** + * @brief Constructor using local index + * @param id face local index in the owner patch + * @return + */ + __device__ __host__ LocalFaceT(uint16_t id) : id(id) + { + } + uint16_t id; +}; + +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/patch_info.h b/include/rxmesh/patch_info.h new file mode 100644 index 00000000..f7e3fdd4 --- /dev/null +++ b/include/rxmesh/patch_info.h @@ -0,0 +1,43 @@ +#pragma once +#include +#include +#include +#include +#include "rxmesh/local.h" +#include "rxmesh/util/macros.h" + +namespace rxmesh { + +/** + * @brief PatchInfo stores the information needed for query operations in a + * patch + */ +struct ALIGN(16) PatchInfo +{ + // The topology information: edge incident vertices and face incident edges + LocalVertexT* ev; + LocalEdgeT* fe; + + + // Non-owned mesh elements patch ID + uint32_t* not_owned_patch_v; + uint32_t* not_owned_patch_e; + uint32_t* not_owned_patch_f; + + + // Non-owned mesh elements local ID + LocalVertexT* not_owned_id_v; + LocalEdgeT* not_owned_id_e; + LocalFaceT* not_owned_id_f; + + // Number of mesh elements in the patch + uint16_t num_vertices, num_edges, num_faces; + + // Number of mesh elements owned by this patch + uint16_t num_owned_vertices, num_owned_edges, num_owned_faces; + + // The index of this 
patch (relative to all other patches) + uint32_t patch_id; +}; + +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/patcher/patcher.cu b/include/rxmesh/patcher/patcher.cu index b1599a63..576305cd 100644 --- a/include/rxmesh/patcher/patcher.cu +++ b/include/rxmesh/patcher/patcher.cu @@ -3,36 +3,61 @@ #include #include #include +#include #include "cub/device/device_radix_sort.cuh" #include "cub/device/device_scan.cuh" #include "cuda_profiler_api.h" #include "rxmesh/kernels/util.cuh" #include "rxmesh/patcher/patcher.h" #include "rxmesh/patcher/patcher_kernel.cuh" -#include "rxmesh/util/export_tools.h" #include "rxmesh/util/log.h" #include "rxmesh/util/macros.h" #include "rxmesh/util/timer.h" #include "rxmesh/util/util.h" -extern std::vector> Verts; // TODO remove this -namespace RXMESH { - - -namespace PATCHER { - -//********************** Constructors/Destructors -Patcher::Patcher(uint32_t patch_size, - const std::vector>& fvn, - const uint32_t num_vertices, - const uint32_t num_edges, - const bool is_multi_component /* = true*/, - const bool quite /*=true*/) - : m_patch_size(patch_size), m_fvn(fvn), m_num_vertices(num_vertices), - m_num_edges(num_edges), m_num_faces(fvn.size()), m_num_seeds(0), - m_max_num_patches(0), m_is_multi_component(is_multi_component), - m_quite(quite), m_num_components(0), m_patching_time_ms(0) +namespace rxmesh { + + +namespace patcher { + +Patcher::Patcher(uint32_t patch_size, + const std::vector& ff_offset, + const std::vector& ff_values, + const std::vector>& fv, + const std::unordered_map, + uint32_t, + detail::edge_key_hash> edges_map, + const uint32_t num_vertices, + const uint32_t num_edges, + const bool quite) + : m_patch_size(patch_size), + m_num_patches(0), + m_num_vertices(num_vertices), + m_num_edges(num_edges), + m_num_faces(fv.size()), + m_num_seeds(0), + m_max_num_patches(0), + m_num_components(0), + m_num_lloyd_run(0), + m_d_face_patch(nullptr), + m_d_vertex_patch(nullptr), + 
m_d_edge_patch(nullptr), + m_d_patches_offset(nullptr), + m_d_patches_size(nullptr), + m_d_patches_val(nullptr), + m_patching_time_ms(0.0), + m_d_seeds(nullptr), + m_d_ff_values(nullptr), + m_d_ff_offset(nullptr), + m_d_queue(nullptr), + m_d_queue_ptr(nullptr), + m_d_new_num_patches(nullptr), + m_d_max_patch_size(nullptr), + m_d_cub_temp_storage_scan(nullptr), + m_d_cub_temp_storage_max(nullptr), + m_cub_scan_bytes(0), + m_cub_max_bytes(0) { m_num_patches = @@ -42,11 +67,44 @@ Patcher::Patcher(uint32_t patch_size, m_num_seeds = m_num_patches; - mem_alloc(); + allocate_memory(); + + // degenerate cases + if (m_num_patches <= 1) { + m_patches_offset[0] = m_num_faces; + m_num_seeds = 1; + m_num_components = 1; + m_num_lloyd_run = 0; + for (uint32_t i = 0; i < m_num_faces; ++i) { + m_face_patch[i] = 0; + m_patches_val[i] = i; + } + allocate_device_memory(ff_offset, ff_values); + assign_patch(fv, edges_map); + } else { + + initialize_random_seeds(ff_offset, ff_values); + allocate_device_memory(ff_offset, ff_values); + run_lloyd(); + postprocess(fv, ff_offset, ff_values); + assign_patch(fv, edges_map); + } + + if (!quite) { + print_statistics(); + } } -void Patcher::mem_alloc() +Patcher::~Patcher() { + GPU_FREE(m_d_face_patch); + GPU_FREE(m_d_vertex_patch); + GPU_FREE(m_d_edge_patch); +} + +void Patcher::allocate_memory() +{ + m_seeds.reserve(m_num_seeds); // patches assigned to each face, vertex, and edge m_face_patch.resize(m_num_faces); @@ -61,14 +119,9 @@ void Patcher::mem_alloc() // explicit patches in compressed format m_patches_val.resize(m_num_faces); - // we allow upto double the number of faces due to patch bisecting + // we allow up to double the number of faces due to patch bisecting m_patches_offset.resize(m_max_num_patches); - // used to track the frontier and current seeds - m_frontier.resize(m_num_faces, INVALID32); - m_tf.resize(3); - m_seeds.reserve(m_num_seeds); - // external ribbon. 
it assumes first that all faces will be in there and // then shrink to fit after the construction is done m_ribbon_ext_offset.resize(m_max_num_patches, 0); @@ -76,13 +129,89 @@ void Patcher::mem_alloc() m_ribbon_ext_val.resize(m_num_faces); } -Patcher::~Patcher() +void Patcher::allocate_device_memory(const std::vector& ff_offset, + const std::vector& ff_values) { -} -//************************************************************************** + // ff + CUDA_ERROR(cudaMalloc((void**)&m_d_ff_values, + ff_values.size() * sizeof(uint32_t))); + CUDA_ERROR(cudaMalloc((void**)&m_d_ff_offset, + ff_offset.size() * sizeof(uint32_t))); + + CUDA_ERROR(cudaMemcpy((void**)m_d_ff_values, + ff_values.data(), + ff_values.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice)); + + CUDA_ERROR(cudaMemcpy((void**)m_d_ff_offset, + ff_offset.data(), + ff_offset.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice)); + // face/vertex/edge patch + CUDA_ERROR( + cudaMalloc((void**)&m_d_face_patch, m_num_faces * sizeof(uint32_t))); + CUDA_ERROR(cudaMalloc((void**)&m_d_vertex_patch, + m_num_vertices * sizeof(uint32_t))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_edge_patch, m_num_edges * sizeof(uint32_t))); + + // seeds + CUDA_ERROR( + cudaMalloc((void**)&m_d_seeds, m_max_num_patches * sizeof(uint32_t))); + + CUDA_ERROR(cudaMemcpy((void**)m_d_seeds, + m_seeds.data(), + m_num_patches * sizeof(uint32_t), + cudaMemcpyHostToDevice)); + // utility + // 0 -> queue start + // 1-> queue end + // 2-> next queue end + std::vector h_queue_ptr{0, m_num_patches, m_num_patches}; + CUDA_ERROR(cudaMalloc((void**)&m_d_queue, m_num_faces * sizeof(uint32_t))); + CUDA_ERROR(cudaMalloc((void**)&m_d_queue_ptr, 3 * sizeof(uint32_t))); + CUDA_ERROR(cudaMemcpy(m_d_queue_ptr, + h_queue_ptr.data(), + 3 * sizeof(uint32_t), + cudaMemcpyHostToDevice)); + + // patch offset/size/value and max patch size + CUDA_ERROR(cudaMalloc((void**)&m_d_patches_offset, + m_max_num_patches * sizeof(uint32_t))); + 
CUDA_ERROR(cudaMalloc((void**)&m_d_patches_size, + m_max_num_patches * sizeof(uint32_t))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_patches_val, m_num_faces * sizeof(uint32_t))); + CUDA_ERROR(cudaMalloc((void**)&m_d_max_patch_size, sizeof(uint32_t))); + + CUDA_ERROR(cudaMalloc((void**)&m_d_new_num_patches, sizeof(uint32_t))); + + CUDA_ERROR(cudaMemcpy((void**)m_d_new_num_patches, + &m_num_patches, + sizeof(uint32_t), + cudaMemcpyHostToDevice)); + + // CUB temp memory + m_d_cub_temp_storage_scan = nullptr; + m_d_cub_temp_storage_max = nullptr; + m_cub_scan_bytes = 0; + m_cub_max_bytes = 0; + ::cub::DeviceScan::InclusiveSum(m_d_cub_temp_storage_scan, + m_cub_scan_bytes, + m_d_patches_size, + m_d_patches_offset, + m_max_num_patches); + ::cub::DeviceReduce::Max(m_d_cub_temp_storage_max, + m_cub_max_bytes, + m_d_patches_size, + m_d_max_patch_size, + m_max_num_patches); + CUDA_ERROR( + cudaMalloc((void**)&m_d_cub_temp_storage_scan, m_cub_scan_bytes)); + CUDA_ERROR(cudaMalloc((void**)&m_d_cub_temp_storage_max, m_cub_max_bytes)); +} -//********************** Exporters/Importer void Patcher::print_statistics() { RXMESH_TRACE("Patcher: num_patches = {}", m_num_patches); @@ -94,325 +223,101 @@ void Patcher::print_statistics() RXMESH_TRACE( "Patcher: Parallel patches construction time = {} (ms) and {} " "(ms/lloyd_run)", - m_patching_time_ms, m_patching_time_ms / float(m_num_lloyd_run)); + m_patching_time_ms, + m_patching_time_ms / float(m_num_lloyd_run)); // max-min patch size uint32_t max_patch_size(0), min_patch_size(m_num_faces), avg_patch_size(0); get_max_min_avg_patch_size(min_patch_size, max_patch_size, avg_patch_size); RXMESH_TRACE( "Patcher: max_patch_size= {}, min_patch_size= {}, avg_patch_size= {}", - max_patch_size, min_patch_size, avg_patch_size); + max_patch_size, + min_patch_size, + avg_patch_size); RXMESH_TRACE("Patcher: number external ribbon faces = {} ({:02.2f}%)", - get_num_ext_ribbon_faces(), get_ribbon_overhead()); - - /*std::string filename = 
"patch_dist.txt"; - filename = STRINGIFY(OUTPUT_DIR) + filename; - std::fstream file(filename.c_str(), std::ios::out); - file.precision(15); - for (uint32_t p = 0; p < m_num_patches; p++) { - uint32_t p_size = - m_patches_offset[p] - ((p == 0) ? 0 : m_patches_offset[p - 1]); - file << p_size << "\n"; - } - file.close();*/ + get_num_ext_ribbon_faces(), + get_ribbon_overhead()); } -template -void Patcher::export_ext_ribbon(const std::vector>& Verts, - int patch_id) +void Patcher::initialize_random_seeds(const std::vector& ff_offset, + const std::vector& ff_values) { - uint32_t start = ((patch_id == 0) ? 0 : m_ribbon_ext_offset[patch_id - 1]); - export_face_list("ribbon_ext" + std::to_string(patch_id) + ".obj", m_fvn, - Verts, m_ribbon_ext_offset[patch_id] - start, - m_ribbon_ext_val.data() + start); -} -template -void Patcher::export_patches(const std::vector>& Verts) -{ - export_attribute_VTK("patches.vtk", m_fvn, Verts, 1, m_face_patch.data(), - m_vertex_patch.data(), false); - - /*if (!m_vertex_patch.empty()) { - export_as_cubes_VTK( - "patches_vertex.vtk", m_num_vertices, 0.05f, m_vertex_patch.data(), - [&Verts](uint32_t i) { return Verts[i][0]; }, - [&Verts](uint32_t i) { return Verts[i][1]; }, - [&Verts](uint32_t i) { return Verts[i][2]; }, m_num_patches, false); - }*/ -} + // 1) Identify the components i.e., for each component list the faces + // that belong to that it + // 2) Generate number of (random) seeds in each component + // proportional to the number of faces it contain + std::vector> components; + get_multi_components(components, ff_offset, ff_values); -template -void Patcher::export_components( - const std::vector>& Verts, - const std::vector>& components) -{ - - uint32_t num_components = components.size(); - std::vector rand_color(num_components + 1); - for (uint32_t i = 0; i < num_components; ++i) { - rand_color[i] = float(rand()) / float(RAND_MAX); - } - rand_color[num_components] = 0.0f; - std::vector face_component(m_num_faces, INVALID32); - 
uint32_t comp_id = 0; - for (const auto& comp : components) { - for (const auto& cf : comp) { - assert(face_component[cf] == INVALID32); - face_component[cf] = comp_id; - } - ++comp_id; - } - export_attribute_VTK("components.vtk", m_fvn, Verts, 1, - face_component.data(), face_component.data(), - num_components, false, rand_color.data()); -} - - -template -void Patcher::export_single_patch(const std::vector>& Verts, - int patch_id) -{ - - std::vector vf1(3); - - std::string filename = - STRINGIFY(OUTPUT_DIR) + ("patch_" + std::to_string(patch_id) + ".obj"); - - std::fstream file(filename, std::ios::out); - - for (uint32_t i = 0; i < Verts.size(); ++i) { - file << "v " << Verts[i][0] << " " << Verts[i][1] << " " << Verts[i][2] - << std::endl; - } - - uint32_t p_start = (patch_id == 0) ? 0 : m_patches_offset[patch_id - 1]; - uint32_t p_end = m_patches_offset[patch_id]; - - for (uint32_t fb = p_start; fb < p_end; ++fb) { - uint32_t face = m_patches_val[fb]; - - get_incident_vertices(face, vf1); - - file << "f " << vf1[0] + 1 << " " << vf1[1] + 1 << " " << vf1[2] + 1 - << std::endl; - } -} - -template -void Patcher::export_single_patch_edges( - const std::vector>& Verts, - int patch_id, - EdgeIDFunc get_edge_id) -{ - // export edges of that are assigned to patch_id - - std::string filename = STRINGIFY(OUTPUT_DIR) + - ("patch_edges_" + std::to_string(patch_id) + ".obj"); - - std::fstream file(filename, std::ios::out); - - for (uint32_t i = 0; i < Verts.size(); ++i) { - file << "v " << Verts[i][0] << " " << Verts[i][1] << " " << Verts[i][2] - << std::endl; - } - - - std::vector vf1(3); - - for (uint32_t f = 0; f < m_num_faces; ++f) { - get_incident_vertices(f, vf1); - - uint32_t v1 = vf1.back(); - for (uint32_t v = 0; v < vf1.size(); ++v) { - uint32_t v0 = vf1[v]; - - uint32_t edge_id = get_edge_id(v0, v1); - - if (get_edge_patch_id(edge_id) == patch_id) { - file << "f " << v0 + 1 << " " << v1 + 1 << " " << v0 + 1 - << std::endl; - } - v1 = v0; - } - } -} 
-//************************************************************************** - - -//********************** executer/internal utilities -void Patcher::execute(std::function get_edge_id, - const std::vector>& ef) -{ - - // degenerate cases - if (m_num_patches <= 1) { - m_patches_offset[0] = m_num_faces; - - for (uint32_t i = 0; i < m_num_faces; ++i) { - m_face_patch[i] = 0; - m_patches_val[i] = i; - } - m_neighbour_patches_offset.resize(1, 0); - assign_patch(get_edge_id); - if (!m_quite) { - print_statistics(); - } - return; - } - - parallel_execute(ef); - - postprocess(); - - // export_patches(Verts); - // for (uint32_t i = 0; i < m_num_patches;++i){ - // export_ext_ribbon(Verts, i); - //} - - m_ribbon_ext_val.resize(m_ribbon_ext_offset[m_num_patches - 1]); - - // assign patches to vertices and edges - assign_patch(get_edge_id); - - // export_single_patch_edges(Verts, 0, get_edge_id); - - - if (!m_quite) { - print_statistics(); - } -} - -void Patcher::initialize_cluster_seeds() -{ - // cluster i.e., start from one triangle and grow in bfs style from it - // for experiments only - - double r = double(rand()) / double(RAND_MAX); - uint32_t rand_face = - static_cast(r * static_cast(m_num_faces - 1)); - std::queue qu; - qu.push(rand_face); - - std::vector n_faces(3); - std::vector taken; - taken.push_back(rand_face); - - while (true) { - uint32_t current_face = qu.front(); - qu.pop(); - - m_seeds.push_back(current_face); - - if (m_seeds.size() == m_num_seeds) { - return; - } - - get_adjacent_faces(current_face, n_faces); - - for (uint32_t i = 0; i < n_faces.size(); i++) { - uint32_t ff = n_faces[i]; - if (ff == SPECIAL || ff == INVALID32 || - find_index(ff, taken) != std::numeric_limits::max()) { - continue; - } - qu.push(ff); - taken.push_back(ff); - } - } -} - -void Patcher::initialize_random_seeds() -{ - // random - if (!m_is_multi_component) { + m_num_components = components.size(); + if (m_num_components == 1) { initialize_random_seeds_single_component(); } else { 
- // if multi-component, - // 1) Identify the components i.e., for each component list the faces - // that belong to that it - // 2) Generate number of (random) seeds in each component - // proportional to the number of faces it contain - - std::vector> components; - get_multi_components(components); - - // export_components(Verts, components); - - m_num_components = components.size(); - if (m_num_components == 1) { - initialize_random_seeds_single_component(); + if (m_num_seeds <= m_num_components) { + // we have too many components so we increase the number of + // seeds. this case should not be encountered frequently + // since we generate only one seed per component + m_num_seeds = m_num_components; + for (auto& comp : components) { + generate_random_seed_from_component(comp, 1); + } } else { - if (m_num_seeds <= m_num_components) { - // we have too many components so we increase the number of - // seeds. this case should not be encountered frequently - // since we generate only one seed per component - m_num_seeds = m_num_components; - for (auto& comp : components) { - generate_random_seed_from_component(comp, 1); - } - } else { - // if we have more seeds to give than the number of components, - // then first secure that we have at least one seed per - // component then we calculate the number of extra/remaining - // seeds that will need be added. 
Every component then will have - // a weight proportional to its size that tells how many of - // these remaining seeds it can take - - uint32_t num_remaining_seeds = m_num_seeds - m_num_components; - uint32_t num_extra_seeds_inserted = 0; - - // sort the order of the component to be processed by their size - std::vector component_order(components.size()); - fill_with_sequential_numbers(component_order.data(), - component_order.size()); - std::sort(component_order.begin(), component_order.end(), - [&components](const size_t& a, const size_t& b) { - return components[a].size() > - components[b].size(); - }); - - // process components in descending order with repsect to their - // size - for (size_t c = 0; c < component_order.size(); ++c) { - - std::vector& comp = - components[component_order[c]]; - - uint32_t size = comp.size(); - // this weight tells how many extra faces this component - // have from num_remaining_seeds - float weight = static_cast(size) / - static_cast(m_num_faces); - uint32_t component_num_seeds = - static_cast(std::ceil( - weight * static_cast(num_remaining_seeds))); - - - num_extra_seeds_inserted += component_num_seeds; - if (num_extra_seeds_inserted > num_remaining_seeds) { - if (num_extra_seeds_inserted - num_remaining_seeds > - component_num_seeds) { - component_num_seeds = 0; - } else { - component_num_seeds -= (num_extra_seeds_inserted - - num_remaining_seeds); - } + // if we have more seeds to give than the number of components, + // then first secure that we have at least one seed per + // component then we calculate the number of extra/remaining + // seeds that will need be added. 
Every component then will have + // a weight proportional to its size that tells how many of + // these remaining seeds it can take + + uint32_t num_remaining_seeds = m_num_seeds - m_num_components; + uint32_t num_extra_seeds_inserted = 0; + + // sort the order of the component to be processed by their size + std::vector component_order(components.size()); + fill_with_sequential_numbers(component_order.data(), + component_order.size()); + std::sort(component_order.begin(), + component_order.end(), + [&components](const size_t& a, const size_t& b) { + return components[a].size() > components[b].size(); + }); + + // process components in descending order with respect to their + // size + for (size_t c = 0; c < component_order.size(); ++c) { + + std::vector& comp = components[component_order[c]]; + + uint32_t size = comp.size(); + // this weight tells how many extra faces this component + // have from num_remaining_seeds + float weight = + static_cast(size) / static_cast(m_num_faces); + uint32_t component_num_seeds = static_cast(std::ceil( + weight * static_cast(num_remaining_seeds))); + + + num_extra_seeds_inserted += component_num_seeds; + if (num_extra_seeds_inserted > num_remaining_seeds) { + if (num_extra_seeds_inserted - num_remaining_seeds > + component_num_seeds) { + component_num_seeds = 0; + } else { + component_num_seeds -= + (num_extra_seeds_inserted - num_remaining_seeds); } - - component_num_seeds += 1; - generate_random_seed_from_component(comp, - component_num_seeds); } + + component_num_seeds += 1; + generate_random_seed_from_component(comp, component_num_seeds); } } } - - // export_face_list("seeds.obj", m_fvn, Verts, uint32_t(m_seeds.size()), - // m_seeds.data()); + assert(m_num_patches == m_seeds.size()); } void Patcher::initialize_random_seeds_single_component() @@ -422,8 +327,8 @@ void Patcher::initialize_random_seeds_single_component() fill_with_sequential_numbers(rand_num.data(), rand_num.size()); random_shuffle(rand_num.data(), 
rand_num.size()); m_seeds.resize(m_num_seeds); - std::memcpy(m_seeds.data(), rand_num.data(), - m_num_seeds * sizeof(uint32_t)); + std::memcpy( + m_seeds.data(), rand_num.data(), m_num_seeds * sizeof(uint32_t)); } void Patcher::generate_random_seed_from_component( @@ -441,16 +346,18 @@ void Patcher::generate_random_seed_from_component( random_shuffle(component.data(), component.size()); m_seeds.resize(num_seeds_before + num_seeds); - std::memcpy(m_seeds.data() + num_seeds_before, component.data(), + std::memcpy(m_seeds.data() + num_seeds_before, + component.data(), num_seeds * sizeof(uint32_t)); } void Patcher::get_multi_components( - std::vector>& components) + std::vector>& components, + const std::vector& ff_offset, + const std::vector& ff_values) { - std::vector visited(m_num_faces, false); - std::vector ff(3); + std::vector visited(m_num_faces, false); for (uint32_t f = 0; f < m_num_faces; ++f) { if (!visited[f]) { std::vector current_component; @@ -461,15 +368,16 @@ void Patcher::get_multi_components( std::queue face_queue; face_queue.push(f); while (!face_queue.empty()) { - uint32_t current_face = face_queue.front(); + uint32_t face = face_queue.front(); face_queue.pop(); - get_adjacent_faces(current_face, ff); - - for (const auto& f : ff) { - if (!visited[f]) { - current_component.push_back(f); - face_queue.push(f); - visited[f] = true; + uint32_t start = (face == 0) ? 
0 : ff_offset[face - 1]; + uint32_t end = ff_offset[face]; + for (uint32_t f = start; f < end; ++f) { + uint32_t n_face = ff_values[f]; + if (!visited[n_face]) { + current_component.push_back(n_face); + face_queue.push(n_face); + visited[n_face] = true; } } } @@ -479,10 +387,11 @@ void Patcher::get_multi_components( } } -void Patcher::postprocess() +void Patcher::postprocess(const std::vector>& fv, + const std::vector& ff_offset, + const std::vector& ff_values) { - // Post process the patches by extracting the ribbons and populate the - // neighbour patches storage + // Post process the patches by extracting the ribbons // // For patch P, we start first by identifying boundary faces; faces that has // an edge on P's boundary. These faces are captured by querying the @@ -491,13 +400,11 @@ void Patcher::postprocess() // faces we can extract boundary vertices. We also now know which patch is // neighbor to P. Then we can use the boundary vertices to find the faces // that are incident to these vertices on the neighbor patches + std::vector frontier; + frontier.reserve(m_num_faces); std::vector bd_vertices; bd_vertices.reserve(m_patch_size); - std::vector vf1(3), vf2(3); - - m_neighbour_patches_offset.resize(m_num_patches); - m_neighbour_patches.reserve(m_num_patches * 3); // build vertex incident faces std::vector> vertex_incident_faces( @@ -506,23 +413,18 @@ void Patcher::postprocess() vertex_incident_faces[i].clear(); } for (uint32_t face = 0; face < m_num_faces; ++face) { - get_incident_vertices(face, vf1); - for (uint32_t v = 0; v < vf1.size(); ++v) { - vertex_incident_faces[vf1[v]].push_back(face); + for (uint32_t v = 0; v < fv[face].size(); ++v) { + vertex_incident_faces[fv[face][v]].push_back(face); } } for (uint32_t cur_p = 0; cur_p < m_num_patches; ++cur_p) { uint32_t p_start = (cur_p == 0) ? 0 : m_patches_offset[cur_p - 1]; - uint32_t p_end = m_patches_offset[cur_p]; - - m_neighbour_patches_offset[cur_p] = - (cur_p == 0) ? 
0 : m_neighbour_patches_offset[cur_p - 1]; - uint32_t neighbour_patch_start = m_neighbour_patches_offset[cur_p]; + uint32_t p_end = m_patches_offset[cur_p]; bd_vertices.clear(); - m_frontier.clear(); + frontier.clear(); //***** Pass One @@ -531,49 +433,36 @@ void Patcher::postprocess() for (uint32_t fb = p_start; fb < p_end; ++fb) { uint32_t face = m_patches_val[fb]; - get_adjacent_faces(face, m_tf); + bool added = false; + uint32_t start = (face == 0) ? 0 : ff_offset[face - 1]; + uint32_t end = ff_offset[face]; - bool added = false; - for (uint32_t g = 0; g < m_tf.size(); ++g) { - uint32_t n = m_tf[g]; + for (uint32_t g = start; g < end; ++g) { + uint32_t n = ff_values[g]; uint32_t n_patch = get_face_patch_id(n); // n is boundary face if its patch is not the current patch we // are processing if (n_patch != cur_p) { if (!added) { - m_frontier.push_back(face); + frontier.push_back(face); added = true; } - // add n_patch as a neighbour patch to the current patch - auto itt = std::find( - m_neighbour_patches.begin() + neighbour_patch_start, - m_neighbour_patches.end(), n_patch); - - if (itt == m_neighbour_patches.end()) { - m_neighbour_patches.push_back(n_patch); - ++m_neighbour_patches_offset[cur_p]; - assert(m_neighbour_patches_offset[cur_p] == - m_neighbour_patches.size()); - } - // find/add the boundary vertices; these are the vertices // that are shared between face and n - get_incident_vertices(face, vf1); - get_incident_vertices(n, vf2); - - // add the common vertices in vf1 and vf2 - for (uint32_t i = 0; i < vf1.size(); ++i) { - auto it_vf = std::find(vf2.begin(), vf2.end(), vf1[i]); - if (it_vf != vf2.end()) { - bd_vertices.push_back(vf1[i]); + + // add the common vertices in fv[face] and fv[n] + for (uint32_t i = 0; i < fv[face].size(); ++i) { + auto it_vf = + std::find(fv[n].begin(), fv[n].end(), fv[face][i]); + if (it_vf != fv[n].end()) { + bd_vertices.push_back(fv[face][i]); } } - // we don't break out of this loop because we want to get - // all the 
neighbour patches and boundary vertices + // all the boundary vertices // break; } } @@ -585,13 +474,6 @@ void Patcher::postprocess() inplace_remove_duplicates_sorted(bd_vertices); - // export_as_cubes("cubes" + std::to_string(cur_p) + ".obj", - // bd_vertices.size(), 0.01f, - // [&bd_vertices](uint32_t i) {return Verts[bd_vertices[i]][0]; }, - // [&bd_vertices](uint32_t i) {return Verts[bd_vertices[i]][1]; }, - // [&bd_vertices](uint32_t i) {return Verts[bd_vertices[i]][2]; }); - - //***** Pass Two // 3) for every vertex on the patch boundary, we add all the faces @@ -636,61 +518,37 @@ void Patcher::postprocess() } } } -} - -void Patcher::get_adjacent_faces(uint32_t face_id, - std::vector& ff) const -{ - if (m_fvn.size() != 0) { - // We account here for non-manifold cases where a face might not be - // adjacent to just three faces - uint32_t size = m_fvn[face_id].size() - 3; - ff.resize(size); - std::memcpy(ff.data(), m_fvn[face_id].data() + 3, - size * sizeof(uint32_t)); - } else { - RXMESH_ERROR( - "Patcher::get_adjacent_faces() can not get adjacent faces!!"); - } -} -void Patcher::get_incident_vertices(uint32_t face_id, std::vector& fv) -{ - if (m_fvn.size() != 0) { - fv.resize(3); - std::memcpy(fv.data(), m_fvn[face_id].data(), 3 * sizeof(uint32_t)); - } else { - RXMESH_ERROR( - "Patcher::get_incident_vertices() can not get adjacent faces!!"); - } + m_ribbon_ext_val.resize(m_ribbon_ext_offset[m_num_patches - 1]); } void Patcher::assign_patch( - std::function get_edge_id) + const std::vector>& fv, + const std::unordered_map, + uint32_t, + ::rxmesh::detail::edge_key_hash> edges_map) { // For every patch p, for every face in the patch, find the three edges // that bound that face, and assign them to the patch. For boundary vertices // and edges assign them to one patch (TODO smallest face count). For now, // we assign it to the first patch - std::vector vf1(3); - for (uint32_t cur_p = 0; cur_p < m_num_patches; ++cur_p) { uint32_t p_start = (cur_p == 0) ? 
0 : m_patches_offset[cur_p - 1]; - uint32_t p_end = m_patches_offset[cur_p]; + uint32_t p_end = m_patches_offset[cur_p]; for (uint32_t f = p_start; f < p_end; ++f) { uint32_t face = m_patches_val[f]; - get_incident_vertices(face, vf1); + uint32_t v1 = fv[face].back(); + for (uint32_t v = 0; v < fv[face].size(); ++v) { + uint32_t v0 = fv[face][v]; - uint32_t v1 = vf1.back(); - for (uint32_t v = 0; v < vf1.size(); ++v) { - uint32_t v0 = vf1[v]; - - uint32_t edge_id = get_edge_id(v0, v1); + std::pair key = + ::rxmesh::detail::edge_key(v0, v1); + uint32_t edge_id = edges_map.at(key); if (m_vertex_patch[v0] == INVALID32) { m_vertex_patch[v0] = cur_p; @@ -704,170 +562,23 @@ void Patcher::assign_patch( } } } -} -//********************** Parallel Execute -void Patcher::populate_ff(const std::vector>& ef, - std::vector& h_ff_values, - std::vector& h_ff_offset) -{ - assert(ef.size() == m_num_edges); - uint32_t total_ff_values = 0; - std::vector> h_ff_values_vec; - for (uint32_t f = 0; f < m_num_faces; ++f) { - std::vector ff; - ff.reserve(3); - h_ff_values_vec.push_back(ff); - } - for (uint32_t e = 0; e < ef.size(); ++e) { - for (uint32_t f0 = 0; f0 < ef[e].size() - 1; ++f0) { - uint32_t face0 = ef[e][f0]; - for (uint32_t f1 = f0 + 1; f1 < ef[e].size(); ++f1) { - uint32_t face1 = ef[e][f1]; - total_ff_values += 2; - h_ff_values_vec[face0].push_back(face1); - h_ff_values_vec[face1].push_back(face0); - } - } - } - h_ff_offset.clear(); - h_ff_offset.resize(m_num_faces); - for (uint32_t f = 0; f < m_num_faces; ++f) { - uint32_t s = 0; - if (f != 0) { - s = h_ff_offset[f - 1]; - } - h_ff_offset[f] = s + h_ff_values_vec[f].size(); - } - assert(h_ff_offset.back() == total_ff_values); - h_ff_values.clear(); - h_ff_values.reserve(total_ff_values); - for (uint32_t f = 0; f < m_num_faces; ++f) { - for (uint32_t ff = 0; ff < h_ff_values_vec[f].size(); ff++) { - h_ff_values.push_back(h_ff_values_vec[f][ff]); - } - } + CUDA_ERROR(cudaMemcpy(m_d_edge_patch, + m_edge_patch.data(), + 
sizeof(uint32_t) * (m_num_edges), + cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpy(m_d_vertex_patch, + m_vertex_patch.data(), + sizeof(uint32_t) * (m_num_vertices), + cudaMemcpyHostToDevice)); } -void Patcher::parallel_execute(const std::vector>& ef) +void Patcher::run_lloyd() { - // TODO use streams - // TODO we don't need ef. We only use it to compute FF which we already - // compute in RXMesh build_local method before invoking patcher. - - // adjacent faces - uint32_t *d_ff_values(nullptr), *d_ff_offset(nullptr); - { - std::vector h_ff_values, h_ff_offset; - populate_ff(ef, h_ff_values, h_ff_offset); - assert(h_ff_offset.size() == m_num_faces); - CUDA_ERROR(cudaMalloc((void**)&d_ff_values, - h_ff_values.size() * sizeof(uint32_t))); - CUDA_ERROR(cudaMalloc((void**)&d_ff_offset, - h_ff_offset.size() * sizeof(uint32_t))); - - CUDA_ERROR(cudaMemcpy(d_ff_values, h_ff_values.data(), - h_ff_values.size() * sizeof(uint32_t), - cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy(d_ff_offset, h_ff_offset.data(), - h_ff_offset.size() * sizeof(uint32_t), - cudaMemcpyHostToDevice)); - } - - - // faces patch - uint32_t* d_face_patch = nullptr; - CUDA_ERROR( - cudaMalloc((void**)&d_face_patch, m_num_faces * sizeof(uint32_t))); - - // seeds (allocate m_max_num_patches but copy only m_num_patches) - initialize_random_seeds(); - uint32_t* d_seeds = nullptr; - assert(m_num_patches == m_seeds.size()); - CUDA_ERROR( - cudaMalloc((void**)&d_seeds, m_max_num_patches * sizeof(uint32_t))); - CUDA_ERROR(cudaMemcpy(d_seeds, m_seeds.data(), - m_num_patches * sizeof(uint32_t), - cudaMemcpyHostToDevice)); - - - // queue of size num_faces - // queue_start and queue_end - uint32_t* d_queue = nullptr; - CUDA_ERROR(cudaMalloc((void**)&d_queue, m_num_faces * sizeof(uint32_t))); - - // 0 -> queue start - // 1-> queue end - // 2-> next queue end std::vector h_queue_ptr{0, m_num_patches, m_num_patches}; - uint32_t* d_queue_ptr; - CUDA_ERROR(cudaMalloc((void**)&d_queue_ptr, 3 * sizeof(uint32_t))); - 
CUDA_ERROR(cudaMemcpy(d_queue_ptr, h_queue_ptr.data(), 3 * sizeof(uint32_t), - cudaMemcpyHostToDevice)); - // patches offset, values, and size - uint32_t *d_patches_offset, *d_patches_val, *d_patches_size, - *d_max_patch_size; - CUDA_ERROR(cudaMalloc((void**)&d_patches_offset, - m_max_num_patches * sizeof(uint32_t))); - CUDA_ERROR(cudaMalloc((void**)&d_patches_size, - m_max_num_patches * sizeof(uint32_t))); - CUDA_ERROR( - cudaMalloc((void**)&d_patches_val, m_num_faces * sizeof(uint32_t))); - CUDA_ERROR(cudaMalloc((void**)&d_max_patch_size, sizeof(uint32_t))); - void * d_cub_temp_storage_scan(nullptr), *d_cub_temp_storage_max(nullptr); - size_t cub_temp_storage_bytes_scan = 0; - size_t cub_temp_storage_bytes_max = 0; - ::cub::DeviceScan::InclusiveSum(d_cub_temp_storage_scan, - cub_temp_storage_bytes_scan, d_patches_size, - d_patches_offset, m_max_num_patches); - ::cub::DeviceReduce::Max(d_cub_temp_storage_max, cub_temp_storage_bytes_max, - d_patches_size, d_max_patch_size, - m_max_num_patches); - CUDA_ERROR(cudaMalloc((void**)&d_cub_temp_storage_scan, - cub_temp_storage_bytes_scan)); - CUDA_ERROR(cudaMalloc((void**)&d_cub_temp_storage_max, - cub_temp_storage_bytes_max)); - - // Lloyd iterations loop - uint32_t* d_new_num_patches = nullptr; - CUDA_ERROR(cudaMalloc((void**)&d_new_num_patches, sizeof(uint32_t))); - CUDA_ERROR(cudaMemcpy(d_new_num_patches, &m_num_patches, sizeof(uint32_t), - cudaMemcpyHostToDevice)); - - /* const char separator = ' '; - const int numWidth = 15; - if (!m_quite) { - std::cout << std::endl; - std::cout << std::left << std::setw(numWidth) << std::setfill(separator) - << "iter"; - std::cout << std::left << std::setw(numWidth) << std::setfill(separator) - << "#"; - std::cout << std::left << std::setw(numWidth) << std::setfill(separator) - << "avg"; - std::cout << std::left << std::setw(numWidth) << std::setfill(separator) - << "stddev"; - std::cout << std::left << std::setw(numWidth) << std::setfill(separator) - << "max"; - std::cout << 
std::left << std::setw(numWidth) << std::setfill(separator) - << "min" << std::endl - << std::endl; - }*/ - - /*auto draw = [&]() { - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaMemcpy(m_face_patch, d_face_patch, - m_num_faces * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - for (uint32_t i = 0; i < m_num_faces; ++i) { - m_face_patch[i] = m_face_patch[i] >> 1; - } - export_patches(Verts); - };*/ - - - CUDA_ERROR(cudaProfilerStart()); + //CUDA_ERROR(cudaProfilerStart()); GPUTimer timer; timer.start(); @@ -876,158 +587,101 @@ void Patcher::parallel_execute(const std::vector>& ef) ++m_num_lloyd_run; const uint32_t threads_s = 256; - const uint32_t blocks_s = DIVIDE_UP(m_num_patches, threads_s); + const uint32_t blocks_s = DIVIDE_UP(m_num_patches, threads_s); const uint32_t threads_f = 256; - const uint32_t blocks_f = DIVIDE_UP(m_num_faces, threads_f); + const uint32_t blocks_f = DIVIDE_UP(m_num_faces, threads_f); // add more seeds if needed if (m_num_lloyd_run % 5 == 0 && m_num_lloyd_run > 0) { uint32_t threshold = m_patch_size; - /*{ - //add new seeds only to the top 10% large patches - CUDA_ERROR(cudaMemcpy(m_patches_offset.data(), d_patches_offset, - m_num_patches * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - std::vector sorted_patches(m_num_patches); - for (uint32_t p = 0; p < m_num_patches; ++p) { - sorted_patches[p] = (p == 0) ? 
0 : m_patches_offset[p - 1]; - sorted_patches[p] = m_patches_offset[p] - sorted_patches[p]; - } - std::sort(sorted_patches.begin(), sorted_patches.end()); - auto dd = std::upper_bound(sorted_patches.begin(), - sorted_patches.end(), m_patch_size); - uint32_t large_patches_start = dd - sorted_patches.begin(); - uint32_t large_patches_num = - sorted_patches.size() - large_patches_start; - threshold = sorted_patches[sorted_patches.size() - - 0.1 * large_patches_num] - - 1; - }*/ - - CUDA_ERROR(cudaMemcpy(d_new_num_patches, &m_num_patches, - sizeof(uint32_t), cudaMemcpyHostToDevice)); - add_more_seeds<<>>( - m_num_patches, d_new_num_patches, d_seeds, d_patches_offset, - d_patches_val, threshold); - - CUDA_ERROR(cudaMemcpy(&m_num_patches, d_new_num_patches, - sizeof(uint32_t), cudaMemcpyDeviceToHost)); + CUDA_ERROR(cudaMemcpy(m_d_new_num_patches, + &m_num_patches, + sizeof(uint32_t), + cudaMemcpyHostToDevice)); + add_more_seeds<<>>(m_num_patches, + m_d_new_num_patches, + m_d_seeds, + m_d_patches_offset, + m_d_patches_val, + threshold); + + CUDA_ERROR(cudaMemcpy(&m_num_patches, + m_d_new_num_patches, + sizeof(uint32_t), + cudaMemcpyDeviceToHost)); if (m_num_patches >= m_max_num_patches) { RXMESH_ERROR( - "Patcher::parallel_execute() m_num_patches exceeds " + "Patcher::run_lloyd() m_num_patches exceeds " "m_max_num_patches"); } } h_queue_ptr[0] = 0; h_queue_ptr[1] = m_num_patches; h_queue_ptr[2] = m_num_patches; - CUDA_ERROR(cudaMemcpy(d_queue_ptr, h_queue_ptr.data(), - 3 * sizeof(uint32_t), cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpy(m_d_queue_ptr, + h_queue_ptr.data(), + 3 * sizeof(uint32_t), + cudaMemcpyHostToDevice)); - RXMESH::memset<<>>(d_face_patch, INVALID32, - m_num_faces); + rxmesh::memset<<>>( + m_d_face_patch, INVALID32, m_num_faces); - RXMESH::memcpy<<>>(d_queue, d_seeds, - m_num_patches); + rxmesh::memcpy<<>>( + m_d_queue, m_d_seeds, m_num_patches); - RXMESH::memset<<>>(d_patches_size, 0u, - m_num_patches); + rxmesh::memset<<>>( + m_d_patches_size, 0u, 
m_num_patches); write_initial_face_patch<<>>( - m_num_patches, d_face_patch, d_seeds, d_patches_size); + m_num_patches, m_d_face_patch, m_d_seeds, m_d_patches_size); // Cluster seed propagation while (true) { // Launch enough threads to cover all the faces. However, only // subset will do actual work depending on the queue size - cluster_seed_propagation<<>>( - m_num_faces, m_num_patches, d_queue_ptr, d_queue, d_face_patch, - d_patches_size, d_ff_offset, d_ff_values); - - reset_queue_ptr<<<1, 1>>>(d_queue_ptr); - - CUDA_ERROR(cudaMemcpy(h_queue_ptr.data(), d_queue_ptr, - sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cluster_seed_propagation<<>>(m_num_faces, + m_num_patches, + m_d_queue_ptr, + m_d_queue, + m_d_face_patch, + m_d_patches_size, + m_d_ff_offset, + m_d_ff_values); + + reset_queue_ptr<<<1, 1>>>(m_d_queue_ptr); + + CUDA_ERROR(cudaMemcpy(h_queue_ptr.data(), + m_d_queue_ptr, + sizeof(uint32_t), + cudaMemcpyDeviceToHost)); if (h_queue_ptr[0] >= m_num_faces) { break; } } - - uint32_t max_patch_size = construct_patches_compressed_parallel( - d_cub_temp_storage_max, cub_temp_storage_bytes_max, d_patches_size, - d_max_patch_size, d_cub_temp_storage_scan, - cub_temp_storage_bytes_scan, d_patches_offset, d_face_patch, - d_patches_val); - - // draw(); - - - /*uint32_t* d_second_queue; - { - CUDA_ERROR(cudaMalloc((void**)&d_second_queue, - m_num_faces * sizeof(uint32_t))); - CUDA_ERROR( - cudaMemset(d_second_queue, 0, m_num_faces * sizeof(uint32_t))); - }*/ + uint32_t max_patch_size = construct_patches_compressed_format(); // Interior - uint32_t threads_i = 512; + uint32_t threads_i = 512; uint32_t shmem_bytes = max_patch_size * (sizeof(uint32_t)); - RXMESH::memset<<>>(d_queue, INVALID32, - m_num_faces); - interior<<>>( - m_num_patches, d_patches_offset, d_patches_val, d_face_patch, - d_seeds, d_ff_offset, d_ff_values, d_queue /*, d_second_queue*/); - - /*{ - std::vector second_queue(m_num_faces); - CUDA_ERROR(cudaMemcpy(second_queue.data(), d_second_queue, - 
m_num_faces * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - std::vector face_list; - for (uint32_t i = 0; i < second_queue.size(); ++i) { - if (second_queue[i] == 1) { - face_list.push_back(i); - } - } - export_face_list("second_queue.obj", m_fvn, Verts, - uint32_t(face_list.size()), face_list.data()); - }*/ - /*{ - printf("\n d_face_patch"); - ::RXMESH::print_arr_host<<<1, 1>>>(m_num_faces, d_face_patch); - CUDA_ERROR(cudaDeviceSynchronize()); - }*/ - - /* if (!m_quite) { - CUDA_ERROR(cudaDeviceSynchronize()); - double my_avg(0), my_stddev(0); - uint32_t my_max(0), my_min(0); - CUDA_ERROR(cudaMemcpy(m_patches_offset.data(), d_patches_offset, - m_num_patches * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - compute_avg_stddev_max_min_rs(m_patches_offset.data(), - m_num_patches, my_avg, my_stddev, - my_max, my_min); - std::cout << std::left << std::setw(numWidth) - << std::setfill(separator) << m_num_lloyd_run; - std::cout << std::left << std::setw(numWidth) - << std::setfill(separator) << m_num_patches; - std::cout << std::left << std::setw(numWidth) - << std::setfill(separator) << my_avg; - std::cout << std::left << std::setw(numWidth) - << std::setfill(separator) << my_stddev; - std::cout << std::left << std::setw(numWidth) - << std::setfill(separator) << my_max; - std::cout << std::left << std::setw(numWidth) - << std::setfill(separator) << my_min << std::endl; - }*/ + rxmesh::memset<<>>( + m_d_queue, INVALID32, m_num_faces); + interior<<>>(m_num_patches, + m_d_patches_offset, + m_d_patches_val, + m_d_face_patch, + m_d_seeds, + m_d_ff_offset, + m_d_ff_values, + m_d_queue); if (max_patch_size < m_patch_size) { + shift<<>>( + m_num_faces, m_d_face_patch, m_d_patches_val); + break; } } @@ -1037,105 +691,88 @@ void Patcher::parallel_execute(const std::vector>& ef) CUDA_ERROR(cudaDeviceSynchronize()); CUDA_ERROR(cudaGetLastError()); m_patching_time_ms = timer.elapsed_millis(); - CUDA_ERROR(cudaProfilerStop()); + //CUDA_ERROR(cudaProfilerStop()); // move data to 
host m_num_seeds = m_num_patches; m_seeds.resize(m_num_seeds); - CUDA_ERROR(cudaMemcpy(m_seeds.data(), d_seeds, + CUDA_ERROR(cudaMemcpy(m_seeds.data(), + m_d_seeds, m_num_seeds * sizeof(uint32_t), cudaMemcpyDeviceToHost)); - CUDA_ERROR(cudaMemcpy(m_face_patch.data(), d_face_patch, + CUDA_ERROR(cudaMemcpy(m_face_patch.data(), + m_d_face_patch, sizeof(uint32_t) * m_num_faces, cudaMemcpyDeviceToHost)); m_patches_offset.resize(m_num_patches); - CUDA_ERROR(cudaMemcpy(m_patches_offset.data(), d_patches_offset, + CUDA_ERROR(cudaMemcpy(m_patches_offset.data(), + m_d_patches_offset, sizeof(uint32_t) * m_num_patches, cudaMemcpyDeviceToHost)); - CUDA_ERROR(cudaMemcpy(m_patches_val.data(), d_patches_val, + CUDA_ERROR(cudaMemcpy(m_patches_val.data(), + m_d_patches_val, sizeof(uint32_t) * m_num_faces, cudaMemcpyDeviceToHost)); + GPU_FREE(m_d_ff_values); + GPU_FREE(m_d_ff_offset); - // draw(); + GPU_FREE(m_d_new_num_patches); + GPU_FREE(m_d_max_patch_size); - for (uint32_t i = 0; i < m_num_faces; ++i) { - m_face_patch[i] = m_face_patch[i] >> 1; - m_patches_val[i] = m_patches_val[i] >> 1; - } + GPU_FREE(m_d_cub_temp_storage_scan); + GPU_FREE(m_d_cub_temp_storage_max); + m_cub_max_bytes = 0; + m_cub_scan_bytes = 0; + GPU_FREE(m_d_seeds); + GPU_FREE(m_d_queue); + GPU_FREE(m_d_queue_ptr); - GPU_FREE(d_ff_values); - GPU_FREE(d_ff_offset); - GPU_FREE(d_face_patch); - GPU_FREE(d_seeds); - GPU_FREE(d_queue); - GPU_FREE(d_patches_offset); - GPU_FREE(d_patches_size); - GPU_FREE(d_patches_val); - GPU_FREE(d_queue_ptr); - GPU_FREE(d_cub_temp_storage_scan); - GPU_FREE(d_cub_temp_storage_max); - GPU_FREE(d_max_patch_size); - GPU_FREE(d_new_num_patches); + GPU_FREE(m_d_patches_offset); + GPU_FREE(m_d_patches_size); + GPU_FREE(m_d_patches_val); } -uint32_t Patcher::construct_patches_compressed_parallel( - void* d_cub_temp_storage_max, - size_t cub_temp_storage_bytes_max, - uint32_t* d_patches_size, - uint32_t* d_max_patch_size, - void* d_cub_temp_storage_scan, - size_t 
cub_temp_storage_bytes_scan, - uint32_t* d_patches_offset, - uint32_t* d_face_patch, - uint32_t* d_patches_val) +uint32_t Patcher::construct_patches_compressed_format() { uint32_t max_patch_size = 0; - const uint32_t threads_s = 256; - const uint32_t blocks_s = DIVIDE_UP(m_num_patches, threads_s); - const uint32_t threads_f = 256; - const uint32_t blocks_f = DIVIDE_UP(m_num_faces, threads_f); + const uint32_t threads_s = 256; + const uint32_t blocks_s = DIVIDE_UP(m_num_patches, threads_s); + const uint32_t threads_f = 256; + const uint32_t blocks_f = DIVIDE_UP(m_num_faces, threads_f); // Compute max patch size max_patch_size = 0; - ::cub::DeviceReduce::Max(d_cub_temp_storage_max, cub_temp_storage_bytes_max, - d_patches_size, d_max_patch_size, m_num_patches); - CUDA_ERROR(cudaMemcpy(&max_patch_size, d_max_patch_size, sizeof(uint32_t), + ::cub::DeviceReduce::Max(m_d_cub_temp_storage_max, + m_cub_max_bytes, + m_d_patches_size, + m_d_max_patch_size, + m_num_patches); + CUDA_ERROR(cudaMemcpy(&max_patch_size, + m_d_max_patch_size, + sizeof(uint32_t), cudaMemcpyDeviceToHost)); // Construct compressed patches - ::cub::DeviceScan::InclusiveSum(d_cub_temp_storage_scan, - cub_temp_storage_bytes_scan, d_patches_size, - d_patches_offset, m_num_patches); - RXMESH::memset<<>>(d_patches_size, 0u, m_num_patches); - - construct_patches_compressed<<>>( - m_num_faces, d_face_patch, m_num_patches, d_patches_offset, - d_patches_size, d_patches_val); + ::cub::DeviceScan::InclusiveSum(m_d_cub_temp_storage_scan, + m_cub_scan_bytes, + m_d_patches_size, + m_d_patches_offset, + m_num_patches); + rxmesh::memset<<>>( + m_d_patches_size, 0u, m_num_patches); + + construct_patches_compressed<<>>(m_num_faces, + m_d_face_patch, + m_num_patches, + m_d_patches_offset, + m_d_patches_size, + m_d_patches_val); return max_patch_size; } -//************************************************************************** - - -template void Patcher::export_single_patch( - const std::vector>& Verts, - int patch_id); 
-template void Patcher::export_single_patch( - const std::vector>& Verts, - int patch_id); -template void Patcher::export_patches( - const std::vector>& Verts); -template void Patcher::export_patches( - const std::vector>& Verts); -template void Patcher::export_ext_ribbon( - const std::vector>& Verts, - int patch_id); -template void Patcher::export_ext_ribbon( - const std::vector>& Verts, - int patch_id); - -} // namespace PATCHER -} // namespace RXMESH \ No newline at end of file + +} // namespace patcher +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/patcher/patcher.h b/include/rxmesh/patcher/patcher.h index 028a8fb7..47ea1230 100644 --- a/include/rxmesh/patcher/patcher.h +++ b/include/rxmesh/patcher/patcher.h @@ -2,47 +2,37 @@ #include #include -namespace RXMESH { +#include +#include "rxmesh/util/util.h" -namespace PATCHER { +namespace rxmesh { +namespace patcher { + +/** + * @brief Takes an input mesh and partition it to patches using Lloyd algorithm + * on the gpu + */ class Patcher { public: + Patcher() = default; + Patcher(uint32_t patch_size, - const std::vector>& fvn, - const uint32_t num_vertices, - const uint32_t num_edges, - const bool is_multi_component = true, - const bool quite = true); - - void execute(std::function get_edge_id, - const std::vector>& ef); - - template - void export_patches(const std::vector>& Verts); - - template - void export_components( - const std::vector>& Verts, - const std::vector>& components); - - template - void export_ext_ribbon(const std::vector>& Verts, - int patch_id); - - template - void export_single_patch(const std::vector>& Verts, - int patch_id); - - template - void export_single_patch_edges(const std::vector>& Verts, - int patch_id, - EdgeIDFunc get_edge_id); - void print_statistics(); + const std::vector& ff_offset, + const std::vector& ff_values, + const std::vector>& fv, + const std::unordered_map, + uint32_t, + ::rxmesh::detail::edge_key_hash> edges_map, + const uint32_t 
num_vertices, + const uint32_t num_edges, + const bool quite); + + virtual ~Patcher(); + void print_statistics(); - //********************** Getter uint32_t get_num_patches() const { return m_num_patches; @@ -58,6 +48,21 @@ class Patcher return m_face_patch; } + uint32_t* get_device_face_patch() + { + return m_d_face_patch; + } + + uint32_t* get_device_vertex_patch() + { + return m_d_vertex_patch; + } + + uint32_t* get_device_edge_patch() + { + return m_d_edge_patch; + } + std::vector& get_vertex_patch() { return m_vertex_patch; @@ -78,16 +83,6 @@ class Patcher return m_patches_offset.data(); } - uint32_t* get_neighbour_patches() - { - return m_neighbour_patches.data(); - } - - uint32_t* get_neighbour_patches_offset() - { - return m_neighbour_patches_offset.data(); - } - std::vector& get_external_ribbon_val() { return m_ribbon_ext_val; @@ -154,75 +149,88 @@ class Patcher { return m_num_lloyd_run; } - //************************************************************************** - - - ~Patcher(); private: - void mem_alloc(); - - void assign_patch(std::function get_edge_id); - - void initialize_cluster_seeds(); - void initialize_random_seeds(); - void get_multi_components(std::vector>& components); + /** + * @brief Allocate various auxiliary memory needed to store patches info on + * the host + */ + void allocate_memory(); + + /** + * @brief Allocate various temporarily memory on the device needed to + * compute patches on the device + * @param ff_offset offset indicate start (and end) to index ff_values to + * get face-incident-faces + * @param ff_values stores face-incident-faces in compressed format + */ + void allocate_device_memory(const std::vector& ff_offset, + const std::vector& ff_values); + + void assign_patch( + const std::vector>& fv, + const std::unordered_map, + uint32_t, + ::rxmesh::detail::edge_key_hash> edges_map); + + void initialize_random_seeds(const std::vector& ff_offset, + const std::vector& ff_values); + + void get_multi_components(std::vector>& 
components, + const std::vector& ff_offset, + const std::vector& ff_values); void initialize_random_seeds_single_component(); void generate_random_seed_from_component(std::vector& component, uint32_t num_seeds); - void postprocess(); - void get_adjacent_faces(uint32_t face_id, std::vector& ff) const; - void get_incident_vertices(uint32_t face_id, std::vector& fv); - - void populate_ff(const std::vector>& ef, - std::vector& h_ff_values, - std::vector& h_ff_offset); - uint32_t construct_patches_compressed_parallel( - void* d_cub_temp_storage_max, - size_t cub_temp_storage_bytes_max, - uint32_t* d_patches_size, - uint32_t* d_max_patch_size, - void* d_cub_temp_storage_scan, - size_t cub_temp_storage_bytes_scan, - uint32_t* d_patches_offset, - uint32_t* d_face_patch, - uint32_t* d_patches_val); - void parallel_execute(const std::vector>& ef); - //******** - - const std::vector>& m_fvn; - - uint32_t m_patch_size; - uint32_t m_num_patches, m_num_vertices, m_num_edges, m_num_faces, - m_num_seeds, m_max_num_patches; + void postprocess(const std::vector>& fv, + const std::vector& ff_offset, + const std::vector& ff_values); + + uint32_t construct_patches_compressed_format(); + + void run_lloyd(); + + + uint32_t m_patch_size, m_num_patches, m_num_vertices, m_num_edges, + m_num_faces, m_num_seeds, m_max_num_patches, m_num_components, + m_num_lloyd_run; // store the face, vertex, edge patch std::vector m_face_patch, m_vertex_patch, m_edge_patch; + uint32_t * m_d_face_patch, *m_d_vertex_patch, *m_d_edge_patch; - bool m_is_multi_component; - bool m_quite; - - uint32_t m_num_components; // Stores the patches in compressed format std::vector m_patches_val, m_patches_offset; - //Stores ribbon in compressed format - std::vector m_ribbon_ext_val, m_ribbon_ext_offset; + // deallocated immediately after computing patches + uint32_t *m_d_patches_offset, *m_d_patches_size, *m_d_patches_val; - //Stores neighbour patches in compressed format - std::vector m_neighbour_patches, 
m_neighbour_patches_offset; + // Stores ribbon in compressed format + std::vector m_ribbon_ext_val, m_ribbon_ext_offset; // caching the time taken to construct the patches float m_patching_time_ms; - // utility vectors - std::vector m_frontier, m_tf, m_seeds; - uint32_t m_num_lloyd_run = 0; - //******** + std::vector m_seeds; + + // (deallocated immediately after computing patches) + uint32_t* m_d_seeds; + + // stores ff on the device (deallocated immediately after computing patches) + uint32_t *m_d_ff_values, *m_d_ff_offset; + + // utility used during creating patches (deallocated immediately after + // computing patches) + uint32_t *m_d_queue, *m_d_queue_ptr, *m_d_new_num_patches, + *m_d_max_patch_size; + + // CUB temp memory(deallocated immediately after computing patches) + void * m_d_cub_temp_storage_scan, *m_d_cub_temp_storage_max; + size_t m_cub_scan_bytes, m_cub_max_bytes; }; -} // namespace PATCHER -} // namespace RXMESH \ No newline at end of file +} // namespace patcher +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/patcher/patcher_kernel.cuh b/include/rxmesh/patcher/patcher_kernel.cuh index 479e7882..13aa8fca 100644 --- a/include/rxmesh/patcher/patcher_kernel.cuh +++ b/include/rxmesh/patcher/patcher_kernel.cuh @@ -1,10 +1,21 @@ #pragma once #include "rxmesh/kernels/collective.cuh" -namespace RXMESH { +namespace rxmesh { -namespace PATCHER { +namespace patcher { +__global__ static void shift(const uint32_t num_faces, + uint32_t* face_patch, + uint32_t* patches_val) +{ + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + while (tid < num_faces) { + face_patch[tid] = face_patch[tid] >> 1; + patches_val[tid] = patches_val[tid] >> 1; + tid += blockDim.x * gridDim.x; + } +} __device__ __forceinline__ const uint32_t* get_face_faces( const uint32_t* d_ff_offset, @@ -29,7 +40,7 @@ __global__ static void write_initial_face_patch(const uint32_t num_seeds, { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; while (tid < 
num_seeds) { - uint32_t seed = d_seeds[tid]; + uint32_t seed = d_seeds[tid]; d_face_patch[seed] = tid << 1; assert(d_patches_size[tid] == 0); d_patches_size[tid] = 1; @@ -57,12 +68,12 @@ __global__ static void cluster_seed_propagation(const uint32_t num_faces, // first bit in d_face_patch is reserved for 'is boundary face' uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; - uint32_t current_queue_end = d_queue_ptr[1]; + uint32_t current_queue_end = d_queue_ptr[1]; uint32_t current_queue_start = d_queue_ptr[0]; while (tid >= current_queue_start && tid < current_queue_end) { - uint32_t face_id = d_queue[tid]; + uint32_t face_id = d_queue[tid]; uint32_t face_patch = d_face_patch[face_id] >> 1; - uint32_t ff_len = 0; + uint32_t ff_len = 0; const uint32_t* ff_ptr = get_face_faces(d_ff_offset, d_ff_values, face_id, ff_len); @@ -71,8 +82,8 @@ __global__ static void cluster_seed_propagation(const uint32_t num_faces, for (uint32_t i = 0; i < ff_len; i++) { uint32_t n_face = ff_ptr[i]; - uint32_t assumed = ::atomicCAS(&d_face_patch[n_face], INVALID32, - (face_patch << 1)); + uint32_t assumed = ::atomicCAS( + &d_face_patch[n_face], INVALID32, (face_patch << 1)); assert((assumed >> 1) < num_patches || assumed == INVALID32); if (assumed == INVALID32) { @@ -90,8 +101,8 @@ __global__ static void cluster_seed_propagation(const uint32_t num_faces, } } - face_patch = face_patch << 1; - face_patch = face_patch | is_boundary; + face_patch = face_patch << 1; + face_patch = face_patch | is_boundary; d_face_patch[face_id] = face_patch; tid += blockDim.x * gridDim.x; @@ -108,15 +119,15 @@ __global__ static void construct_patches_compressed( { uint32_t face = threadIdx.x + blockIdx.x * blockDim.x; while (face < num_faces) { - uint32_t patch_id = d_face_patch[face]; + uint32_t patch_id = d_face_patch[face]; uint32_t is_boundary = patch_id & 1; - patch_id = patch_id >> 1; + patch_id = patch_id >> 1; uint32_t pos = ::atomicAdd(&d_patches_size[patch_id], uint32_t(1)); if (patch_id != 0) { 
pos += d_patches_offset[patch_id - 1]; } uint32_t res = face << 1; - res = res | is_boundary; + res = res | is_boundary; assert(pos < num_faces); assert(face < ((num_faces << 1) | 1)); d_patches_val[pos] = res; @@ -148,9 +159,9 @@ __global__ static void interior(const uint32_t num_patches, const uint32_t patch_id = blockIdx.x; const uint32_t p_start = (patch_id == 0) ? 0 : d_patches_offset[patch_id - 1]; - const uint32_t p_end = d_patches_offset[patch_id]; + const uint32_t p_end = d_patches_offset[patch_id]; const uint32_t p_size = p_end - p_start; - uint32_t tid = threadIdx.x; + uint32_t tid = threadIdx.x; extern __shared__ uint32_t s_queue[]; @@ -167,7 +178,7 @@ __global__ static void interior(const uint32_t num_patches, /*if (blockIdx.x == 1) { d_second_queue[face >> 1] = 1; }*/ - s_queue[pos] = face; + s_queue[pos] = face; d_queue[face] = 0; } tid += blockDim.x; @@ -177,14 +188,14 @@ __global__ static void interior(const uint32_t num_patches, // if there is no boundary, it means that the patch is a single // component. Pick any face as a seed, nobody cares! 
if (s_queue_size > 0) { - uint32_t queue_end = 0; + uint32_t queue_end = 0; uint32_t queue_start = 0; while (true) { // loop++; queue_start = queue_end; - queue_end = s_queue_size; + queue_end = s_queue_size; /*if (threadIdx.x == 0 && patch_id == 0) { printf( @@ -202,15 +213,15 @@ __global__ static void interior(const uint32_t num_patches, tid = threadIdx.x; while (tid < queue_end - queue_start) { - uint32_t face = s_queue[tid + queue_start]; + uint32_t face = s_queue[tid + queue_start]; uint32_t ff_len = 0; const uint32_t* ff_ptr = get_face_faces(d_ff_offset, d_ff_values, face, ff_len); for (uint32_t i = 0; i < ff_len; ++i) { uint32_t n_face = ff_ptr[i]; if (d_face_patch[n_face] >> 1 == patch_id) { - uint32_t assumed = ::atomicCAS(d_queue + n_face, - INVALID32, patch_id); + uint32_t assumed = ::atomicCAS( + d_queue + n_face, INVALID32, patch_id); if (assumed == INVALID32) { uint32_t pos = ::atomicAdd(&s_queue_size, uint32_t(1)); @@ -253,7 +264,7 @@ __global__ static void add_more_seeds(const uint32_t num_patches, uint32_t patch_id = blockIdx.x; const uint32_t p_start = (patch_id == 0) ? 0 : d_patches_offset[patch_id - 1]; - const uint32_t p_end = d_patches_offset[patch_id]; + const uint32_t p_end = d_patches_offset[patch_id]; const uint32_t p_size = p_end - p_start; if (p_size > threshold) { @@ -275,5 +286,5 @@ __global__ static void add_more_seeds(const uint32_t num_patches, } } } -} // namespace PATCHER -} // namespace RXMESH \ No newline at end of file +} // namespace patcher +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/reduce_handle.h b/include/rxmesh/reduce_handle.h new file mode 100644 index 00000000..a44bb9ce --- /dev/null +++ b/include/rxmesh/reduce_handle.h @@ -0,0 +1,140 @@ +#pragma once + +#include "rxmesh/attribute.h" +#include "rxmesh/kernels/attribute.cuh" + +namespace rxmesh { + +/** + * @brief This class is used to compute different reduction operations on + * Attribute. 
/**
 * @brief Handle for running reduction operations (dot product, L2 norm) on an
 * Attribute. The handle owns the temporary device memory needed by the
 * two-stage reduction (a per-patch partial-result buffer plus CUB scratch
 * space) so that Attribute itself does not have to manage reduction memory.
 * To create a new ReduceHandle, use create_reduce_handle() from Attribute
 * @tparam T The type of the attribute
 */
template <class T>
class ReduceHandle
{

   public:
    ReduceHandle() = default;

    /**
     * @brief Copy constructor. Allocates fresh internal buffers rather than
     * aliasing the other handle's device pointers; a defaulted copy would
     * let both destructors free the same allocations (double free).
     */
    ReduceHandle(const ReduceHandle& other)
        : m_num_patches(other.m_num_patches)
    {
        alloc_internal();
    }

    /**
     * @brief Constructor which allocates internal memory used in all reduce
     * operations
     * @param attr one of Attribute used for subsequent reduction
     * operations
     */
    ReduceHandle(const Attribute<T>& attr) : m_num_patches(attr.m_num_patches)
    {
        alloc_internal();
    }

    ~ReduceHandle()
    {
        GPU_FREE(m_d_reduce_1st_stage);
        GPU_FREE(m_d_reduce_2nd_stage);
        GPU_FREE(m_d_reduce_temp_storage);
        m_reduce_temp_storage_bytes = 0;
    }

    /**
     * @brief compute dot product between two input attributes and return the
     * output on the host
     * @param attr1 first input attribute
     * @param attr2 second input attribute
     * @param stream stream to run the computation on
     * @return the output of dot product on the host
     */
    T dot(const Attribute<T>& attr1,
          const Attribute<T>& attr2,
          cudaStream_t        stream = NULL)
    {
        if ((attr1.get_allocated() & DEVICE) != DEVICE ||
            (attr2.get_allocated() & DEVICE) != DEVICE) {
            RXMESH_ERROR(
                "ReduceHandle::dot() input attributes should be "
                "allocated on the device");
        }

        // One block per patch; each block writes its partial result into
        // m_d_reduce_1st_stage[patch]
        // NOTE(review): launch config reconstructed — confirm the block size
        // matches what detail::dot_kernel expects
        detail::dot_kernel<T, m_block_threads>
            <<<m_num_patches, m_block_threads, 0, stream>>>(
                attr1,
                attr2,
                attr1.m_d_element_per_patch,
                m_num_patches,
                attr1.get_num_attributes(),
                m_d_reduce_1st_stage);
        CUDA_ERROR(cudaGetLastError());

        return reduce_2nd_stage(stream);
    }

    /**
     * @brief compute the L2 norm of the input attribute and return the output
     * on the host
     * @param attr input attribute
     * @param stream stream to run the computation on
     * @return the output of L2 norm on the host
     */
    T norm2(const Attribute<T>& attr, cudaStream_t stream = NULL)
    {
        if ((attr.get_allocated() & DEVICE) != DEVICE) {
            RXMESH_ERROR(
                "ReduceHandle::norm2() input attribute should be "
                "allocated on the device");
        }

        detail::norm2_kernel<T, m_block_threads>
            <<<m_num_patches, m_block_threads, 0, stream>>>(
                attr,
                attr.m_d_element_per_patch,
                m_num_patches,
                attr.get_num_attributes(),
                m_d_reduce_1st_stage);
        CUDA_ERROR(cudaGetLastError());

        return std::sqrt(reduce_2nd_stage(stream));
    }


   private:
    /**
     * @brief allocate the per-patch partials buffer, the single-value output
     * buffer, and the CUB temp storage (sized by a null-storage dry-run call,
     * per the cub::DeviceReduce protocol)
     */
    void alloc_internal()
    {
        if (m_num_patches == 0) {
            return;
        }

        CUDA_ERROR(
            cudaMalloc(&m_d_reduce_1st_stage, m_num_patches * sizeof(T)));

        CUDA_ERROR(cudaMalloc(&m_d_reduce_2nd_stage, sizeof(T)));

        // First call with null temp storage only computes the required bytes
        m_d_reduce_temp_storage = NULL;
        cub::DeviceReduce::Sum(m_d_reduce_temp_storage,
                               m_reduce_temp_storage_bytes,
                               m_d_reduce_1st_stage,
                               m_d_reduce_2nd_stage,
                               m_num_patches);

        CUDA_ERROR(
            cudaMalloc(&m_d_reduce_temp_storage, m_reduce_temp_storage_bytes));
    }

    /**
     * @brief sum the per-patch partial results into a single value and copy
     * it back to the host; synchronizes the stream before returning
     */
    T reduce_2nd_stage(cudaStream_t stream)
    {
        T h_output = 0;

        cub::DeviceReduce::Sum(m_d_reduce_temp_storage,
                               m_reduce_temp_storage_bytes,
                               m_d_reduce_1st_stage,
                               m_d_reduce_2nd_stage,
                               m_num_patches,
                               stream);

        CUDA_ERROR(cudaMemcpyAsync(&h_output,
                                   m_d_reduce_2nd_stage,
                                   sizeof(T),
                                   cudaMemcpyDeviceToHost,
                                   stream));
        CUDA_ERROR(cudaStreamSynchronize(stream));

        return h_output;
    }

    // Block size used for the first-stage (per-patch) reduction kernels
    static constexpr uint32_t m_block_threads = 256;

    size_t   m_reduce_temp_storage_bytes = 0;
    T*       m_d_reduce_1st_stage        = nullptr;
    T*       m_d_reduce_2nd_stage        = nullptr;
    void*    m_d_reduce_temp_storage     = nullptr;
    uint32_t m_num_patches               = 0;
};
}  // namespace rxmesh
m_max_face_adjacent_faces(0), - m_face_degree(3), m_num_patches(0), m_is_input_edge_manifold(true), - m_is_input_closed(true), m_is_sort(sort), m_quite(quite), - m_max_vertices_per_patch(0), m_max_edges_per_patch(0), - m_max_faces_per_patch(0), m_d_patches_ltog_v(nullptr), - m_d_patches_ltog_e(nullptr), m_d_patches_ltog_f(nullptr), - m_d_ad_size_ltog_v(nullptr), m_d_ad_size_ltog_e(nullptr), - m_d_ad_size_ltog_f(nullptr), m_d_patches_edges(nullptr), - m_d_patches_faces(nullptr), m_d_patch_distribution_v(nullptr), - m_d_patch_distribution_e(nullptr), m_d_patch_distribution_f(nullptr), - m_d_ad_size(nullptr), m_d_neighbour_patches(nullptr), - m_d_neighbour_patches_offset(nullptr) +#include "rxmesh/context.h" +#include "rxmesh/rxmesh.h" +#include "rxmesh/util/util.h" + +namespace rxmesh { +RXMesh::RXMesh(const std::vector>& fv, const bool quite) + : m_num_edges(0), + m_num_faces(0), + m_num_vertices(0), + m_max_valence(0), + m_max_edge_incident_faces(0), + m_max_face_adjacent_faces(0), + m_max_vertices_per_patch(0), + m_max_edges_per_patch(0), + m_max_faces_per_patch(0), + m_max_not_owned_vertices(0), + m_max_not_owned_edges(0), + m_max_not_owned_faces(0), + m_num_patches(0), + m_patch_size(512), + m_is_input_edge_manifold(true), + m_is_input_closed(true), + m_quite(quite), + m_d_patches_info(nullptr), + m_h_patches_info(nullptr) { // Build everything from scratch including patches - build_local(fv, coordinates); - device_alloc_local(); -} - -template -RXMesh::~RXMesh() -{ - GPU_FREE(m_d_patches_ltog_v); - GPU_FREE(m_d_patches_ltog_e); - GPU_FREE(m_d_patches_ltog_f); - GPU_FREE(m_d_patches_edges); - GPU_FREE(m_d_patches_faces); - GPU_FREE(m_d_ad_size_ltog_v); - GPU_FREE(m_d_ad_size_ltog_e); - GPU_FREE(m_d_ad_size_ltog_f); - GPU_FREE(m_d_ad_size); - GPU_FREE(m_d_patch_distribution_v); - GPU_FREE(m_d_patch_distribution_e); - GPU_FREE(m_d_patch_distribution_f); - GPU_FREE(m_d_vertex_patch); - GPU_FREE(m_d_edge_patch); - GPU_FREE(m_d_face_patch); - 
GPU_FREE(m_d_neighbour_patches); - GPU_FREE(m_d_neighbour_patches_offset); -}; -//************************************************************************** - - -//********************** Builders -template -void RXMesh::build_local( - std::vector>& fv, - std::vector>& coordinates) -{ - // we build everything here from scratch - // 1) set num vertices - // 2) populate edge_map - // 3) for each edge, store a list of faces that are incident to that edge - // 4) copy fv to m_fvn and append the adjacent faces for each face using - // info from 3) - // 5) patch the mesh - // 6) populate the local mesh - - //=========== 1) - m_num_faces = static_cast(fv.size()); - set_num_vertices(fv); - //=============================== - - - //=========== 2) - populate_edge_map(fv); - m_num_edges = static_cast(m_edges_map.size()); - //=============================== - - - //=========== 3) - std::vector> ef; - edge_incident_faces(fv, ef); - // caching mesh type; edge manifold, closed - for (uint32_t e = 0; e < ef.size(); ++e) { - if (ef[e].size() < 2) { - m_is_input_closed = false; - } - if (ef[e].size() > 2) { - m_is_input_edge_manifold = false; - } - } - //=============================== - - - //=========== 4) - // copy fv - std::vector> rep(fv); - rep.swap(m_fvn); - // extend m_fvn by adding the face neighbors - for (uint32_t e = 0; e < ef.size(); ++e) { - assert(ef[e].size() != 0); // we don't handle dangling edges - - for (uint32_t f = 0; f < ef[e].size(); ++f) { - uint32_t f0 = ef[e][f]; - for (uint32_t s = f + 1; s < ef[e].size(); ++s) { - uint32_t f1 = ef[e][s]; - m_fvn[f0].push_back(f1); - m_fvn[f1].push_back(f0); - } - } - } - //=============================== - - - //=========== 5) - // create an instance of Patcher and execute it and then move the - // ownership to m_patcher - std::unique_ptr pp = std::make_unique( - patchSize, m_fvn, m_num_vertices, m_num_edges, true, m_quite); - pp->execute( - [this](uint32_t v0, uint32_t v1) { return this->get_edge_id(v0, v1); }, - ef); - 
- m_patcher = std::move(pp); - m_num_patches = m_patcher->get_num_patches(); - // m_patcher->export_patches(Verts); - //=============================== - - //=========== 5.5) - // sort indices based on patches - if (m_is_sort) { - sort(fv, coordinates); - } - //=============================== - - //=========== 6) - m_max_size.x = m_max_size.y = 0; - m_h_owned_size.resize(m_num_patches); - for (uint32_t p = 0; p < m_num_patches; ++p) { - build_patch_locally(p); - m_max_size.x = static_cast( - std::max(size_t(m_max_size.x), m_h_patches_edges[p].size())); - m_max_size.y = static_cast( - std::max(size_t(m_max_size.y), m_h_patches_faces[p].size())); - } - - m_max_size.x = round_up_multiple(m_max_size.x, 32u); - m_max_size.y = round_up_multiple(m_max_size.y, 32u); - - m_max_vertices_per_patch = 0; - m_max_edges_per_patch = 0; - m_max_faces_per_patch = 0; - m_max_owned_vertices_per_patch = 0; - m_max_owned_edges_per_patch = 0; - m_max_owned_faces_per_patch = 0; - for (uint32_t p = 0; p < m_num_patches; ++p) { - m_max_vertices_per_patch = std::max( - m_max_vertices_per_patch, uint32_t(m_h_patches_ltog_v[p].size())); - m_max_edges_per_patch = std::max( - m_max_edges_per_patch, uint32_t(m_h_patches_ltog_e[p].size())); - m_max_faces_per_patch = std::max( - m_max_faces_per_patch, uint32_t(m_h_patches_ltog_f[p].size())); - - m_max_owned_faces_per_patch = - std::max(m_max_owned_faces_per_patch, m_h_owned_size[p].x); - m_max_owned_edges_per_patch = - std::max(m_max_owned_edges_per_patch, m_h_owned_size[p].y); - m_max_owned_vertices_per_patch = - std::max(m_max_owned_vertices_per_patch, m_h_owned_size[p].z); - } - - // scanned histogram of element count in patches - m_h_patch_distribution_v.resize(m_num_patches + 1, 0); - m_h_patch_distribution_e.resize(m_num_patches + 1, 0); - m_h_patch_distribution_f.resize(m_num_patches + 1, 0); - - for (uint32_t v = 0; v < m_num_vertices; ++v) { - uint32_t patch = m_patcher->get_vertex_patch_id(v); - if (patch != INVALID32) { - 
m_h_patch_distribution_v[patch]++; - } - } - for (uint32_t f = 0; f < m_num_faces; ++f) { - uint32_t patch = m_patcher->get_face_patch_id(f); - if (patch != INVALID32) { - m_h_patch_distribution_f[patch]++; - } - } - for (uint32_t e = 0; e < m_num_edges; ++e) { - uint32_t patch = m_patcher->get_edge_patch_id(e); - if (patch != INVALID32) { - m_h_patch_distribution_e[patch]++; - } + if (fv.empty()) { + RXMESH_ERROR( + "RXMesh::RXMesh input fv is empty. Can not be build RXMesh " + "properly"); } - auto ex_scan = [](std::vector& vv) { - uint32_t dd = 0; - for (uint32_t i = 1; i < vv.size(); ++i) { - uint32_t temp = vv[i]; - vv[i] = dd + vv[i - 1]; - dd = temp; - } - vv[0] = 0; - }; + build(fv); + build_device(); + calc_max_not_owned_elements(); - ex_scan(m_h_patch_distribution_v); - ex_scan(m_h_patch_distribution_e); - ex_scan(m_h_patch_distribution_f); + // Allocate and copy the context to the gpu + m_rxmesh_context.init(m_num_edges, + m_num_faces, + m_num_vertices, + m_num_patches, + m_d_patches_info); if (!m_quite) { - RXMESH_TRACE("#Vertices = {}, #Faces= {}, #Edges= {}", m_num_vertices, - m_num_faces, m_num_edges); + RXMESH_TRACE("#Vertices = {}, #Faces= {}, #Edges= {}", + m_num_vertices, + m_num_faces, + m_num_edges); RXMESH_TRACE("Input is {} edge manifold", ((m_is_input_edge_manifold) ? "" : " Not")); RXMESH_TRACE("Input is {} closed", ((m_is_input_closed) ? 
"" : " Not")); RXMESH_TRACE("max valence = {}", m_max_valence); RXMESH_TRACE("max edge incident faces = {}", m_max_edge_incident_faces); RXMESH_TRACE("max face adjacent faces = {}", m_max_face_adjacent_faces); - RXMESH_TRACE("per-patch maximum edges references= {}", m_max_size.x); - RXMESH_TRACE("per-patch maximum faces references= {}", m_max_size.y); - RXMESH_TRACE("per-patch maximum face count (owned)= {} ({})", - m_max_faces_per_patch, m_max_owned_faces_per_patch); - RXMESH_TRACE("per-patch maximum edge count (owned) = {} ({})", - m_max_edges_per_patch, m_max_owned_edges_per_patch); - RXMESH_TRACE("per-patch maximum vertex count (owned)= {} ({})", - m_max_vertices_per_patch, m_max_owned_vertices_per_patch); + RXMESH_TRACE("per-patch maximum face count = {}", + m_max_faces_per_patch); + RXMESH_TRACE("per-patch maximum edge count = {}", + m_max_edges_per_patch); + RXMESH_TRACE("per-patch maximum vertex count = {}", + m_max_vertices_per_patch); + RXMESH_TRACE("per-patch maximum not-owned face count = {}", + m_max_not_owned_faces); + RXMESH_TRACE("per-patch maximum not-owned edge count = {}", + m_max_not_owned_edges); + RXMESH_TRACE("per-patch maximum not-owned vertex count = {}", + m_max_not_owned_vertices); } - //=============================== - - m_max_ele_count = std::max(m_num_edges, m_num_faces); - m_max_ele_count = std::max(m_num_vertices, m_max_ele_count); } -template -void RXMesh::build_patch_locally(const uint32_t patch_id) +RXMesh::~RXMesh() { - // Build the patch in local index space - // This is the two small matrices defining incident relation between - // edge-vertices and faces-edges along with the mapping from local to - // global space for vertices, edge, and faces - - // We we create a new patch, we make sure that the elements owned by the - // patch will have local indices lower than any elements (of the same type) - // that is not owned by the patch - const uint32_t *p_val(m_patcher->get_patches_val()), - 
RXMesh::~RXMesh()
{
    // First free the host-side not-owned tables; these were allocated with
    // plain malloc, so they must go through free(), not GPU_FREE
    for (uint32_t p = 0; p < m_num_patches; ++p) {
        free(m_h_patches_info[p].not_owned_patch_v);
        free(m_h_patches_info[p].not_owned_patch_e);
        free(m_h_patches_info[p].not_owned_patch_f);
        free(m_h_patches_info[p].not_owned_id_v);
        free(m_h_patches_info[p].not_owned_id_e);
        free(m_h_patches_info[p].not_owned_id_f);
    }
    // m_d_patches_info is a pointer to pointer(s) which we can not dereference
    // on the host so we copy these pointers to the host by re-using
    // m_h_patches_info and then free the memory these pointers are pointing to.
    // Finally, we free the parent pointer memory

    CUDA_ERROR(cudaMemcpy(m_h_patches_info,
                          m_d_patches_info,
                          m_num_patches * sizeof(PatchInfo),
                          cudaMemcpyDeviceToHost));

    // After the copy, m_h_patches_info[p] holds *device* pointers, hence
    // GPU_FREE here (unlike the free() loop above)
    for (uint32_t p = 0; p < m_num_patches; ++p) {
        GPU_FREE(m_h_patches_info[p].not_owned_patch_v);
        GPU_FREE(m_h_patches_info[p].not_owned_patch_e);
        GPU_FREE(m_h_patches_info[p].not_owned_patch_f);
        GPU_FREE(m_h_patches_info[p].not_owned_id_v);
        GPU_FREE(m_h_patches_info[p].not_owned_id_e);
        GPU_FREE(m_h_patches_info[p].not_owned_id_f);
        GPU_FREE(m_h_patches_info[p].ev);
        GPU_FREE(m_h_patches_info[p].fe);
    }
    GPU_FREE(m_d_patches_info);
    free(m_h_patches_info);
}
uint16_t num_edges_owned, - const uint16_t num_vertices_owned, - std::vector& f_ltog, - std::vector& e_ltog, - std::vector& v_ltog, - std::vector& fp, - std::vector& ep) +void RXMesh::build(const std::vector>& fv) { + std::vector ff_values; + std::vector ff_offset; + std::vector> ef; + build_supporting_structures(fv, ef, ff_offset, ff_values); - uint16_t local_f = faces_count++; - f_ltog[local_f] = global_f; - - // shift to left and set first bit to 1 if global_f's patch is this patch - f_ltog[local_f] = f_ltog[local_f] << 1; - f_ltog[local_f] = - f_ltog[local_f] | (m_patcher->get_face_patch_id(global_f) == patch_id); - - auto find_increment_index = - [&patch_id](const uint32_t& global, std::vector& vect, - uint16_t& owned_count, uint16_t& not_owned_count, - const uint16_t num_owned, bool& incremented, - const uint32_t ele_patch) -> uint16_t { - incremented = true; - - for (uint16_t id = 0; id < owned_count; ++id) { - if (global == (vect[id] >> 1)) { - incremented = false; - return id; - } - } - - for (uint16_t id = num_owned; id < num_owned + not_owned_count; ++id) { - if (global == (vect[id] >> 1)) { - incremented = false; - return id; - } - } - uint32_t to_store = (global << 1); - uint16_t ret_id; - if (ele_patch == patch_id) { - to_store = to_store | 1; - ret_id = owned_count++; - } else { - ret_id = num_owned + (not_owned_count++); - } - vect[ret_id] = to_store; - return ret_id; - }; - - for (uint32_t j = 0; j < m_face_degree; j++) { - - // find the edge global id - uint32_t global_v0 = fv[j]; - uint32_t global_v1 = fv[(j + 1) % m_face_degree]; - - // find the edge in m_edge_map with v0,v1 - std::pair my_edge = edge_key(global_v0, global_v1); + m_patcher = std::make_unique(m_patch_size, + ff_offset, + ff_values, + fv, + m_edges_map, + m_num_vertices, + m_num_edges, + m_quite); - assert(my_edge.first == global_v0 || my_edge.first == global_v1); - assert(my_edge.second == global_v0 || my_edge.second == global_v1); + m_num_patches = m_patcher->get_num_patches(); 
- int dir = 1; - if (my_edge.first == global_v0 && my_edge.second == global_v1) { - dir = 0; - } + m_h_patches_ltog_f.resize(m_num_patches); + m_h_patches_ltog_e.resize(m_num_patches); + m_h_patches_ltog_v.resize(m_num_patches); + m_h_num_owned_f.resize(m_num_patches); + m_h_num_owned_v.resize(m_num_patches); + m_h_num_owned_e.resize(m_num_patches); + m_h_patches_fe.resize(m_num_patches); + m_h_patches_ev.resize(m_num_patches); - uint32_t global_e = get_edge_id(my_edge); - - // convert edge to local index by searching for it. if not - // found, then increment the number of local edges - bool new_e(false); - uint16_t local_e = find_increment_index( - global_e, e_ltog, edges_owned_count, edges_not_owned_count, - num_edges_owned, new_e, m_patcher->get_edge_patch_id(global_e)); - - if (new_e) { - // if it is new edges, then we need to either look for - // its vertices. if there were inserted before in the - // patch, then retrieve their local id. otherwise, we - // new vertices to the patch - assert(my_edge.first != my_edge.second); - - bool new_v(false); - uint16_t local_v0 = find_increment_index( - my_edge.first, v_ltog, vertices_owned_count, - vertices_not_owned_count, num_vertices_owned, new_v, - m_patcher->get_vertex_patch_id(my_edge.first)); - - uint16_t local_v1 = find_increment_index( - my_edge.second, v_ltog, vertices_owned_count, - vertices_not_owned_count, num_vertices_owned, new_v, - m_patcher->get_vertex_patch_id(my_edge.second)); - - assert(local_v0 != local_v1); - - // new edges are appended in the end of e_ltog - // and so as their vertices in ep - ep[2 * local_e] = local_v0; - ep[2 * local_e + 1] = local_v1; - } - // shift local_e to left - // set the first bit to 1 if (dir ==1) - local_e = local_e << 1; - local_e = local_e | (dir & 1); - fp[local_f * m_face_degree + j] = local_e; +#pragma omp parallel for + for (int p = 0; p < static_cast(m_num_patches); ++p) { + build_single_patch(fv, p); } - return local_f; + calc_statistics(fv, ef); } -template 
/**
 * @brief Single pass over the input faces that computes the vertex/edge/face
 * counts, fills m_edges_map (edge -> edge id), the edge-incident-faces list
 * ef, and the face-face adjacency as a CSR-like pair (ff_offset, ff_values).
 * Exits the process on non-triangular input.
 * @param fv input per-face vertex indices
 * @param ef output: for each edge id, the list of faces incident to it
 * @param ff_offset output: inclusive prefix sum of per-face adjacency counts
 * @param ff_values output: flattened face-face adjacency values
 */
void RXMesh::build_supporting_structures(
    const std::vector<std::vector<uint32_t>>& fv,
    std::vector<std::vector<uint32_t>>&       ef,
    std::vector<uint32_t>&                    ff_offset,
    std::vector<uint32_t>&                    ff_values)
{
    m_num_faces    = static_cast<uint32_t>(fv.size());
    m_num_vertices = 0;
    m_num_edges    = 0;
    m_edges_map.clear();

    // assuming manifold mesh i.e., #E = 1.5#F
    ef.clear();
    uint32_t reserve_size =
        static_cast<uint32_t>(1.5f * static_cast<float>(m_num_faces));
    ef.reserve(reserve_size);
    m_edges_map.reserve(reserve_size);

    // Number of adjacent faces accumulated per face; reused later as a
    // per-face write cursor when scattering into ff_values
    std::vector<uint32_t> ff_size(m_num_faces, 0);

    for (uint32_t f = 0; f < fv.size(); ++f) {
        if (fv[f].size() != 3) {
            RXMESH_ERROR(
                "rxmesh::build_supporting_structures() Face {} is not "
                "triangle. Non-triangular faces are not supported",
                f);
            exit(EXIT_FAILURE);
        }

        for (uint32_t v = 0; v < fv[f].size(); ++v) {
            uint32_t v0 = fv[f][v];
            uint32_t v1 = fv[f][(v + 1) % 3];

            // Every vertex appears as v0 of some half-edge, so tracking v0
            // alone is enough to find the max vertex index
            m_num_vertices = std::max(m_num_vertices, v0);

            std::pair<uint32_t, uint32_t> edge = detail::edge_key(v0, v1);
            auto                          e_iter = m_edges_map.find(edge);
            if (e_iter == m_edges_map.end()) {
                // First time we see this edge: assign a new id and start its
                // incident-face list with the current face
                uint32_t edge_id = m_num_edges++;
                m_edges_map.insert(std::make_pair(edge, edge_id));
                std::vector<uint32_t> tmp(1, f);
                ef.push_back(tmp);
            } else {
                uint32_t edge_id = (*e_iter).second;

                // Every face already on this edge gains f as a neighbor, and
                // f gains all of them — counted BEFORE f is appended so f is
                // not paired with itself
                for (uint32_t f0 = 0; f0 < ef[edge_id].size(); ++f0) {
                    uint32_t other_face = ef[edge_id][f0];
                    ++ff_size[other_face];
                }
                ff_size[f] += ef[edge_id].size();

                ef[edge_id].push_back(f);
            }
        }
    }
    // m_num_vertices held the max index; convert to a count
    ++m_num_vertices;

    if (m_num_edges != static_cast<uint32_t>(m_edges_map.size())) {
        RXMESH_ERROR(
            "rxmesh::build_supporting_structures() m_num_edges ({}) should "
            "match the size of edge_map ({})",
            m_num_edges,
            m_edges_map.size());
        exit(EXIT_FAILURE);
    }

    // Inclusive scan, so face f's slots in ff_values start at
    // ff_offset[f - 1] (or 0 for f == 0)
    ff_offset.resize(m_num_faces);
    std::inclusive_scan(ff_size.begin(), ff_size.end(), ff_offset.begin());
    ff_values.clear();
    ff_values.resize(ff_offset.back());
    // Reset ff_size so it can act as the per-face write cursor below
    std::fill(ff_size.begin(), ff_size.end(), 0);

    // Scatter each adjacent pair (f0, f1) sharing an edge into both faces'
    // ranges of ff_values
    for (uint32_t e = 0; e < m_num_edges; ++e) {
        for (uint32_t i = 0; i < ef[e].size(); ++i) {
            uint32_t f0 = ef[e][i];
            for (uint32_t j = i + 1; j < ef[e].size(); ++j) {
                uint32_t f1 = ef[e][j];

                uint32_t f0_offset = ff_size[f0]++;
                uint32_t f1_offset = ff_size[f1]++;
                f0_offset += (f0 == 0) ? 0 : ff_offset[f0 - 1];
                f1_offset += (f1 == 0) ? 0 : ff_offset[f1 - 1];

                ff_values[f0_offset] = f1;
                ff_values[f1_offset] = f0;
            }
        }
    }
}
/**
 * @brief Compute mesh statistics: max vertex valence, max faces incident to
 * an edge, max faces adjacent to a face, closed/edge-manifold flags, and the
 * per-patch maximum vertex/edge/face counts. Must run after the edge map,
 * ef, and the per-patch local-to-global maps are built.
 * @param fv input per-face vertex indices
 * @param ef per-edge list of incident faces (from
 * build_supporting_structures)
 */
void RXMesh::calc_statistics(const std::vector<std::vector<uint32_t>>& fv,
                             const std::vector<std::vector<uint32_t>>& ef)
{
    if (m_num_vertices == 0 || m_num_faces == 0 || m_num_edges == 0 ||
        fv.size() == 0 || ef.size() == 0) {
        RXMESH_ERROR(
            "RXMesh::calc_statistics() input mesh has not been initialized");
        exit(EXIT_FAILURE);
    }

    // calc max valence, max ef, is input closed, and is input manifold
    m_max_edge_incident_faces = 0;
    m_max_valence             = 0;
    std::vector<uint32_t> vv_count(m_num_vertices, 0);
    m_is_input_closed        = true;
    m_is_input_edge_manifold = true;
    // Each map entry is one unique edge, so bumping both endpoints once per
    // entry yields the vertex valence
    for (auto& e_iter : m_edges_map) {
        uint32_t v0 = e_iter.first.first;
        uint32_t v1 = e_iter.first.second;

        vv_count[v0]++;
        vv_count[v1]++;

        m_max_valence = std::max(m_max_valence, vv_count[v0]);
        m_max_valence = std::max(m_max_valence, vv_count[v1]);

        uint32_t edge_id = e_iter.second;
        m_max_edge_incident_faces =
            std::max(m_max_edge_incident_faces, uint32_t(ef[edge_id].size()));

        // A boundary edge (< 2 faces) means the mesh is open; more than 2
        // faces on one edge means it is not edge-manifold
        if (ef[edge_id].size() < 2) {
            m_is_input_closed = false;
        }
        if (ef[edge_id].size() > 2) {
            m_is_input_edge_manifold = false;
        }
    }

    // calc max ff
    m_max_face_adjacent_faces = 0;
    for (uint32_t f = 0; f < fv.size(); ++f) {
        uint32_t ff_count = 0;
        for (uint32_t v = 0; v < fv[f].size(); ++v) {
            uint32_t v0       = fv[f][v];
            uint32_t v1       = fv[f][(v + 1) % 3];
            uint32_t edge_num = get_edge_id(v0, v1);
            // All faces on this edge except f itself are adjacent to f
            ff_count += ef[edge_num].size() - 1;
        }
        m_max_face_adjacent_faces =
            std::max(ff_count, m_max_face_adjacent_faces);
    }

    // max number of vertices/edges/faces per patch
    m_max_vertices_per_patch = 0;
    m_max_edges_per_patch    = 0;
    m_max_faces_per_patch    = 0;
    for (uint32_t p = 0; p < m_num_patches; ++p) {
        m_max_vertices_per_patch = std::max(
            m_max_vertices_per_patch, uint32_t(m_h_patches_ltog_v[p].size()));
        m_max_edges_per_patch = std::max(
            m_max_edges_per_patch, uint32_t(m_h_patches_ltog_e[p].size()));
        m_max_faces_per_patch = std::max(
            m_max_faces_per_patch, uint32_t(m_h_patches_ltog_f[p].size()));
    }
}
we return the edge - // id in global space also (by querying m_edges_map) - assert(m_edges_map.size() != 0); - - std::pair edge = edge_key(v0, v1); - - assert(edge.first == v0 || edge.first == v1); - assert(edge.second == v0 || edge.second == v1); - - return get_edge_id(edge); + m_max_not_owned_vertices = 0; + m_max_not_owned_edges = 0; + m_max_not_owned_faces = 0; + + for (int p = 0; p < static_cast(m_num_patches); ++p) { + m_max_not_owned_vertices = + std::max(m_max_not_owned_vertices, + uint32_t(m_h_patches_info[p].num_vertices - + m_h_patches_info[p].num_owned_vertices)); + + m_max_not_owned_edges = + std::max(m_max_not_owned_edges, + uint32_t(m_h_patches_info[p].num_edges - + m_h_patches_info[p].num_owned_edges)); + + m_max_not_owned_faces = + std::max(m_max_not_owned_faces, + uint32_t(m_h_patches_info[p].num_faces - + m_h_patches_info[p].num_owned_faces)); + } } -template -uint32_t RXMesh::get_edge_id( - const std::pair& edge) const +void RXMesh::build_single_patch(const std::vector>& fv, + const uint32_t patch_id) { - uint32_t edge_id = -1; - try { - edge_id = m_edges_map.at(edge); - } catch (const std::out_of_range&) { - RXMESH_ERROR( - "RXMesh::get_edge_id() mapping edges went wrong." - " Can not find an edge connecting vertices {} and {}", - edge.first, edge.second); - } + // Build the patch local index space + // This is the two small matrices defining incident relation between + // edge-vertices and faces-edges (i.e., the topology) along with the mapping + // from local to global space for vertices, edge, and faces - return edge_id; + // When we create a new patch, we make sure that the elements owned by the + // patch will have local indices lower than any other elements (of the same + // type) that is not owned by the patch. 
/**
 * @brief Build one patch's local index space and topology.
 * @param fv input per-face vertex indices (global index space)
 * @param patch_id the patch to build
 */
void RXMesh::build_single_patch(const std::vector<std::vector<uint32_t>>& fv,
                                const uint32_t patch_id)
{
    // Build the patch local index space
    // This is the two small matrices defining incident relation between
    // edge-vertices and faces-edges (i.e., the topology) along with the
    // mapping from local to global space for vertices, edge, and faces

    // When we create a new patch, we make sure that the elements owned by the
    // patch will have local indices lower than any other elements (of the
    // same type) that is not owned by the patch.

    // First the local-to-global maps for this patch...
    build_single_patch_ltog(fv, patch_id);

    // ...then the local EV/FE topology expressed in those local indices
    build_single_patch_topology(fv, patch_id);
}
0 : patches_offset[p - 1]; - uint32_t p_end = patches_offset[p]; - // first loop over p's faces and assigned its faces new id - for (uint32_t f = p_start; f < p_end; ++f) { - uint32_t face = patches_val[f]; - new_face_id[face] = face_counter++; - - // assign face's vertices new id - for (uint32_t v = 0; v < 3; ++v) { - uint32_t vertex = m_fvn[face][v]; - // if the vertex is owned by this patch - if (m_patcher->get_vertex_patch_id(vertex) == p && - new_vertex_id[vertex] == INVALID32) { - new_vertex_id[vertex] = vertex_counter++; - } - } + const uint32_t total_patch_num_faces = + (p_end - p_start) + (r_end - r_start); + m_h_patches_ltog_f[patch_id].resize(total_patch_num_faces); + m_h_patches_ltog_v[patch_id].resize(3 * total_patch_num_faces); + m_h_patches_ltog_e[patch_id].resize(3 * total_patch_num_faces); + auto add_new_face = [&](uint32_t global_face_id, uint16_t local_face_id) { + m_h_patches_ltog_f[patch_id][local_face_id] = global_face_id; - // assign face's edge new id - uint32_t v1 = 2; - for (uint32_t v0 = 0; v0 < 3; ++v0) { - uint32_t vertex0 = m_fvn[face][v0]; - uint32_t vertex1 = m_fvn[face][v1]; - uint32_t edge = get_edge_id(vertex0, vertex1); + for (uint32_t v = 0; v < 3; ++v) { + uint32_t v0 = fv[global_face_id][v]; + uint32_t v1 = fv[global_face_id][(v + 1) % 3]; - // if the edge is owned by this patch - if (m_patcher->get_edge_patch_id(edge) == p && - new_edge_id[edge] == INVALID32) { - new_edge_id[edge] = edge_counter++; - } - v1 = v0; - } - } + uint32_t edge_id = get_edge_id(v0, v1); - // second loop over p's ribbon and push new patches into the queue - // only if there are not in the queue and the have not been - // processed yet. - uint32_t ribbon_start = - (p == 0) ? 
0 : m_patcher->get_external_ribbon_offset()[p - 1]; - uint32_t ribbon_end = m_patcher->get_external_ribbon_offset()[p]; - for (uint32_t f = ribbon_start; f < ribbon_end; ++f) { - // this is a face in the ribbon - uint32_t face = m_patcher->get_external_ribbon_val()[f]; - // get the face actual patch - uint32_t face_patch = m_patcher->get_face_patch_id(face); - assert(face_patch != p); - if (patch_status[face_patch] == 0) { - patch_queue.push(face_patch); - patch_status[face_patch] = 1; - } - } - } - } - if (edge_counter != m_num_edges || vertex_counter != m_num_vertices || - face_counter != m_num_faces) { - RXMESH_ERROR("RXMesh::sort Error in assigning new IDs"); - } - //**** Apply changes - m_max_valence_vertex_id = new_vertex_id[m_max_valence_vertex_id]; - // coordinates - { - std::vector> coord_ordered(coordinates); - for (uint32_t v = 0; v < m_num_vertices; ++v) { - uint32_t new_v_id = new_vertex_id[v]; - coord_ordered[new_v_id][0] = coordinates[v][0]; - coord_ordered[new_v_id][1] = coordinates[v][1]; - coord_ordered[new_v_id][2] = coordinates[v][2]; - } - coordinates.swap(coord_ordered); - } + m_h_patches_ltog_v[patch_id][local_face_id * 3 + v] = v0; - // edge map - { - std::unordered_map, uint32_t, - edge_key_hash> - edges_map; - edges_map.reserve(m_num_faces * 3); - for (auto& it : m_edges_map) { - uint32_t v0 = new_vertex_id[it.first.first]; - uint32_t v1 = new_vertex_id[it.first.second]; - uint32_t edge_id = new_edge_id[it.second]; - - std::pair my_edge = edge_key(v0, v1); - - typename std::unordered_map, uint32_t, - edge_key_hash>::const_iterator e_it = - edges_map.find(my_edge); - - if (e_it == edges_map.end()) { - edges_map.insert(std::make_pair(my_edge, edge_id)); - } else { - RXMESH_ERROR("RXMesh::sort Unknown error"); - } + m_h_patches_ltog_e[patch_id][local_face_id * 3 + v] = edge_id; } - m_edges_map.swap(edges_map); - } + }; - // m_fvn - { - std::vector> fvn(m_fvn); - for (uint32_t f = 0; f < m_fvn.size(); ++f) { - uint32_t new_f_id = 
new_face_id[f]; - fvn[new_f_id].resize(3); - // v - fvn[new_f_id][0] = new_vertex_id[m_fvn[f][0]]; - fvn[new_f_id][1] = new_vertex_id[m_fvn[f][1]]; - fvn[new_f_id][2] = new_vertex_id[m_fvn[f][2]]; - - fv[new_f_id][0] = fvn[new_f_id][0]; - fv[new_f_id][1] = fvn[new_f_id][1]; - fv[new_f_id][2] = fvn[new_f_id][2]; - - // n - for (uint32_t n = 3; n < m_fvn[f].size(); ++n) { - fvn[new_f_id].push_back(new_face_id[m_fvn[f][n]]); - } - } - m_fvn.swap(fvn); + uint16_t local_face_id = 0; + for (uint32_t f = p_start; f < p_end; ++f) { + uint32_t face_id = m_patcher->get_patches_val()[f]; + add_new_face(face_id, local_face_id++); } - // patcher - { - uint32_t* patch_val = m_patcher->get_patches_val(); - for (uint32_t i = 0; i < m_num_faces; ++i) { - patch_val[i] = new_face_id[patch_val[i]]; - } - - uint32_t num_ext_ribbon_faces = - m_patcher->get_external_ribbon_offset()[m_num_patches - 1]; - for (uint32_t i = 0; i < num_ext_ribbon_faces; ++i) { - m_patcher->get_external_ribbon_val()[i] = - new_face_id[m_patcher->get_external_ribbon_val()[i]]; - } + for (uint32_t f = r_start; f < r_end; ++f) { + uint32_t face_id = m_patcher->get_external_ribbon_val()[f]; + add_new_face(face_id, local_face_id++); + } - { - std::vector face_patch(m_num_faces); - for (uint32_t f = 0; f < m_num_faces; ++f) { - uint32_t new_f_id = new_face_id[f]; - face_patch[new_f_id] = m_patcher->get_face_patch_id(f); - } - std::memcpy(m_patcher->get_face_patch().data(), face_patch.data(), - m_num_faces * sizeof(uint32_t)); - } - { + auto create_unique_mapping = [&](std::vector& ltog_map, + const std::vector& patch) { + std::sort(ltog_map.begin(), ltog_map.end()); + auto unique_end = std::unique(ltog_map.begin(), ltog_map.end()); + ltog_map.resize(unique_end - ltog_map.begin()); - std::vector vertex_patch(m_num_vertices); - for (uint32_t v = 0; v < m_num_vertices; ++v) { - uint32_t new_v_id = new_vertex_id[v]; - vertex_patch[new_v_id] = m_patcher->get_vertex_patch_id(v); - } - 
std::memcpy(m_patcher->get_vertex_patch().data(), - vertex_patch.data(), m_num_vertices * sizeof(uint32_t)); - } + // we use stable partition since we want ltog to be sorted so we can + // use binary search on it when we populate the topology + auto part_end = std::stable_partition( + ltog_map.begin(), ltog_map.end(), [&patch, patch_id](uint32_t i) { + return patch[i] == patch_id; + }); + return static_cast(part_end - ltog_map.begin()); + }; - { - std::vector edge_patch(m_num_edges); - for (uint32_t e = 0; e < m_num_edges; ++e) { - uint32_t new_e_id = new_edge_id[e]; - edge_patch[new_e_id] = m_patcher->get_edge_patch_id(e); - } - std::memcpy(m_patcher->get_edge_patch().data(), edge_patch.data(), - m_num_edges * sizeof(uint32_t)); - } - } + m_h_num_owned_f[patch_id] = create_unique_mapping( + m_h_patches_ltog_f[patch_id], m_patcher->get_face_patch()); - /*m_patcher->export_patches(coordinates); + m_h_num_owned_e[patch_id] = create_unique_mapping( + m_h_patches_ltog_e[patch_id], m_patcher->get_edge_patch()); - std::vector vert_id(m_num_vertices); - std::vector face_id(m_num_faces); - fill_with_sequential_numbers(vert_id.data(), vert_id.size()); - fill_with_sequential_numbers(face_id.data(), face_id.size()); - export_attribute_VTK("sort_faces.vtk", m_fvn, coordinates, - true, face_id.data(), vert_id.data(), false); - export_attribute_VTK("sort_vertices.vtk", m_fvn, coordinates, - false, face_id.data(), vert_id.data(), false);*/ -} -//************************************************************************** - -//********************** Move to Device -template -template -void RXMesh::get_starting_ids( - const std::vector>& input, - std::vector& starting_id) -{ - // get the starting ids for the mesh elements in input and store it - // in the first (x) component of starting_id - - // uint32_t prv = 0; - assert(starting_id.size() > 0); - assert(starting_id.size() > input.size()); - starting_id[0].x = 0; - for (uint32_t p = 1; p <= input.size(); ++p) { - starting_id[p].x = 
starting_id[p - 1].x + input[p - 1].size(); - // starting_id[p].x = input[p].size() + prv; - // prv = starting_id[p].x; - } + m_h_num_owned_v[patch_id] = create_unique_mapping( + m_h_patches_ltog_v[patch_id], m_patcher->get_vertex_patch()); } -template -template -void RXMesh::get_size(const std::vector>& input, - std::vector& ad) +void RXMesh::build_single_patch_topology( + const std::vector>& fv, + const uint32_t patch_id) { - // get the size of each element of input and store it as the second(y) - // component in ad - assert(ad.size() >= input.size()); + // patch start and end + const uint32_t p_start = + (patch_id == 0) ? 0 : m_patcher->get_patches_offset()[patch_id - 1]; + const uint32_t p_end = m_patcher->get_patches_offset()[patch_id]; - for (uint32_t p = 0; p < input.size(); ++p) { - ad[p].y = input[p].size(); - } -} + // ribbon start and end + const uint32_t r_start = + (patch_id == 0) ? 0 : + m_patcher->get_external_ribbon_offset()[patch_id - 1]; + const uint32_t r_end = m_patcher->get_external_ribbon_offset()[patch_id]; -template -template -void RXMesh::padding_to_multiple(std::vector>& input, - const uint32_t multiple, - const T init_val) -{ - // resize each element on input to be mulitple of multiple by add - // init_val to the end - - for (uint32_t p = 0; p < input.size(); ++p) { - const uint32_t new_size = - round_up_multiple(uint32_t(input[p].size()), multiple); - assert(new_size >= input[p].size()); - input[p].resize(new_size, static_cast(init_val)); - } -} + const uint16_t patch_num_edges = m_h_patches_ltog_e[patch_id].size(); + const uint16_t patch_num_faces = m_h_patches_ltog_f[patch_id].size(); + + m_h_patches_ev[patch_id].resize(patch_num_edges * 2); + m_h_patches_fe[patch_id].resize(patch_num_faces * 3); + + std::vector is_added_edge(patch_num_edges, false); + + auto find_local_index = [&patch_id]( + const uint32_t global_id, + const uint32_t element_patch, + const uint16_t num_owned_elements, + const std::vector& ltog) -> uint16_t { + 
uint32_t start = 0; + uint32_t end = num_owned_elements; + if (element_patch != patch_id) { + start = num_owned_elements; + end = ltog.size(); + } + auto it = std::lower_bound( + ltog.begin() + start, ltog.begin() + end, global_id); + if (it == ltog.begin() + end) { + return INVALID16; + } else { + return static_cast(it - ltog.begin()); + } + }; -template -void RXMesh::device_alloc_local() -{ - // allocate and transfer patch information to device - // make sure to build_local first before calling this + auto add_new_face = [&](const uint32_t global_face_id) { + const uint16_t local_face_id = + find_local_index(global_face_id, + m_patcher->get_face_patch_id(global_face_id), + m_h_num_owned_f[patch_id], + m_h_patches_ltog_f[patch_id]); - // storing the start id(x) and element count(y) - m_h_ad_size_ltog_v.resize(m_num_patches + 1); - m_h_ad_size_ltog_e.resize(m_num_patches + 1); - m_h_ad_size_ltog_f.resize(m_num_patches + 1); - m_h_ad_size.resize(m_num_patches + 1); + for (uint32_t v = 0; v < 3; ++v) { - // get mesh element count per patch - get_size(m_h_patches_ltog_v, m_h_ad_size_ltog_v); - get_size(m_h_patches_ltog_e, m_h_ad_size_ltog_e); - get_size(m_h_patches_ltog_f, m_h_ad_size_ltog_f); - // how many edges and faces we have in each patch - for (uint32_t p = 0; p < m_num_patches; ++p) { - m_h_ad_size[p].y = m_h_ad_size_ltog_e[p].y * 2; // edges size - m_h_ad_size[p].w = - m_h_ad_size_ltog_f[p].y * m_face_degree; // faces size - } + const uint32_t global_v0 = fv[global_face_id][v]; + const uint32_t global_v1 = fv[global_face_id][(v + 1) % 3]; + std::pair edge_key = + detail::edge_key(global_v0, global_v1); - // increase to multiple so that each vector size is multiple of 32 - // so that when we copy it to the device, read will be coalesced - padding_to_multiple(m_h_patches_edges, WARPSIZE, - static_cast(INVALID16)); - padding_to_multiple(m_h_patches_faces, WARPSIZE, - static_cast(INVALID16)); - padding_to_multiple(m_h_patches_ltog_v, WARPSIZE, - 
static_cast(INVALID32)); - padding_to_multiple(m_h_patches_ltog_e, WARPSIZE, - static_cast(INVALID32)); - padding_to_multiple(m_h_patches_ltog_f, WARPSIZE, - static_cast(INVALID32)); - - // get the starting id of each patch - std::vector h_edges_ad(m_num_patches + 1), - h_faces_ad(m_num_patches + 1); - - get_starting_ids(m_h_patches_ltog_v, m_h_ad_size_ltog_v); - get_starting_ids(m_h_patches_ltog_e, m_h_ad_size_ltog_e); - get_starting_ids(m_h_patches_ltog_f, m_h_ad_size_ltog_f); - get_starting_ids(m_h_patches_edges, h_edges_ad); - get_starting_ids(m_h_patches_faces, h_faces_ad); - - // m_h_ad_size[0].x = m_h_ad_size[0].z = 0; - for (uint32_t p = 0; p <= m_num_patches; ++p) { - m_h_ad_size[p].x = h_edges_ad[p].x; // edges address - m_h_ad_size[p].z = h_faces_ad[p].x; // faces address - } + assert(edge_key.first == global_v0 || edge_key.first == global_v1); + assert(edge_key.second == global_v0 || + edge_key.second == global_v1); + int dir = 1; + if (edge_key.first == global_v0 && edge_key.second == global_v1) { + dir = 0; + } - // alloc mesh data - CUDA_ERROR(cudaMalloc((void**)&m_d_patches_ltog_v, - sizeof(uint32_t) * m_h_ad_size_ltog_v.back().x)); - CUDA_ERROR(cudaMalloc((void**)&m_d_patches_ltog_e, - sizeof(uint32_t) * m_h_ad_size_ltog_e.back().x)); - CUDA_ERROR(cudaMalloc((void**)&m_d_patches_ltog_f, - sizeof(uint32_t) * m_h_ad_size_ltog_f.back().x)); - CUDA_ERROR(cudaMalloc((void**)&m_d_patches_edges, - sizeof(uint16_t) * m_h_ad_size.back().x)); - CUDA_ERROR(cudaMalloc((void**)&m_d_patches_faces, - sizeof(uint16_t) * m_h_ad_size.back().z)); - if (!m_quite) { - uint32_t patch_local_storage = - sizeof(uint16_t) * (m_h_ad_size.back().x + m_h_ad_size.back().z) + - sizeof(uint32_t) * - (m_h_ad_size_ltog_v.back().x + m_h_ad_size_ltog_e.back().x + - m_h_ad_size_ltog_f.back().x); - uint32_t patch_membership_storage = - (m_num_faces + m_num_edges + m_num_vertices) * sizeof(uint32_t); - m_total_gpu_storage_mb = - double(patch_local_storage + patch_membership_storage) / - 
double(1024 * 1024); - RXMESH_TRACE("Total storage = {0:f} Mb", m_total_gpu_storage_mb); - } + const uint32_t global_edge_id = get_edge_id(edge_key); - // alloc ad_size_ltog and edges_/faces_ad - CUDA_ERROR(cudaMalloc((void**)&m_d_ad_size_ltog_v, - sizeof(uint2) * (m_num_patches + 1))); - CUDA_ERROR(cudaMalloc((void**)&m_d_ad_size_ltog_e, - sizeof(uint2) * (m_num_patches + 1))); - CUDA_ERROR(cudaMalloc((void**)&m_d_ad_size_ltog_f, - sizeof(uint2) * (m_num_patches + 1))); - CUDA_ERROR( - cudaMalloc((void**)&m_d_ad_size, sizeof(uint4) * (m_num_patches + 1))); + uint16_t local_edge_id = + find_local_index(global_edge_id, + m_patcher->get_edge_patch_id(global_edge_id), + m_h_num_owned_e[patch_id], + m_h_patches_ltog_e[patch_id]); - CUDA_ERROR(cudaMalloc((void**)&m_d_owned_size, - sizeof(uint4) * (m_num_patches + 1))); + assert(local_edge_id != INVALID16); + if (!is_added_edge[local_edge_id]) { + is_added_edge[local_edge_id] = true; - // copy the mesh data for each patch - for (uint32_t p = 0; p < m_num_patches; ++p) { - // m_d_ pointer are linear. 
The host containers are not but we can - // take advantage of pointer arthematic (w/ word offsetting) to get - // things work without copyt the host containers in a linear array - - uint32_t start_v = m_h_ad_size_ltog_v[p].x; - uint32_t start_e = m_h_ad_size_ltog_e[p].x; - uint32_t start_f = m_h_ad_size_ltog_f[p].x; - uint32_t start_edges = m_h_ad_size[p].x; - uint32_t start_faces = m_h_ad_size[p].z; - - // ltog - CUDA_ERROR(cudaMemcpy(m_d_patches_ltog_v + start_v, - m_h_patches_ltog_v[p].data(), - m_h_ad_size_ltog_v[p].y * sizeof(uint32_t), - cudaMemcpyHostToDevice)); + const uint16_t local_v0 = find_local_index( + edge_key.first, + m_patcher->get_vertex_patch_id(edge_key.first), + m_h_num_owned_v[patch_id], + m_h_patches_ltog_v[patch_id]); - CUDA_ERROR(cudaMemcpy(m_d_patches_ltog_e + start_e, - m_h_patches_ltog_e[p].data(), - m_h_ad_size_ltog_e[p].y * sizeof(uint32_t), - cudaMemcpyHostToDevice)); + const uint16_t local_v1 = find_local_index( + edge_key.second, + m_patcher->get_vertex_patch_id(edge_key.second), + m_h_num_owned_v[patch_id], + m_h_patches_ltog_v[patch_id]); - CUDA_ERROR(cudaMemcpy(m_d_patches_ltog_f + start_f, - m_h_patches_ltog_f[p].data(), - m_h_ad_size_ltog_f[p].y * sizeof(uint32_t), - cudaMemcpyHostToDevice)); + assert(local_v0 != INVALID16 && local_v1 != INVALID16); - // patches - CUDA_ERROR(cudaMemcpy(m_d_patches_edges + start_edges, - m_h_patches_edges[p].data(), - m_h_ad_size_ltog_e[p].y * 2 * sizeof(uint16_t), - cudaMemcpyHostToDevice)); + m_h_patches_ev[patch_id][local_edge_id * 2] = local_v0; + m_h_patches_ev[patch_id][local_edge_id * 2 + 1] = local_v1; + } - CUDA_ERROR(cudaMemcpy( - m_d_patches_faces + start_faces, m_h_patches_faces[p].data(), - m_h_ad_size_ltog_f[p].y * m_face_degree * sizeof(uint16_t), - cudaMemcpyHostToDevice)); - } + // shift local_e to left + // set the first bit to 1 if (dir ==1) + local_edge_id = local_edge_id << 1; + local_edge_id = local_edge_id | (dir & 1); + m_h_patches_fe[patch_id][local_face_id * 3 + v] = 
local_edge_id; + } + }; - // copy ad_size - CUDA_ERROR(cudaMemcpy(m_d_ad_size_ltog_v, m_h_ad_size_ltog_v.data(), - sizeof(uint2) * (m_num_patches + 1), - cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy(m_d_ad_size_ltog_e, m_h_ad_size_ltog_e.data(), - sizeof(uint2) * (m_num_patches + 1), - cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy(m_d_ad_size_ltog_f, m_h_ad_size_ltog_f.data(), - sizeof(uint2) * (m_num_patches + 1), - cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy(m_d_ad_size, m_h_ad_size.data(), - sizeof(uint4) * (m_num_patches + 1), - cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy(m_d_owned_size, m_h_owned_size.data(), - sizeof(uint4) * (m_num_patches), - cudaMemcpyHostToDevice)); - - - // allocate and copy face/vertex/edge patch - CUDA_ERROR( - cudaMalloc((void**)&m_d_face_patch, sizeof(uint32_t) * (m_num_faces))); - CUDA_ERROR( - cudaMalloc((void**)&m_d_edge_patch, sizeof(uint32_t) * (m_num_edges))); - CUDA_ERROR(cudaMalloc((void**)&m_d_vertex_patch, - sizeof(uint32_t) * (m_num_vertices))); - - CUDA_ERROR( - cudaMemcpy(m_d_face_patch, this->m_patcher->get_face_patch().data(), - sizeof(uint32_t) * (m_num_faces), cudaMemcpyHostToDevice)); - CUDA_ERROR( - cudaMemcpy(m_d_edge_patch, this->m_patcher->get_edge_patch().data(), - sizeof(uint32_t) * (m_num_edges), cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy( - m_d_vertex_patch, this->m_patcher->get_vertex_patch().data(), - sizeof(uint32_t) * (m_num_vertices), cudaMemcpyHostToDevice)); - - CUDA_ERROR(cudaMalloc((void**)&m_d_patch_distribution_v, - (m_num_patches + 1) * sizeof(uint32_t))); - CUDA_ERROR(cudaMalloc((void**)&m_d_patch_distribution_e, - (m_num_patches + 1) * sizeof(uint32_t))); - CUDA_ERROR(cudaMalloc((void**)&m_d_patch_distribution_f, - (m_num_patches + 1) * sizeof(uint32_t))); - CUDA_ERROR(cudaMemcpy( - m_d_patch_distribution_v, m_h_patch_distribution_v.data(), - (m_num_patches + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy( - m_d_patch_distribution_e, 
m_h_patch_distribution_e.data(), - (m_num_patches + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy( - m_d_patch_distribution_f, m_h_patch_distribution_f.data(), - (m_num_patches + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice)); - - - uint32_t* n_patches = m_patcher->get_neighbour_patches(); - uint32_t* n_patches_offset = m_patcher->get_neighbour_patches_offset(); - - CUDA_ERROR(cudaMalloc((void**)&m_d_neighbour_patches_offset, - m_num_patches * sizeof(uint32_t))); - CUDA_ERROR(cudaMemcpy(m_d_neighbour_patches_offset, n_patches_offset, - m_num_patches * sizeof(uint32_t), - cudaMemcpyHostToDevice)); - if (n_patches) { - CUDA_ERROR( - cudaMalloc((void**)&m_d_neighbour_patches, - n_patches_offset[m_num_patches - 1] * sizeof(uint32_t))); - CUDA_ERROR( - cudaMemcpy(m_d_neighbour_patches, n_patches, - n_patches_offset[m_num_patches - 1] * sizeof(uint32_t), - cudaMemcpyHostToDevice)); + for (uint32_t f = p_start; f < p_end; ++f) { + uint32_t face_id = m_patcher->get_patches_val()[f]; + add_new_face(face_id); } - - // Allocate and copy the context to the gpu - m_rxmesh_context.init( - m_num_edges, m_num_faces, m_num_vertices, m_face_degree, m_max_valence, - m_max_edge_incident_faces, m_max_face_adjacent_faces, m_num_patches, - m_d_face_patch, m_d_edge_patch, m_d_vertex_patch, m_d_patches_ltog_v, - m_d_patches_ltog_e, m_d_patches_ltog_f, m_d_ad_size_ltog_v, - m_d_ad_size_ltog_e, m_d_ad_size_ltog_f, m_d_patches_edges, - m_d_patches_faces, m_d_ad_size, m_d_owned_size, m_max_size, - m_d_patch_distribution_v, m_d_patch_distribution_e, - m_d_patch_distribution_f, m_d_neighbour_patches, - m_d_neighbour_patches_offset); + for (uint32_t f = r_start; f < r_end; ++f) { + uint32_t face_id = m_patcher->get_external_ribbon_val()[f]; + add_new_face(face_id); + } } +uint32_t RXMesh::get_edge_id(const uint32_t v0, const uint32_t v1) const +{ + // v0 and v1 are two vertices in global space. 
we return the edge + // id in global space also (by querying m_edges_map) + assert(m_edges_map.size() != 0); -//************************************************************************** + std::pair edge = detail::edge_key(v0, v1); + assert(edge.first == v0 || edge.first == v1); + assert(edge.second == v0 || edge.second == v1); -//********************** Export -template -void RXMesh::write_connectivity(std::fstream& file) const -{ - for (uint32_t p = 0; p < m_num_patches; ++p) { // for every patch - assert(m_h_ad_size[p].w % 3 == 0); - uint16_t patch_num_faces = m_h_ad_size[p].w / 3; - for (uint32_t f = 0; f < patch_num_faces; ++f) { - uint32_t f_global = m_h_patches_ltog_f[p][f] >> 1; - if (m_patcher->get_face_patch_id(f_global) != p) { - // if it is a ribbon - continue; - } + return get_edge_id(edge); +} - file << "f "; - for (uint32_t e = 0; e < 3; ++e) { - uint16_t edge = m_h_patches_faces[p][3 * f + e]; - flag_t dir(0); - RXMeshContext::unpack_edge_dir(edge, edge, dir); - uint16_t e_id = (2 * edge) + dir; - uint16_t v = m_h_patches_edges[p][e_id]; - file << (m_h_patches_ltog_v[p][v] >> 1) + 1 << " "; - } - file << std::endl; - } +uint32_t RXMesh::get_edge_id(const std::pair& edge) const +{ + uint32_t edge_id = INVALID32; + try { + edge_id = m_edges_map.at(edge); + } catch (const std::out_of_range&) { + RXMESH_ERROR( + "rxmesh::get_edge_id() mapping edges went wrong." 
+ " Can not find an edge connecting vertices {} and {}", + edge.first, + edge.second); + exit(EXIT_FAILURE); } + + return edge_id; } -//************************************************************************** +void RXMesh::build_device() +{ + CUDA_ERROR(cudaMalloc((void**)&m_d_patches_info, + m_num_patches * sizeof(PatchInfo))); + + m_h_patches_info = (PatchInfo*)malloc(m_num_patches * sizeof(PatchInfo)); + +#pragma omp parallel for + for (int p = 0; p < static_cast(m_num_patches); ++p) { + PatchInfo d_patch; + d_patch.num_faces = m_h_patches_ltog_f[p].size(); + d_patch.num_edges = m_h_patches_ltog_e[p].size(); + d_patch.num_vertices = m_h_patches_ltog_v[p].size(); + d_patch.num_owned_faces = m_h_num_owned_f[p]; + d_patch.num_owned_edges = m_h_num_owned_e[p]; + d_patch.num_owned_vertices = m_h_num_owned_v[p]; + d_patch.patch_id = p; + + m_h_patches_info[p].num_faces = m_h_patches_ltog_f[p].size(); + m_h_patches_info[p].num_edges = m_h_patches_ltog_e[p].size(); + m_h_patches_info[p].num_vertices = m_h_patches_ltog_v[p].size(); + m_h_patches_info[p].num_owned_faces = m_h_num_owned_f[p]; + m_h_patches_info[p].num_owned_edges = m_h_num_owned_e[p]; + m_h_patches_info[p].num_owned_vertices = m_h_num_owned_v[p]; + m_h_patches_info[p].patch_id = p; + + + // allocate and copy patch topology to the device + CUDA_ERROR(cudaMalloc((void**)&d_patch.ev, + d_patch.num_edges * 2 * sizeof(LocalVertexT))); + CUDA_ERROR(cudaMemcpy(d_patch.ev, + m_h_patches_ev[p].data(), + d_patch.num_edges * 2 * sizeof(LocalVertexT), + cudaMemcpyHostToDevice)); + m_h_patches_info[p].ev = + reinterpret_cast(m_h_patches_ev[p].data()); + + CUDA_ERROR(cudaMalloc((void**)&d_patch.fe, + d_patch.num_faces * 3 * sizeof(LocalEdgeT))); + CUDA_ERROR(cudaMemcpy(d_patch.fe, + m_h_patches_fe[p].data(), + d_patch.num_faces * 3 * sizeof(LocalEdgeT), + cudaMemcpyHostToDevice)); + m_h_patches_info[p].fe = + reinterpret_cast(m_h_patches_fe[p].data()); + + // copy not-owned mesh elements to device + + auto 
populate_not_owned = + [p](const std::vector>& ltog, + const std::vector& element_patch, + const std::vector& num_owned, + auto*& d_not_owned_id, + uint32_t*& d_not_owned_patch, + auto*& h_not_owned_id, + uint32_t*& h_not_owned_patch) { + using LocalT = typename std::remove_reference::type; + + const uint16_t num_not_owned = ltog[p].size() - num_owned[p]; + + h_not_owned_id = + (LocalT*)malloc(num_not_owned * sizeof(LocalT)); + h_not_owned_patch = + (uint32_t*)malloc(num_not_owned * sizeof(uint32_t)); + + for (uint16_t i = 0; i < num_not_owned; ++i) { + uint16_t local_id = i + num_owned[p]; + uint32_t global_id = ltog[p][local_id]; + uint32_t owning_patch = element_patch[global_id]; + h_not_owned_patch[i] = owning_patch; + + auto it = std::lower_bound( + ltog[owning_patch].begin(), + ltog[owning_patch].begin() + num_owned[owning_patch], + global_id); + + if (it == + ltog[owning_patch].begin() + num_owned[owning_patch]) { + RXMESH_ERROR( + "rxmesh::build_device can not find the local id of " + "{} in patch {}. 
Maybe this patch does not own " + "this mesh element.", + global_id, + owning_patch); + } else { + h_not_owned_id[i].id = static_cast( + it - ltog[owning_patch].begin()); + } + } + + // Copy to device + CUDA_ERROR(cudaMalloc((void**)&d_not_owned_id, + sizeof(LocalT) * num_not_owned)); + CUDA_ERROR(cudaMemcpy(d_not_owned_id, + h_not_owned_id, + sizeof(LocalT) * num_not_owned, + cudaMemcpyHostToDevice)); + + CUDA_ERROR(cudaMalloc((void**)&d_not_owned_patch, + sizeof(uint32_t) * num_not_owned)); + CUDA_ERROR(cudaMemcpy(d_not_owned_patch, + h_not_owned_patch, + sizeof(uint32_t) * num_not_owned, + cudaMemcpyHostToDevice)); + }; + + + populate_not_owned(m_h_patches_ltog_f, + m_patcher->get_face_patch(), + m_h_num_owned_f, + d_patch.not_owned_id_f, + d_patch.not_owned_patch_f, + m_h_patches_info[p].not_owned_id_f, + m_h_patches_info[p].not_owned_patch_f); + + populate_not_owned(m_h_patches_ltog_e, + m_patcher->get_edge_patch(), + m_h_num_owned_e, + d_patch.not_owned_id_e, + d_patch.not_owned_patch_e, + m_h_patches_info[p].not_owned_id_e, + m_h_patches_info[p].not_owned_patch_e); + + populate_not_owned(m_h_patches_ltog_v, + m_patcher->get_vertex_patch(), + m_h_num_owned_v, + d_patch.not_owned_id_v, + d_patch.not_owned_patch_v, + m_h_patches_info[p].not_owned_id_v, + m_h_patches_info[p].not_owned_patch_v); + + CUDA_ERROR(cudaMemcpy(m_d_patches_info + p, + &d_patch, + sizeof(PatchInfo), + cudaMemcpyHostToDevice)); + } +} -template class RXMesh; -} // namespace RXMESH +} // namespace rxmesh diff --git a/include/rxmesh/rxmesh.h b/include/rxmesh/rxmesh.h index 8d1d42b5..cbf10331 100644 --- a/include/rxmesh/rxmesh.h +++ b/include/rxmesh/rxmesh.h @@ -1,166 +1,129 @@ #pragma once - -#include #include #include #include +#include "rxmesh/context.h" +#include "rxmesh/patch_info.h" #include "rxmesh/patcher/patcher.h" -#include "rxmesh/rxmesh_context.h" +#include "rxmesh/types.h" #include "rxmesh/util/log.h" #include "rxmesh/util/macros.h" +#include "rxmesh/util/util.h" class 
RXMeshTest; -namespace RXMESH { -using coordT = float; - -// This class is responsible for building the data structure of representing -// the mesh a matrix (small sub-matrices). It should/can not be instantiated. -// In order to use it, use RXMeshStatic - -enum class Op -{ - // Vertex - VV = 0, - VE = 1, - VF = 2, - - // Face - FV = 3, - FE = 4, - FF = 5, - - // Edge - EV = 6, - EE = 7, - EF = 8, -}; - -inline std::string op_to_string(const Op& op) -{ - switch (op) { - case RXMESH::Op::VV: - return "VV"; - case RXMESH::Op::VE: - return "VE"; - case RXMESH::Op::VF: - return "VF"; - case RXMESH::Op::FV: - return "FV"; - case RXMESH::Op::FE: - return "FE"; - case RXMESH::Op::FF: - return "FF"; - case RXMESH::Op::EV: - return "EV"; - case RXMESH::Op::EF: - return "EF"; - case RXMESH::Op::EE: - return "EE"; - default: - return ""; - } -} - -enum class ELEMENT -{ - VERTEX = 0, - EDGE = 1, - FACE = 2 -}; +namespace rxmesh { -template +/** + * @brief The main class for creating RXMesh data structure. It takes an input + * mesh on the host, computes the patches, and creates the data structure on the + * GPU. It is not mean to be used directly by the user. 
Users should use + * RXMeshStatic instead + */ class RXMesh { public: - // Exporter - template - void exportOBJ(const std::string& filename, VertT getCoords) - { - std::string fn = STRINGIFY(OUTPUT_DIR) + filename; - std::fstream file(fn, std::ios::out); - file.precision(30); - - // write vertices - for (uint32_t v = 0; v < m_num_vertices; ++v) { - uint32_t v_id = v; - - file << "v "; - for (uint32_t i = 0; i < 3; ++i) { - file << getCoords(v_id, i) << " "; - } - file << std::endl; - } - // write connectivity - write_connectivity(file); - file.close(); - } - - - // getter + /** + * @brief Total number of vertices in the mesh + */ uint32_t get_num_vertices() const { return m_num_vertices; } + + /** + * @brief Total number of edges in the mesh + */ uint32_t get_num_edges() const { return m_num_edges; } + + /** + * @brief Total number of faces in the mesh + */ uint32_t get_num_faces() const { return m_num_faces; } + /** + * @brief Maximum valence in the input mesh + */ uint32_t get_max_valence() const { return m_max_valence; } + /** + * @brief Maximum number of incident faces to an edge in the input mesh + */ uint32_t get_max_edge_incident_faces() const { return m_max_edge_incident_faces; } - uint32_t get_max_edge_adjacent_faces() const + /** + * @brief Maximum number of adjacent faces to a face in the input mesh + */ + uint32_t get_max_face_adjacent_faces() const { return m_max_face_adjacent_faces; } - uint32_t get_face_degree() const - { - return m_face_degree; - } - const RXMeshContext& get_context() const + /** + * @brief Return a context that store various information about the mesh on + * the GPU + */ + const Context& get_context() const { return m_rxmesh_context; } + /** + * @brief returns true if the input mesh is manifold + */ bool is_edge_manifold() const { return m_is_input_edge_manifold; } + /** + * @brief returns true if the input mesh is closed + */ bool is_closed() const { return m_is_input_closed; } + /** + * @brief returns the patch size used during 
partitioning the input mesh + */ uint32_t get_patch_size() const { - return patchSize; + return m_patch_size; } + /** + * @brief Total number of patches of the input mesh + */ uint32_t get_num_patches() const { return m_num_patches; } + /** + * @brief Returns the number of disconnected component the input mesh is + * composed of + */ uint32_t get_num_components() const { return m_patcher->get_num_components(); } - + /** + * @brief Return the max, min, and average patch size of the input mesh + */ void get_max_min_avg_patch_size(uint32_t& min_p, uint32_t& max_p, uint32_t& avg_p) const @@ -168,267 +131,165 @@ class RXMesh return m_patcher->get_max_min_avg_patch_size(min_p, max_p, avg_p); } + /** + * @brief Return (approximate) overhead due to ribbons + */ double get_ribbon_overhead() const { return m_patcher->get_ribbon_overhead(); } + /** + * @brief Maximum number of vertices in a patch + */ uint32_t get_per_patch_max_vertices() const { return m_max_vertices_per_patch; } + /** + * @brief Maximum number of edges in a patch + */ uint32_t get_per_patch_max_edges() const { return m_max_edges_per_patch; } + /** + * @brief Maximum number of faces in a patch + */ uint32_t get_per_patch_max_faces() const { return m_max_faces_per_patch; } - uint32_t get_per_patch_max_owned_vertices() const - { - return m_max_owned_vertices_per_patch; - } - - uint32_t get_per_patch_max_owned_edges() const - { - return m_max_owned_edges_per_patch; - } - - uint32_t get_per_patch_max_owned_faces() const - { - return m_max_owned_faces_per_patch; - } - + /** + * @brief The time used to construct the patches on the GPU + */ float get_patching_time() const { return m_patcher->get_patching_time(); } + /** + * @brief The number of Lloyd iterations run to partition the mesh into + * patches + */ uint32_t get_num_lloyd_run() const { return m_patcher->get_num_lloyd_run(); } + /** + * @brief Return the edge id given two vertices. Edges are undirected. 
+ * @param v0 first input vertex + * @param v1 second input vertex + * @return edge id composed by v0-v1 (same as edge id for v1-v0) + */ uint32_t get_edge_id(const uint32_t v0, const uint32_t v1) const; - double get_gpu_storage_mb() const - { - return m_total_gpu_storage_mb; - } - - const std::unique_ptr& get_patcher() const - { - return m_patcher; - }; - protected: virtual ~RXMesh(); - RXMeshContext m_rxmesh_context; - RXMesh(const RXMesh&) = delete; - virtual void write_connectivity(std::fstream& file) const; - - // build everything from scratch including patches (use this) - RXMesh(std::vector>& fv, - std::vector>& coordinates, - const bool sort = false, - const bool quite = true); + RXMesh(const std::vector>& fv, + const bool quite = false); + + /** + * @brief build different supporting data structure used to build RXMesh + * + * Set the number of vertices, edges, and faces, populate edge_map (which + * takes two connected vertices and returns their edge id), build + * face-incident-faces data structure (used to in creating patches). 
This is + * done using a single pass over FV + * + * @param fv input face incident vertices + * @param ef output edge incident faces + * @param ef output face adjacent faces + */ + void build_supporting_structures( + const std::vector>& fv, + std::vector>& ef, + std::vector& ff_offset, + std::vector& ff_values); + + /** + * @brief Calculate various statistics for the input mesh + * + * Calculate max valence, max edge incident faces, max face adjacent faces, + * if the input is closed, if the input is edge manifold, and max number of + * vertices/edges/faces per patch + * + * @param fv input face incident vertices + * @param ef input edge incident faces + */ + void calc_statistics(const std::vector>& fv, + const std::vector>& ef); + + void calc_max_not_owned_elements(); + + void build(const std::vector>& fv); + void build_single_patch(const std::vector>& fv, + const uint32_t patch_id); + + void build_single_patch_ltog(const std::vector>& fv, + const uint32_t patch_id); + + void build_single_patch_topology( + const std::vector>& fv, + const uint32_t patch_id); + + + void build_device(); uint32_t get_edge_id(const std::pair& edge) const; - void build_local(std::vector>& fv, - std::vector>& coordinates); - void build_patch_locally(const uint32_t patch_id); - void populate_edge_map(const std::vector>& fv); - uint16_t create_new_local_face(const uint32_t patch_id, - const uint32_t global_f, - const std::vector& fv, - uint16_t& faces_count, - uint16_t& edges_owned_count, - uint16_t& edges_not_owned_count, - uint16_t& vertices_owned_count, - uint16_t& vertices_not_owned_count, - const uint16_t num_edges_owned, - const uint16_t num_vertices_owned, - std::vector& f_ltog, - std::vector& e_ltog, - std::vector& v_ltog, - std::vector& fp, - std::vector& ep); - void set_num_vertices(const std::vector>& fv); - void edge_incident_faces(const std::vector>& fv, - std::vector>& ef); - - inline std::pair edge_key(const uint32_t v0, - const uint32_t v1) const - { - uint32_t i = 
std::max(v0, v1); - uint32_t j = std::min(v0, v1); - return std::make_pair(i, j); - } - - template - void host_malloc(pt_T*& arr, uint32_t count) - { - arr = (pt_T*)malloc(count * sizeof(pt_T)); - if (arr == NULL) { - RXMESH_ERROR( - "RXMesh::host_malloc() malloc failed with count = {} and total " - "size = {}", - count, count * sizeof(pt_T)); - } - } - - void device_alloc_local(); - - template - void get_starting_ids(const std::vector>& input, - std::vector& starting_id); - - template - void padding_to_multiple(std::vector>& input, - const uint32_t multiple, - const T init_val); - - template - void get_size(const std::vector>& input, - std::vector& ad); - - void sort(std::vector>& fv, - std::vector>& coordinates); - - - // www.techiedelight.com/use-std-pair-key-std-unordered_map-cpp/ - struct edge_key_hash - { - template - inline std::size_t operator()(const std::pair& e_key) const - { - return std::hash()(e_key.first * 8191 + e_key.second * 11003); - } - }; - - // variables // our friend tester class friend class ::RXMeshTest; - // var - uint32_t m_num_edges, m_num_faces, m_num_vertices, m_max_ele_count, - m_max_valence, m_max_valence_vertex_id, m_max_edge_incident_faces, - m_max_face_adjacent_faces; - const uint32_t m_face_degree; + Context m_rxmesh_context; - // patches - uint32_t m_num_patches; + uint32_t m_num_edges, m_num_faces, m_num_vertices, m_max_valence, + m_max_edge_incident_faces, m_max_face_adjacent_faces; - bool m_is_input_edge_manifold; - bool m_is_input_closed; - bool m_is_sort; - bool m_quite; + uint32_t m_max_vertices_per_patch, m_max_edges_per_patch, + m_max_faces_per_patch; - std::unordered_map, uint32_t, edge_key_hash> - m_edges_map; + uint32_t m_max_not_owned_vertices, m_max_not_owned_edges, + m_max_not_owned_faces; - // store a copy of face incident vertices along with the neighbor - // faces of that face - std::vector> m_fvn; + uint32_t m_num_patches; + const uint32_t m_patch_size; + bool m_is_input_edge_manifold; + bool m_is_input_closed; 
+ bool m_quite; + + // Edge hash map that takes two vertices and return their edge id + std::unordered_map, + uint32_t, + detail::edge_key_hash> + m_edges_map; // pointer to the patcher class responsible for everything related to // patching the mesh into small pieces - std::unique_ptr m_patcher; + std::unique_ptr m_patcher; + //** main incident relations + std::vector> m_h_patches_ev; + std::vector> m_h_patches_fe; - //*************** Patch sub-matrices + // the number of owned mesh elements per patch + std::vector m_h_num_owned_f, m_h_num_owned_e, m_h_num_owned_v; - //****** Host - uint32_t m_max_vertices_per_patch, m_max_edges_per_patch, - m_max_faces_per_patch; - uint32_t m_max_owned_vertices_per_patch, m_max_owned_edges_per_patch, - m_max_owned_faces_per_patch; - //** main incident relations - std::vector> m_h_patches_edges; - std::vector> m_h_patches_faces; - //.x edge address - //.y edge size - //.z face address - //.w face size - std::vector m_h_ad_size; - - // the size of owned mesh elements per patch - //.x faces - //.y edges - //.z vertex - std::vector m_h_owned_size; - - uint2 m_max_size; // max number of edges(*2) and faces(*face_degree) - // in a patch - // this counts the size of edges and faces arrays - // rounded up to multiple of 32 - - //** mappings + // mappings // local to global map for (v)ertices (e)dges and (f)aces std::vector> m_h_patches_ltog_v; std::vector> m_h_patches_ltog_e; std::vector> m_h_patches_ltog_f; - // storing the start id(x) and element count(y) - std::vector m_h_ad_size_ltog_v, m_h_ad_size_ltog_e, - m_h_ad_size_ltog_f; - - - //****** Device - // Each device pointer points to a long array that holds specific data - // separated by patch id - // ____________ _____________ ____________ - // |____________|_____________|____________| - // ^^ ^^ ^^ - // patch 1 data patch 2 data patch 3 data - - // We store the starting id and the size of mesh elements for each patch - // in m_d_ad_size_ltog_MESHELE (ad for address) where MESHELE 
could be - // v,e, or f. This is for the mapping pointers - // For incidence pointers, we only need store the starting id - //** face/vertex/edge patch (indexed by in global space) - uint32_t *m_d_face_patch, *m_d_vertex_patch, *m_d_edge_patch; - - //** mapping - uint32_t *m_d_patches_ltog_v, *m_d_patches_ltog_e, *m_d_patches_ltog_f; - uint2 * m_d_ad_size_ltog_v, *m_d_ad_size_ltog_e, *m_d_ad_size_ltog_f; - - //** incidence - uint16_t *m_d_patches_edges, *m_d_patches_faces; - - //*** Scanned histogram of the number of mesh elements per patch - std::vector m_h_patch_distribution_v, m_h_patch_distribution_e, - m_h_patch_distribution_f; - uint32_t *m_d_patch_distribution_v, *m_d_patch_distribution_e, - *m_d_patch_distribution_f; - - //.x edge address - //.y edge size - //.z face address - //.w face size - uint4* m_d_ad_size; - - // the size of owned mesh elements per patch - //.x faces - //.y edges - //.z vertex - uint4* m_d_owned_size; - - // neighbour patches - uint32_t *m_d_neighbour_patches, *m_d_neighbour_patches_offset; - - double m_total_gpu_storage_mb; + PatchInfo *m_d_patches_info, *m_h_patches_info; }; - -extern template class RXMesh; -} // namespace RXMESH +} // namespace rxmesh diff --git a/include/rxmesh/rxmesh_attribute.h b/include/rxmesh/rxmesh_attribute.h deleted file mode 100644 index ebfff216..00000000 --- a/include/rxmesh/rxmesh_attribute.h +++ /dev/null @@ -1,866 +0,0 @@ -#pragma once - -#include -#include "rxmesh/kernels/collective.cuh" -#include "rxmesh/kernels/rxmesh_attribute.cuh" -#include "rxmesh/kernels/util.cuh" -#include "rxmesh/util/util.h" -#include "rxmesh/util/vector.h" - -namespace RXMESH { - -// Flags for where the attributes array resides -using locationT = uint32_t; -enum : locationT -{ - LOCATION_NONE = 0x00, - HOST = 0x01, - DEVICE = 0x02, - LOCATION_ALL = 0x0F, -}; - -// The memory layout -using layoutT = uint32_t; -enum : layoutT -{ - AoS = 0x00, - SoA = 0x01, -}; - -// Reduce ops -using reduceOpT = uint32_t; -enum : reduceOpT 
-{ - SUM = 0x00, - MAX = 0x01, - MIN = 0X02, - NORM2 = 0X04, // L2 norm squared - DOT = 0x08, // dot product - -}; - -static std::string location_to_string(locationT target) -{ - std::string str = ""; - if ((target & HOST) == HOST) { - str = (str == "" ? "" : " ") + std::string("HOST"); - } - if ((target & DEVICE) == DEVICE) { - str = (str == "" ? "" : " ") + std::string("DEVICE"); - } - if (str == "") { - str = "NONE"; - } - return str; -} - -template -class RXMeshAttribute -{ - // Here we manage the attributes on top of the mesh. An attributes is - // attached to mesh element (e.g., vertices, edges, or faces). The user - // is expected to declare as many attributes as expected to be used - // during the lifetime of RXMesh - - // largely inspired by - // https://github.com/gunrock/gunrock/blob/master/gunrock/util/array_utils.cuh - - - public: - //********************** Constructors/Destructor - RXMeshAttribute() - : m_name(nullptr), m_num_mesh_elements(0), - m_num_attribute_per_element(0), m_allocated(LOCATION_NONE), - m_h_attr(nullptr), m_d_attr(nullptr), m_layout(AoS), - d_axpy_alpha(nullptr), d_axpy_beta(nullptr), - m_is_axpy_allocated(false), m_is_reduce_allocated(false), - m_reduce_temp_storage_bytes(0), m_d_reduce_temp_storage(nullptr), - m_d_reduce_output(nullptr), m_reduce_streams(nullptr), - m_norm2_temp_buffer(nullptr) - { - - this->m_name = (char*)malloc(sizeof(char) * 1); - this->m_name[0] = '\0'; - allocate(0, LOCATION_NONE); - m_pitch.x = 0; - m_pitch.y = 0; - } - - RXMeshAttribute(const char* const name) - : m_name(nullptr), m_num_mesh_elements(0), - m_num_attribute_per_element(0), m_allocated(LOCATION_NONE), - m_h_attr(nullptr), m_d_attr(nullptr), m_layout(AoS), - d_axpy_alpha(nullptr), d_axpy_beta(nullptr), - m_is_axpy_allocated(false), m_is_reduce_allocated(false), - m_reduce_temp_storage_bytes(0) - { - - if (name != nullptr) { - this->m_name = (char*)malloc(sizeof(char) * (strlen(name) + 1)); - strcpy(this->m_name, name); - } - allocate(0, 
LOCATION_NONE); - m_pitch.x = 0; - m_pitch.y = 0; - } - - //********************************************************************* - - - //********************** Setter/Getter - void set_name(std::string name) - { - free(this->m_name); - this->m_name = (char*)malloc(sizeof(char) * name.length() + 1); - strcpy(this->m_name, name.c_str()); - } - - __host__ __device__ __forceinline__ uint32_t get_num_mesh_elements() const - { - return this->m_num_mesh_elements; - } - - __host__ __device__ __forceinline__ uint32_t - get_num_attribute_per_element() const - { - return this->m_num_attribute_per_element; - } - - __host__ __device__ __forceinline__ locationT get_allocated() const - { - return this->m_allocated; - } - - __host__ __device__ __forceinline__ bool is_device_allocated() const - { - return ((m_allocated & DEVICE) == DEVICE); - } - - __host__ __device__ __forceinline__ bool is_host_allocated() const - { - return ((m_allocated & HOST) == HOST); - } - - __host__ __device__ __forceinline__ T* get_pointer(locationT target) const - { - - if (target == DEVICE) { - return m_d_attr; - } - if (target == HOST) { - return m_h_attr; - } - return nullptr; - } - - void reset(const T value, locationT target, cudaStream_t stream = NULL) - { - - if ((target & DEVICE) == DEVICE) { - - assert((m_allocated & DEVICE) == DEVICE); - - const int threads = 256; - const uint32_t total = - m_num_attribute_per_element * m_num_mesh_elements; - memset<<<(total + threads - 1) / threads, threads, 0, stream>>>( - m_d_attr, value, total); - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaGetLastError()); - } - - - if ((target & HOST) == HOST) { - assert((m_allocated & HOST) == HOST); - for (uint32_t i = 0; - i < m_num_mesh_elements * m_num_attribute_per_element; ++i) { - m_h_attr[i] = value; - } - } - } - //********************************************************************* - - - //********************** Memory Manipulation - void init(uint32_t num_elements, - uint32_t 
num_attributes_per_elements, - locationT target = DEVICE, - layoutT layout = AoS, - const bool with_axpy_alloc = true, - const bool with_reduce_alloc = true) - { - release(); - m_allocated = LOCATION_NONE; - this->m_num_mesh_elements = num_elements; - this->m_num_attribute_per_element = num_attributes_per_elements; - if (num_elements == 0) { - return; - } - allocate(num_elements, target); - m_layout = layout; - set_pitch(); - - if (!m_is_axpy_allocated && with_axpy_alloc) { - CUDA_ERROR(cudaMalloc((void**)&d_axpy_alpha, - m_num_attribute_per_element * sizeof(T))); - CUDA_ERROR(cudaMalloc((void**)&d_axpy_beta, - m_num_attribute_per_element * sizeof(T))); - m_is_axpy_allocated = true; - } - - if (!m_is_reduce_allocated && with_reduce_alloc) { - // Reduce operations are either SUM, MIN, MAX, or NORM2 - // NORM2 produce is done in two passes, the first pass uses cub - // device API to multiply the input and then store in a temp buffer - // (every CUDA block outputs a single value) which then is used for - // the second pass using cub host API The other three operations - // uses only cub host API. cub host API requires temp buffer which - // is taken as the max of what NORM2 requires and the other three - // operations. 
- - // NORM2 temp buffer (to store the per-block output) - uint32_t num_blocks = DIVIDE_UP(m_num_mesh_elements, m_block_size); - m_norm2_temp_buffer = - (T**)malloc(sizeof(T*) * m_num_attribute_per_element); - if (!m_norm2_temp_buffer) { - RXMESH_ERROR( - "RXMeshAttribute::init() could not allocate " - "m_norm2_temp_buffer."); - } - for (uint32_t i = 0; i < m_num_attribute_per_element; ++i) { - CUDA_ERROR(cudaMalloc(&m_norm2_temp_buffer[i], - sizeof(T) * num_blocks)); - } - - m_d_reduce_output = - (T**)malloc(sizeof(T*) * m_num_attribute_per_element); - if (!m_d_reduce_output) { - RXMESH_ERROR( - "RXMeshAttribute::init() could not allocate " - "m_d_reduce_output."); - } - m_d_reduce_temp_storage = - (void**)malloc(sizeof(void*) * m_num_attribute_per_element); - if (!m_d_reduce_temp_storage) { - RXMESH_ERROR( - "RXMeshAttribute::init() could not allocate " - "m_d_reduce_temp_storage."); - } - m_reduce_streams = (cudaStream_t*)malloc( - sizeof(cudaStream_t) * m_num_attribute_per_element); - if (!m_d_reduce_output) { - RXMESH_ERROR( - "RXMeshAttribute::init() could not allocate " - "m_reduce_streams."); - } - { // get the num bytes for cub device-wide reduce - size_t norm2_temp_bytes(0), other_reduce_temp_bytes(0); - T* d_out(NULL); - m_d_reduce_temp_storage[0] = NULL; - cub::DeviceReduce::Sum(m_d_reduce_temp_storage[0], - norm2_temp_bytes, m_d_attr, d_out, - num_blocks); - cub::DeviceReduce::Sum(m_d_reduce_temp_storage[0], - other_reduce_temp_bytes, m_d_attr, d_out, - m_num_mesh_elements); - m_reduce_temp_storage_bytes = - std::max(norm2_temp_bytes, other_reduce_temp_bytes); - } - - for (uint32_t i = 0; i < m_num_attribute_per_element; ++i) { - CUDA_ERROR(cudaMalloc(&m_d_reduce_temp_storage[i], - m_reduce_temp_storage_bytes)); - CUDA_ERROR(cudaMalloc(&m_d_reduce_output[i], sizeof(T))); - CUDA_ERROR(cudaStreamCreate(&m_reduce_streams[i])); - } - } - } - - void allocate(uint32_t num_mesh_elements, locationT target = DEVICE) - { - - if ((target & HOST) == HOST) { - 
release(HOST); - if (num_mesh_elements != 0) { - m_h_attr = (T*)malloc(sizeof(T) * num_mesh_elements * - m_num_attribute_per_element); - if (!m_h_attr) { - RXMESH_ERROR( - " RXMeshAttribute::allocate() allocation on {} failed " - "with #mesh_elemnts = {} and #attributes per element = " - "{}" + - location_to_string(HOST), - num_mesh_elements, m_num_attribute_per_element); - } - } - m_allocated = m_allocated | HOST; - } - - - if ((target & DEVICE) == DEVICE) { - release(DEVICE); - if (num_mesh_elements != 0) { - CUDA_ERROR(cudaMalloc((void**)&(m_d_attr), - sizeof(T) * num_mesh_elements * - m_num_attribute_per_element)); - } - m_allocated = m_allocated | DEVICE; - } - this->m_num_mesh_elements = num_mesh_elements; - } - - void move(locationT source, locationT target) - { - if (source == target) { - return; - } - - if ((source == HOST || source == DEVICE) && - ((source & m_allocated) != source)) { - RXMESH_ERROR( - "RXMeshAttribute::move() moving source is not valid" - " because it was not allocated on source"); - } - - if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && - ((target & m_allocated) != target)) { - allocate(this->m_num_mesh_elements, target); - } - - if (this->m_num_mesh_elements == 0) { - return; - } - - if (source == HOST && target == DEVICE) { - CUDA_ERROR(cudaMemcpy( - m_d_attr, m_h_attr, - sizeof(T) * m_num_mesh_elements * m_num_attribute_per_element, - cudaMemcpyHostToDevice)); - - } else if (source == DEVICE && target == HOST) { - CUDA_ERROR(cudaMemcpy( - m_h_attr, m_d_attr, - sizeof(T) * m_num_mesh_elements * m_num_attribute_per_element, - cudaMemcpyDeviceToHost)); - } - } - - void release(locationT target = LOCATION_ALL) - { - - if (((target & HOST) == HOST) && ((m_allocated & HOST) == HOST)) { - free(m_h_attr); - m_h_attr = nullptr; - m_allocated = m_allocated & (~HOST); - } - - if (((target & DEVICE) == DEVICE) && - ((m_allocated & DEVICE) == DEVICE)) { - GPU_FREE(m_d_attr); - m_allocated = m_allocated & (~DEVICE); - } - - if 
(target == LOCATION_ALL || m_allocated == 0) { - m_num_mesh_elements = 0; - m_pitch.x = 0; - m_pitch.y = 0; - - if (m_is_axpy_allocated) { - GPU_FREE(d_axpy_alpha); - GPU_FREE(d_axpy_beta); - m_is_axpy_allocated = false; - } - if (m_is_reduce_allocated) { - for (uint32_t i = 0; i < m_num_attribute_per_element; ++i) { - GPU_FREE(m_d_reduce_temp_storage[i]); - GPU_FREE(m_norm2_temp_buffer[i]); - GPU_FREE(m_d_reduce_output[i]); - CUDA_ERROR(cudaStreamDestroy(m_reduce_streams[i])); - } - m_is_reduce_allocated = false; - free(m_reduce_streams); - free(m_d_reduce_output); - free(m_norm2_temp_buffer); - free(m_d_reduce_temp_storage); - } - } - } - - void copy(RXMeshAttribute& source, - locationT source_flag, - locationT target_flag) - { - // Deep copy from source. The source_flag defines where we will copy - // from. The target_flag defines where we will copy to. - - // if source_flag and target_flag are both set to LOCATION_ALL, then we - // copy what is on host to host, and what on target to target - - // If sourc_flag is set to HOST (or DEVICE) and target_flag is set to - // LOCATION_ALL, then we copy source's HOST (or DEVICE) to both HOST - // and DEVICE in target - - // Setting source_flag to LOCATION_ALL while target_flag is Not set to - // LOCATION_ALL is invalid because we don't know which source to copy - // from - - if (source.m_layout != m_layout) { - RXMESH_ERROR( - "RXMeshAttribute::copy() does not support copy from source of " - "different layout!"); - } - - if ((source_flag & LOCATION_ALL) == LOCATION_ALL && - (target_flag & LOCATION_ALL) != LOCATION_ALL) { - RXMESH_ERROR("RXMeshAttribute::copy() Invalid configuration!"); - } - - if (source.get_num_mesh_elements() != m_num_mesh_elements) { - RXMESH_ERROR( - "RXMeshAttribute::copy() source has different size than " - "target!"); - } - - // 1) copy from HOST to HOST - if ((source_flag & HOST) == HOST && (target_flag & HOST) == HOST) { - if ((source_flag & source.m_allocated) != source_flag) { - RXMESH_ERROR( 
- "RXMeshAttribute::copy() copying source is not valid" - " because it was not allocated on host"); - } - if ((target_flag & m_allocated) != target_flag) { - RXMESH_ERROR( - "RXMeshAttribute::copy() copying source is not valid" - " because target (this) was not allocated on host"); - } - - std::memcpy( - (void*)m_h_attr, source.m_h_attr, - m_num_mesh_elements * m_num_attribute_per_element * sizeof(T)); - } - - - // 2) copy from DEVICE to DEVICE - if ((source_flag & DEVICE) == DEVICE && - (target_flag & DEVICE) == DEVICE) { - if ((source_flag & source.m_allocated) != source_flag) { - RXMESH_ERROR( - "RXMeshAttribute::copy() copying source is not valid" - " because it was not allocated on device"); - } - if ((target_flag & m_allocated) != target_flag) { - RXMESH_ERROR( - "RXMeshAttribute::copy() copying source is not valid" - " because target (this) was not allocated on device"); - } - - CUDA_ERROR(cudaMemcpy( - m_d_attr, source.m_d_attr, - m_num_mesh_elements * m_num_attribute_per_element * sizeof(T), - cudaMemcpyDeviceToDevice)); - } - - - // 3) copy from DEVICE to HOST - if ((source_flag & DEVICE) == DEVICE && (target_flag & HOST) == HOST) { - if ((source_flag & source.m_allocated) != source_flag) { - RXMESH_ERROR( - "RXMeshAttribute::copy() copying source is not valid" - " because it was not allocated on host"); - } - if ((target_flag & m_allocated) != target_flag) { - RXMESH_ERROR( - "RXMeshAttribute::copy() copying source is not valid" - " because target (this) was not allocated on device"); - } - - CUDA_ERROR(cudaMemcpy( - m_h_attr, source.m_d_attr, - m_num_mesh_elements * m_num_attribute_per_element * sizeof(T), - cudaMemcpyDeviceToHost)); - } - - - // 4) copy from HOST to DEVICE - if ((source_flag & HOST) == HOST && (target_flag & DEVICE) == DEVICE) { - if ((source_flag & source.m_allocated) != source_flag) { - RXMESH_ERROR( - "RXMeshAttribute::copy() copying source is not valid" - " because it was not allocated on device"); - } - if ((target_flag & 
m_allocated) != target_flag) { - RXMESH_ERROR( - "RXMeshAttribute::copy() copying source is not valid" - " because target (this) was not allocated on host"); - } - - CUDA_ERROR(cudaMemcpy( - m_d_attr, source.m_h_attr, - m_num_mesh_elements * m_num_attribute_per_element * sizeof(T), - cudaMemcpyHostToDevice)); - } - } - - void change_layout(locationT target) - { - // Only supporting HOST target - // If target is HOST, then the layout change only for the HOST - // the user then can copy the data to the DEVICE. - // To change the layout of data in the DEVICE, it should be copied first - // to the HOST, change layout, and then copy back to the DEVICE - - // Only make sense when number of attributes is >1 - if (m_num_attribute_per_element > 1) { - - if ((target & m_allocated) != target) { - RXMESH_ERROR( - "RXMeshAttribute::change_layout() changing layout {} is " - "not valid because it was not allocated", - location_to_string(target)); - return; - } - - if ((target & HOST) != HOST) { - RXMESH_ERROR( - "RXMeshAttribute::change_layout() changing layout {} is " - "not valid because it is not supported", - location_to_string(target)); - return; - } - - if ((target & HOST) == HOST) { - const uint32_t size = - m_num_mesh_elements * m_num_attribute_per_element; - const uint32_t num_cols = (m_layout == AoS) ? - m_num_attribute_per_element : - m_num_mesh_elements; - in_place_matrix_transpose(m_h_attr, m_h_attr + size, - uint64_t(num_cols)); - - m_layout = (m_layout == SoA) ? AoS : SoA; - set_pitch(); - } - } - } - //********************************************************************* - - //********************** BLAS - template - void axpy(const RXMeshAttribute& X, - const Vector alpha, - const Vector beta, - const locationT location = DEVICE, - const uint32_t attribute_id = INVALID32, - cudaStream_t stream = NULL) - { - // Implements - // Y = alpha*X + beta*Y - // where Y is *this. 
- // alpha and beta is passed as vector so different values can be applied - // to each attribute. - // if attribute == INVALID32, then axpy is applied on all attributes - // and alpha (and beta) should be of size m_num_attribute_per_element. - // Otherwise axpy will be only applied on the given attribute number - //(should be less than m_num_attribute_per_element) and alpha (and - // beta) should be of size one - // location tells on which side (host to device) the operation - // will run - - const uint32_t num_attribute = - (attribute_id == INVALID32) ? m_num_attribute_per_element : 1; - assert(N >= num_attribute); - - if ((location & DEVICE) == DEVICE) { - - const uint32_t blocks = - DIVIDE_UP(m_num_mesh_elements, m_block_size); - - CUDA_ERROR(cudaMemcpyAsync(d_axpy_alpha, (void*)&alpha, - sizeof(Vector), - cudaMemcpyHostToDevice, stream)); - CUDA_ERROR(cudaMemcpyAsync(d_axpy_beta, (void*)&beta, - sizeof(Vector), - cudaMemcpyHostToDevice, stream)); - - rxmesh_attribute_axpy<<>>( - X, d_axpy_alpha, *this, d_axpy_beta, attribute_id); - - cudaStreamSynchronize(stream); - } - if ((location & HOST) == HOST) { - for (uint32_t i = 0; i < m_num_mesh_elements; ++i) { - for (uint32_t j = 0; j < m_num_attribute_per_element; ++j) { - (*this)(i, j) = - alpha[j] * X(i, j) + beta[j] * (*this)(i, j); - } - } - } - } - - template - void reduce(Vector& h_output, - const reduceOpT op, - const RXMeshAttribute* other = nullptr, - const locationT location = DEVICE) - { - if (N < m_num_attribute_per_element) { - RXMESH_ERROR( - "RXMeshAttribute::reduce() the output Vector size should be " - ">= the number of attributes per mesh element. 
Output " - "Vector size = {}, number of attributes per mesh element = {}", - N, m_num_attribute_per_element); - } - - - if ((location & DEVICE) == DEVICE) { - if (m_layout != SoA) { - RXMESH_ERROR( - "RXMeshAttribute::reduce is not supported for non SoA " - "layouts on the device"); - } - for (uint32_t i = 0; i < m_num_attribute_per_element; ++i) { - switch (op) { - case SUM: { - cub::DeviceReduce::Sum( - m_d_reduce_temp_storage[i], - m_reduce_temp_storage_bytes, - m_d_attr + i * m_num_mesh_elements, - m_d_reduce_output[i], m_num_mesh_elements, - m_reduce_streams[i]); - break; - } - case MAX: { - cub::DeviceReduce::Max( - m_d_reduce_temp_storage[i], - m_reduce_temp_storage_bytes, - m_d_attr + i * m_num_mesh_elements, - m_d_reduce_output[i], m_num_mesh_elements, - m_reduce_streams[i]); - break; - } - case MIN: { - cub::DeviceReduce::Min( - m_d_reduce_temp_storage[i], - m_reduce_temp_storage_bytes, - m_d_attr + i * m_num_mesh_elements, - m_d_reduce_output[i], m_num_mesh_elements, - m_reduce_streams[i]); - break; - } - case NORM2: { - uint32_t num_blocks = - DIVIDE_UP(m_num_mesh_elements, m_block_size); - // 1st pass - rxmesh_attribute_norm2 - <<>>(*this, i, - m_norm2_temp_buffer[i]); - - // 2nd pass - cub::DeviceReduce::Sum(m_d_reduce_temp_storage[i], - m_reduce_temp_storage_bytes, - m_norm2_temp_buffer[i], - m_d_reduce_output[i], num_blocks, - m_reduce_streams[i]); - break; - } - case DOT: { - if (other == nullptr) { - RXMESH_ERROR( - "RXMeshAttribute::reduce other can not be " - "nullptr for dot product"); - } - uint32_t num_blocks = - DIVIDE_UP(m_num_mesh_elements, m_block_size); - // 1st pass - rxmesh_attribute_dot - <<>>(*this, *other, i, - m_norm2_temp_buffer[i]); - - // 2nd pass - cub::DeviceReduce::Sum(m_d_reduce_temp_storage[i], - m_reduce_temp_storage_bytes, - m_norm2_temp_buffer[i], - m_d_reduce_output[i], num_blocks, - m_reduce_streams[i]); - break; - } - default: { - RXMESH_ERROR( - "RXMeshAttribute::reduce is not supported for the " - "given 
operation"); - break; - } - } - CUDA_ERROR(cudaStreamSynchronize(m_reduce_streams[i])); - CUDA_ERROR(cudaMemcpy(&h_output[i], m_d_reduce_output[i], - sizeof(T), cudaMemcpyDeviceToHost)); - } - } - - if ((location & HOST) == HOST) { - for (uint32_t j = 0; j < m_num_attribute_per_element; ++j) { - for (uint32_t i = 0; i < m_num_mesh_elements; ++i) { - h_output[i] = 0; - if (op == MAX || op == MIN) { - h_output[i] = (*this)(i, j); - } - - switch (op) { - case SUM: { - h_output[i] += (*this)(i, j); - break; - } - case MAX: { - h_output[i] = std::max(h_output[i], (*this)(i, j)); - break; - } - case MIN: { - h_output[i] = std::min(h_output[i], (*this)(i, j)); - break; - } - case NORM2: { - h_output[i] += (*this)(i, j) * (*this)(i, j); - break; - } - case DOT: { - if (other == nullptr) { - RXMESH_ERROR( - "RXMeshAttribute::reduce other can not be " - "nullptr for dot product"); - } - h_output[i] += (*this)(i, j) * (*other)(i, j); - } - default: - break; - } - } - } - } - } - - - //********************************************************************* - - - //********************** Operators - __host__ __device__ __forceinline__ T& operator()(uint32_t idx, - uint32_t attr) - { - - assert(attr < m_num_attribute_per_element); - assert(idx < m_num_mesh_elements); - assert(m_pitch.x > 0 && m_pitch.y > 0); - -#ifdef __CUDA_ARCH__ - return m_d_attr[idx * m_pitch.x + attr * m_pitch.y]; -#else - return m_h_attr[idx * m_pitch.x + attr * m_pitch.y]; -#endif - } - - __host__ __device__ __forceinline__ T& operator()(uint32_t idx) - { - // for m_num_attribute_per_element =1 - - assert(m_num_attribute_per_element == 1); - assert(idx < m_num_mesh_elements); - -#ifdef __CUDA_ARCH__ - return m_d_attr[idx]; -#else - return m_h_attr[idx]; -#endif - } - - __host__ __device__ __forceinline__ T& operator()(uint32_t idx, - uint32_t attr) const - { - - assert(attr < m_num_attribute_per_element); - assert(idx < m_num_mesh_elements); - -#ifdef __CUDA_ARCH__ - return m_d_attr[idx * m_pitch.x + attr * 
m_pitch.y]; -#else - return m_h_attr[idx * m_pitch.x + attr * m_pitch.y]; -#endif - } - - __host__ __device__ __forceinline__ T& operator()(uint32_t idx) const - { - // for m_num_attribute_per_element =1 - - assert(m_num_attribute_per_element == 1); - assert(idx < m_num_mesh_elements); - -#ifdef __CUDA_ARCH__ - return m_d_attr[idx]; -#else - return m_h_attr[idx]; -#endif - } - - __host__ __device__ __forceinline__ T* operator->() const - { -#ifdef __CUDA_ARCH__ - return m_d_attr; -#else - return m_h_attr; -#endif - } - - __host__ __device__ __forceinline__ bool is_empty() const - { -#ifdef __CUDA_ARCH__ - - return (m_d_attr == nullptr) ? true : false; -#else - return (m_h_attr == nullptr) ? true : false; - -#endif - } - //********************************************************************* - - - private: - void set_pitch() - { - if (m_layout == AoS) { - m_pitch.x = m_num_attribute_per_element; - m_pitch.y = 1; - } else if (m_layout == SoA) { - m_pitch.x = 1; - m_pitch.y = m_num_mesh_elements; - } else { - RXMESH_ERROR("RXMeshAttribute::set_pitch() unknown layout"); - } - } - //********************** Member Variables - char* m_name; - uint32_t m_num_mesh_elements; - uint32_t m_num_attribute_per_element; - locationT m_allocated; - T* m_h_attr; - T* m_d_attr; - layoutT m_layout; - // to index: id*m_pitch.x + attr*m_pitch.y - uint2 m_pitch; - - constexpr static uint32_t m_block_size = 256; - - // temp array for alpha and beta parameters of axpy allocated on the device - T * d_axpy_alpha, *d_axpy_beta; - bool m_is_axpy_allocated; - - // temp array for reduce operations - bool m_is_reduce_allocated; - size_t m_reduce_temp_storage_bytes; - void** m_d_reduce_temp_storage; - T** m_d_reduce_output; - cudaStream_t* m_reduce_streams; - T** m_norm2_temp_buffer; - //********************************************************************* -}; -} // namespace RXMESH \ No newline at end of file diff --git a/include/rxmesh/rxmesh_context.h b/include/rxmesh/rxmesh_context.h deleted 
file mode 100644 index 31987e79..00000000 --- a/include/rxmesh/rxmesh_context.h +++ /dev/null @@ -1,284 +0,0 @@ -#pragma once - -#include -#include "rxmesh/util/macros.h" - -namespace RXMESH { - -// context for the mesh parameters and pointers. everything is allocated -// on rxmesh. this class is meant to be a vehicle to copy various parameters -// to the device kernels. -// TODO make sure that __align__(16) is the right one -class __align__(16) RXMeshContext -{ - - public: - RXMeshContext() - : m_num_edges(0), m_num_faces(0), m_num_vertices(0), m_face_degree(0), - m_max_valence(0), m_max_edge_incident_faces(0), - m_max_face_adjacent_faces(0), m_num_patches(0), - m_d_face_patch(nullptr), m_d_edge_patch(nullptr), - m_d_vertex_patch(nullptr), m_d_patches_ltog_v(nullptr), - m_d_patches_ltog_e(nullptr), m_d_patches_ltog_f(nullptr), - m_d_ad_size_ltog_v(nullptr), m_d_ad_size_ltog_e(nullptr), - m_d_ad_size_ltog_f(nullptr), m_d_patches_edges(nullptr), - m_d_patches_faces(nullptr), m_d_patch_distribution_v(nullptr), - m_d_patch_distribution_e(nullptr), m_d_patch_distribution_f(nullptr), - m_d_ad_size(nullptr), m_d_owned_size(nullptr), - m_d_neighbour_patches(nullptr), m_d_neighbour_patches_offset(nullptr) - - { - m_d_max_size.x = m_d_max_size.y = 0; - } - - void init( - const uint32_t num_edges, const uint32_t num_faces, - const uint32_t num_vertices, const uint32_t face_degree, - const uint32_t max_valence, const uint32_t max_edge_incident_faces, - const uint32_t max_face_adjacent_faces, const uint32_t num_patches, - uint32_t* d_face_patch, uint32_t* d_edge_patch, - uint32_t* d_vertex_patch, uint32_t* d_patches_ltog_v, - uint32_t* d_patches_ltog_e, uint32_t* d_patches_ltog_f, - uint2* d_ad_size_ltog_v, uint2* d_ad_size_ltog_e, - uint2* d_ad_size_ltog_f, uint16_t* d_patches_edges, - uint16_t* d_patches_faces, uint4* d_ad_size, uint4* d_owned_size, - uint2 max_size, uint32_t* d_patch_distribution_v, - uint32_t* d_patch_distribution_e, uint32_t* d_patch_distribution_f, - 
uint32_t* d_neighbour_patches, uint32_t* d_neighbour_patches_offset) - { - - m_num_edges = num_edges; - m_num_faces = num_faces; - m_num_vertices = num_vertices; - m_face_degree = face_degree; - m_max_valence = max_valence; - m_max_edge_incident_faces = max_edge_incident_faces; - m_max_face_adjacent_faces = max_face_adjacent_faces; - m_num_patches = num_patches; - m_d_face_patch = d_face_patch; - m_d_edge_patch = d_edge_patch; - m_d_vertex_patch = d_vertex_patch; - m_d_patches_ltog_v = d_patches_ltog_v; - m_d_patches_ltog_e = d_patches_ltog_e; - m_d_patches_ltog_f = d_patches_ltog_f; - m_d_ad_size_ltog_v = d_ad_size_ltog_v; - m_d_ad_size_ltog_e = d_ad_size_ltog_e; - m_d_ad_size_ltog_f = d_ad_size_ltog_f; - m_d_patches_edges = d_patches_edges; - m_d_patches_faces = d_patches_faces; - m_d_ad_size = d_ad_size; - m_d_owned_size = d_owned_size; - m_d_max_size = max_size; - m_d_patch_distribution_v = d_patch_distribution_v; - m_d_patch_distribution_e = d_patch_distribution_e; - m_d_patch_distribution_f = d_patch_distribution_f; - m_d_neighbour_patches = d_neighbour_patches; - m_d_neighbour_patches_offset = d_neighbour_patches_offset; - } - - - template - __device__ void print_data(const dataT* arr, const uint32_t start_id, - const uint32_t len, int shift = 0) const - { - printf(" start_id = %u, len = %u\n", start_id, len); - - uint32_t end = len + start_id; - for (uint32_t i = start_id; i < end; ++i) { - printf(" [%u] ", arr[i] >> shift); - if (i % 20 == 0 && i != start_id) { - printf("\n"); - } - } - printf("\n\n"); - } - - __device__ void print_patch(uint32_t p_id) const - { - // print all relevant data of a single patch - - // if (threadIdx.x == 0){ - printf("\n ********* p_id = %u *********\n", p_id); - printf(" global_num_vertices=%u \n", m_num_vertices); - printf(" global_num_edges=%u \n", m_num_edges); - printf(" global_num_faces=%u \n", m_num_faces); - printf(" global_num_patches=%u \n", m_num_patches); - - printf(" patch #vertices = %u, start_id= %u \n", - 
m_d_ad_size_ltog_v[p_id].y, m_d_ad_size_ltog_v[p_id].x); - printf(" patch #edges = %u, start_id= %u\n", m_d_ad_size_ltog_e[p_id].y, - m_d_ad_size_ltog_e[p_id].x); - printf(" patch #faces = %u, start_id= %u\n", m_d_ad_size_ltog_f[p_id].y, - m_d_ad_size_ltog_f[p_id].x); - - printf("\n ** d_ltog_v **\n"); - print_data(m_d_patches_ltog_v, uint32_t(m_d_ad_size_ltog_v[p_id].x), - uint32_t(m_d_ad_size_ltog_v[p_id].y), 1); - - printf("\n ** d_ltog_e **\n"); - print_data(m_d_patches_ltog_e, uint32_t(m_d_ad_size_ltog_e[p_id].x), - uint32_t(m_d_ad_size_ltog_e[p_id].y), 1); - - printf("\n ** d_ltog_f **\n"); - print_data(m_d_patches_ltog_f, uint32_t(m_d_ad_size_ltog_f[p_id].x), - uint32_t(m_d_ad_size_ltog_f[p_id].y), 1); - - - printf("\n ** d_edges **\n"); - print_data(m_d_patches_edges, uint32_t(m_d_ad_size[p_id].x), - uint32_t(m_d_ad_size[p_id].y)); - - printf("\n ** d_faces **\n"); - print_data(m_d_patches_faces, uint32_t(m_d_ad_size[p_id].z), - uint32_t(m_d_ad_size[p_id].w), 1); - //} - } - - - //********************** Getters - __device__ __forceinline__ uint32_t get_num_edges() const - { - return m_num_edges; - } - __device__ __forceinline__ uint32_t get_num_faces() const - { - return m_num_faces; - } - __device__ __forceinline__ uint32_t get_num_vertices() const - { - return m_num_vertices; - } - __device__ __forceinline__ uint32_t get_face_degree() const - { - return m_face_degree; - } - __device__ __forceinline__ uint32_t get_max_valence() const - { - return m_max_valence; - } - __device__ __forceinline__ uint32_t get_max_edge_incident_faces() const - { - return m_max_edge_incident_faces; - } - - __device__ __forceinline__ uint32_t get_max_edge_adjacent_faces() const - { - return m_max_face_adjacent_faces; - } - __device__ __forceinline__ uint32_t get_num_patches() const - { - return m_num_patches; - } - __device__ __forceinline__ uint32_t* get_face_patch() const - { - return m_d_face_patch; - } - __device__ __forceinline__ uint32_t* get_edge_patch() const - { - 
return m_d_edge_patch; - } - __device__ __forceinline__ uint32_t* get_vertex_patch() const - { - return m_d_vertex_patch; - } - __device__ __forceinline__ uint32_t* get_patches_ltog_v() const - { - return m_d_patches_ltog_v; - } - __device__ __forceinline__ uint32_t* get_patches_ltog_e() const - { - return m_d_patches_ltog_e; - } - __device__ __forceinline__ uint32_t* get_patches_ltog_f() const - { - return m_d_patches_ltog_f; - } - __device__ __forceinline__ uint2* get_ad_size_ltog_v() const - { - return m_d_ad_size_ltog_v; - } - __device__ __forceinline__ uint2* get_ad_size_ltog_e() const - { - return m_d_ad_size_ltog_e; - } - __device__ __forceinline__ uint2* get_ad_size_ltog_f() const - { - return m_d_ad_size_ltog_f; - } - __device__ __forceinline__ uint16_t* get_patches_edges() const - { - return m_d_patches_edges; - } - __device__ __forceinline__ uint16_t* get_patches_faces() const - { - return m_d_patches_faces; - } - __device__ __forceinline__ uint4* get_ad_size() const - { - return m_d_ad_size; - } - __device__ __forceinline__ uint4* get_size_owned() const - { - return m_d_owned_size; - } - __device__ __forceinline__ uint2 get_max_size() const - { - return m_d_max_size; - } - __device__ __forceinline__ uint32_t* get_vertex_distribution() const - { - return m_d_patch_distribution_v; - } - __device__ __forceinline__ uint32_t* get_edge_distribution() const - { - return m_d_patch_distribution_e; - } - __device__ __forceinline__ uint32_t* get_face_distribution() const - { - return m_d_patch_distribution_f; - } - //********************************************************************** - - static __device__ __host__ __forceinline__ void unpack_edge_dir( - const uint16_t edge_dir, uint16_t& edge, flag_t& dir) - { - dir = (edge_dir & 1) != 0; - edge = edge_dir >> 1; - } - - private: - // mesh elements count - uint32_t m_num_edges, m_num_faces, m_num_vertices, m_face_degree, - m_max_valence, m_max_edge_incident_faces, m_max_face_adjacent_faces, - m_num_patches; - - 
- // max max_num_edges_per_patch*2 for all patches rounded to multiple of 32 - // max max_num_faces_per_patch*m_face_degree for all patches rounded to - // multiple of 32 - uint2 m_d_max_size; - - //** face/vertex/edge patch (indexed by in global space) - uint32_t *m_d_face_patch, *m_d_edge_patch, *m_d_vertex_patch; - - // mapping - uint32_t *m_d_patches_ltog_v, *m_d_patches_ltog_e, *m_d_patches_ltog_f; - uint2 * m_d_ad_size_ltog_v, *m_d_ad_size_ltog_e, *m_d_ad_size_ltog_f; - - // incidence - uint16_t *m_d_patches_edges, *m_d_patches_faces; - - // scanned histogram of the mesh elements distribution per patch - uint32_t *m_d_patch_distribution_v, *m_d_patch_distribution_e, - *m_d_patch_distribution_f; - - //.x edge address .y edge size .z face address .w face size - uint4* m_d_ad_size; - - //.x faces .y edges .z vertex - uint4* m_d_owned_size; - - // patch neighbour - uint32_t *m_d_neighbour_patches, *m_d_neighbour_patches_offset; -}; -} // namespace RXMESH \ No newline at end of file diff --git a/include/rxmesh/rxmesh_static.h b/include/rxmesh/rxmesh_static.h index 92f6d84c..af851b5c 100644 --- a/include/rxmesh/rxmesh_static.h +++ b/include/rxmesh/rxmesh_static.h @@ -1,46 +1,191 @@ #pragma once #include +#include +#include + #include -#include "rxmesh/kernels/prototype.cuh" + +#include "rxmesh/attribute.h" +#include "rxmesh/handle.h" +#include "rxmesh/kernels/for_each.cuh" #include "rxmesh/launch_box.h" #include "rxmesh/rxmesh.h" -#include "rxmesh/rxmesh_util.h" +#include "rxmesh/types.h" #include "rxmesh/util/log.h" #include "rxmesh/util/timer.h" -namespace RXMESH { +namespace rxmesh { -template -class RXMeshStatic : public RXMesh +/** + * @brief This class is responsible for query operations of static meshes. It + * extends RXMesh with methods needed to launch kernel and do computation on the + * mesh as well as managing mesh attributes + */ +class RXMeshStatic : public RXMesh { - // This class is responsible for query operation of static meshes. 
It - // inherits the constructor and build methods from the base class RXMesh - // and create new method(s) for queries public: - //********************** Constructors/Destructors RXMeshStatic(const RXMeshStatic&) = delete; + /** + * @brief Main constructor used to initialize internal member variables + * @param fv Face incident vertices as read from an obj file + * @param quite run in quite mode + */ RXMeshStatic(std::vector>& fv, - std::vector>& coordinates, - const bool sort = false, - const bool quite = true) - : RXMesh(fv, coordinates, sort, quite){}; + const bool quite = false) + : RXMesh(fv, quite) + { + m_attr_container = std::make_shared(); + }; virtual ~RXMeshStatic() { } - //********************************************************************* /** - * prepare_launch_box() + * @brief Apply a lambda function on all vertices in the mesh + * @tparam LambdaT type of the lambda function (inferred) + * @param location the execution location + * @param apply lambda function to be applied on all vertices. 
The lambda + * function signature takes a VertexHandle + * @param stream the stream used to run the kernel in case of DEVICE + * execution location + */ + template + void for_each_vertex(locationT location, + LambdaT apply, + cudaStream_t stream = NULL) + { + if ((location & HOST) == HOST) { + const int num_patches = this->get_num_patches(); +#pragma omp parallel for + for (int p = 0; p < num_patches; ++p) { + for (uint16_t v = 0; + v < this->m_h_patches_info[p].num_owned_vertices; + ++v) { + const VertexHandle v_handle(static_cast(p), v); + apply(v_handle); + } + } + } + + if ((location & DEVICE) == DEVICE) { + if constexpr (IS_HD_LAMBDA(LambdaT) || IS_D_LAMBDA(LambdaT)) { + + const int num_patches = this->get_num_patches(); + const int threads = 256; + detail::for_each_vertex<<>>( + num_patches, this->m_d_patches_info, apply); + } else { + RXMESH_ERROR( + "RXMeshStatic::for_each_vertex() Input lambda function " + "should be annotated with __device__ for execution on " + "device"); + } + } + } + + /** + * @brief Apply a lambda function on all edges in the mesh + * @tparam LambdaT type of the lambda function (inferred) + * @param location the execution location + * @param apply lambda function to be applied on all edges. 
The lambda + * function signature takes a EdgeHandle + * @param stream the stream used to run the kernel in case of DEVICE + * execution location + */ + template + void for_each_edge(locationT location, + LambdaT apply, + cudaStream_t stream = NULL) + { + if ((location & HOST) == HOST) { + const int num_patches = this->get_num_patches(); +#pragma omp parallel for + for (int p = 0; p < num_patches; ++p) { + for (uint16_t e = 0; + e < this->m_h_patches_info[p].num_owned_edges; + ++e) { + const EdgeHandle e_handle(static_cast(p), e); + apply(e_handle); + } + } + } + + if ((location & DEVICE) == DEVICE) { + if constexpr (IS_HD_LAMBDA(LambdaT) || IS_D_LAMBDA(LambdaT)) { + + const int num_patches = this->get_num_patches(); + const int threads = 256; + detail::for_each_edge<<>>( + num_patches, this->m_d_patches_info, apply); + } else { + RXMESH_ERROR( + "RXMeshStatic::for_each_edge() Input lambda function " + "should be annotated with __device__ for execution on " + "device"); + } + } + } + + /** + * @brief Apply a lambda function on all faces in the mesh + * @tparam LambdaT type of the lambda function (inferred) + * @param location the execution location + * @param apply lambda function to be applied on all faces. 
The lambda + * function signature takes a FaceHandle + * @param stream the stream used to run the kernel in case of DEVICE + * execution location + */ + template + void for_each_face(locationT location, + LambdaT apply, + cudaStream_t stream = NULL) + { + if ((location & HOST) == HOST) { + const int num_patches = this->get_num_patches(); +#pragma omp parallel for + for (int p = 0; p < num_patches; ++p) { + for (int f = 0; f < this->m_h_patches_info[p].num_owned_faces; + ++f) { + const FaceHandle f_handle(static_cast(p), f); + apply(f_handle); + } + } + } + + if ((location & DEVICE) == DEVICE) { + if constexpr (IS_HD_LAMBDA(LambdaT) || IS_D_LAMBDA(LambdaT)) { + + const int num_patches = this->get_num_patches(); + const int threads = 256; + detail::for_each_face<<>>( + num_patches, this->m_d_patches_info, apply); + } else { + RXMESH_ERROR( + "RXMeshStatic::for_each_face() Input lambda function " + "should be annotated with __device__ for execution on " + "device"); + } + } + } + + /** + * @brief populate the launch_box with grid size and dynamic shared memory + * needed for kernel launch * TODO provide variadic version of this function that can accept multiple * ops + * @param op Query operation done inside this the kernel + * @param launch_box input launch box to be populated + * @param is_higher_query if the query done will be a higher ordered e.g., + * k-ring + * @param oriented if the query is oriented. Valid only for Op::VV queries */ template void prepare_launch_box(const Op op, LaunchBox& launch_box, - const bool is_higher_query = false, + const void* kernel, const bool oriented = false) const { static_assert( @@ -51,18 +196,380 @@ class RXMeshStatic : public RXMesh launch_box.blocks = this->m_num_patches; - const uint32_t output_fixed_offset = - (op == Op::EV) ? 2 : ((op == Op::FV || op == Op::FE) ? 
3 : 0); - this->template calc_shared_memory( - op, launch_box, is_higher_query, oriented); + op, launch_box, kernel, oriented); + } + + + /** + * @brief Adding a new face attribute + * @tparam T type of the attribute + * @param name of the attribute. Should not collide with other attributes + * names + * @param num_attributes number of the attributes + * @param location where to allocate the attributes + * @param layout as SoA or AoS + * operations + * @return shared pointer to the created attribute + */ + template + std::shared_ptr> add_face_attribute( + const std::string& name, + uint32_t num_attributes, + locationT location = LOCATION_ALL, + layoutT layout = SoA) + { + return m_attr_container->template add>( + name.c_str(), + this->m_h_num_owned_f, + num_attributes, + location, + layout); + } + + /** + * @brief Adding a new face attribute by reading values from a host buffer + * f_attributes where the order of faces is the same as the order of + * faces given to the constructor.The attributes are populated on device + * and host + * @tparam T type of the attribute + * @param name of the attribute. Should not collide with other attributes + * names + * @param layout as SoA or AoS + * operations + * @return shared pointer to the created attribute + * TODO implement this + */ + template + std::shared_ptr> add_face_attribute( + const std::vector>& f_attributes, + const std::string& name, + layoutT layout = SoA) + { + } + + /** + * @brief Adding a new face attribute by reading values from a host buffer + * f_attributes where the order of faces is the same as the order of + * faces given to the constructor.The attributes are populated on device + * and host + * @tparam T type of the attribute + * @param name of the attribute. 
Should not collide with other attributes + * names + * @param layout as SoA or AoS + * operations + * @return shared pointer to the created attribute + * TODO implement this + */ + template + std::shared_ptr> add_face_attribute( + const std::vector& f_attributes, + const std::string& name, + layoutT layout = SoA) + { + } + + /** + * @brief Adding a new edge attribute + * @tparam T type of the attribute + * @param name of the attribute. Should not collide with other attributes + * names + * @param num_attributes number of the attributes + * @param location where to allocate the attributes + * @param layout as SoA or AoS + * operations + * @return shared pointer to the created attribute + */ + template + std::shared_ptr> add_edge_attribute( + const std::string& name, + uint32_t num_attributes, + locationT location = LOCATION_ALL, + layoutT layout = SoA) + { + return m_attr_container->template add>( + name.c_str(), + this->m_h_num_owned_e, + num_attributes, + location, + layout); + } + + /** + * @brief Adding a new vertex attribute + * @tparam T type of the attribute + * @param name of the attribute. Should not collide with other attributes + * names + * @param num_attributes number of the attributes + * @param location where to allocate the attributes + * @param layout as SoA or AoS + * operations + * @return shared pointer to the created attribute + */ + template + std::shared_ptr> add_vertex_attribute( + const std::string& name, + uint32_t num_attributes, + locationT location = LOCATION_ALL, + layoutT layout = SoA) + { + return m_attr_container->template add>( + name.c_str(), + this->m_h_num_owned_v, + num_attributes, + location, + layout); + } + + /** + * @brief Adding a new vertex attribute by reading values from a host buffer + * v_attributes where the order of vertices is the same as the order of + * vertices given to the constructor. 
The attributes are populated on device + * and host + * @tparam T type of the attribute + * @param v_attributes attributes to read + * @param name of the attribute. Should not collide with other attributes + * names + * @param layout as SoA or AoS + * operations + * @return shared pointer to the created attribute + */ + template + std::shared_ptr> add_vertex_attribute( + const std::vector>& v_attributes, + const std::string& name, + layoutT layout = SoA) + { + if (v_attributes.empty()) { + RXMESH_ERROR( + "RXMeshStatic::add_vertex_attribute() input attribute is " + "empty"); + } + + if (v_attributes.size() != get_num_vertices()) { + RXMESH_ERROR( + "RXMeshStatic::add_vertex_attribute() input attribute size " + "({}) is not the same as number of vertices in the input mesh " + "({})", + v_attributes.size(), + get_num_vertices()); + } + + uint32_t num_attributes = v_attributes[0].size(); + + auto ret = m_attr_container->template add>( + name.c_str(), + this->m_h_num_owned_v, + num_attributes, + LOCATION_ALL, + layout); + + // populate the attribute before returning it + const int num_patches = this->get_num_patches(); +#pragma omp parallel for + for (int p = 0; p < num_patches; ++p) { + for (uint16_t v = 0; v < this->m_h_num_owned_v[p]; ++v) { + + const VertexHandle v_handle(static_cast(p), v); + + uint32_t global_v = m_h_patches_ltog_v[p][v]; + + for (uint32_t a = 0; a < num_attributes; ++a) { + (*ret)(v_handle, a) = v_attributes[global_v][a]; + } + } + } + + // move to device + ret->move(rxmesh::HOST, rxmesh::DEVICE); + return ret; + } + + /** + * @brief Adding a new vertex attribute by reading values from a host buffer + * v_attributes where the order of vertices is the same as the order of + * vertices given to the constructor. The attributes are populated on device + * and host + * @tparam T type of the attribute + * @param v_attributes attributes to read + * @param name of the attribute. 
Should not collide with other attributes + * names + * @param layout as SoA or AoS + * operations + * @return shared pointer to the created attribute + */ + template + std::shared_ptr> add_vertex_attribute( + const std::vector& v_attributes, + const std::string& name, + layoutT layout = SoA) + { + if (v_attributes.empty()) { + RXMESH_ERROR( + "RXMeshStatic::add_vertex_attribute() input attribute is " + "empty"); + } + + if (v_attributes.size() != get_num_vertices()) { + RXMESH_ERROR( + "RXMeshStatic::add_vertex_attribute() input attribute size " + "({}) is not the same as number of vertices in the input mesh " + "({})", + v_attributes.size(), + get_num_vertices()); + } + + uint32_t num_attributes = 1; + + auto ret = m_attr_container->template add>( + name.c_str(), + this->m_h_num_owned_v, + num_attributes, + LOCATION_ALL, + layout); + + // populate the attribute before returning it + const int num_patches = this->get_num_patches(); +#pragma omp parallel for + for (int p = 0; p < num_patches; ++p) { + for (uint16_t v = 0; v < this->m_h_num_owned_v[p]; ++v) { + + const VertexHandle v_handle(static_cast(p), v); + + uint32_t global_v = m_h_patches_ltog_v[p][v]; + + (*ret)(v_handle, 0) = v_attributes[global_v]; + } + } + + // move to device + ret->move(rxmesh::HOST, rxmesh::DEVICE); + return ret; + } + + /** + * @brief Checks if an attribute exists given its name + * @param name the attribute name + * @return True if the attribute exists. False otherwise. + */ + bool does_attribute_exist(const std::string& name) + { + return m_attr_container->does_exist(name.c_str()); + } + + /** + * @brief Remove an attribute. 
Could be vertex, edge, or face attribute + * @param name the attribute name + */ + void remove_attribute(const std::string& name) + { + if (!this->does_attribute_exist(name)) { + RXMESH_WARN( + "RXMeshStatic::remove_attribute() trying to remove an " + "attribute that does not exit with name {}", + name); + return; + } + + m_attr_container->remove(name.c_str()); + } + + + /** + * @brief Map a vertex handle into a global index as seen in the input + * to RXMeshStatic + * @param vh input vertex handle + * @return the global index of vh + */ + uint32_t map_to_global(const VertexHandle vh) const + { + auto pl = vh.unpack(); + return m_h_patches_ltog_v[pl.first][pl.second]; + } + + /** + * @brief Map an edge handle into a global index + * @param eh input edge handle + * @return the global index of eh + */ + uint32_t map_to_global(const EdgeHandle eh) const + { + auto pl = eh.unpack(); + return m_h_patches_ltog_e[pl.first][pl.second]; + } + + /** + * @brief Map a face handle into a global index as seen in the input + * to RXMeshStatic + * @param vh input face handle + * @return the global index of fh + */ + uint32_t map_to_global(const FaceHandle fh) const + { + auto pl = fh.unpack(); + return m_h_patches_ltog_f[pl.first][pl.second]; + } + + /** + * @brief Export the mesh to obj file + * @tparam T type of vertices coordinates + * @param filename the output file + * @param coords vertices coordinates + */ + template + void export_obj(const std::string& filename, + const VertexAttribute& coords) + { + std::string fn = filename; + std::fstream file(fn, std::ios::out); + file.precision(30); + + uint32_t num_v = 0; + for (uint32_t p = 0; p < this->m_num_patches; ++p) { + + const uint32_t p_num_vertices = + this->m_h_patches_info[p].num_vertices; + + for (uint16_t v = 0; v < p_num_vertices; ++v) { + uint16_t v_id = v; + uint32_t p_id = p; + if (v >= this->m_h_patches_info[p].num_owned_vertices) { + uint16_t l = + v - this->m_h_patches_info[p].num_owned_vertices; + v_id = 
this->m_h_patches_info[p].not_owned_id_v[l].id; + p_id = this->m_h_patches_info[p].not_owned_patch_v[l]; + } + VertexHandle vh(p_id, {v_id}); + file << "v " << coords(vh, 0) << " " << coords(vh, 1) << " " + << coords(vh, 2) << std::endl; + } + + const uint32_t p_num_faces = + this->m_h_patches_info[p].num_owned_faces; + + for (uint32_t f = 0; f < p_num_faces; ++f) { + + file << "f "; + for (uint32_t e = 0; e < 3; ++e) { + uint16_t edge = this->m_h_patches_info[p].fe[3 * f + e].id; + flag_t dir(0); + Context::unpack_edge_dir(edge, edge, dir); + uint16_t e_id = (2 * edge) + dir; + uint16_t v = this->m_h_patches_info[p].ev[e_id].id; + file << v + num_v + 1 << " "; + } + file << std::endl; + } + + num_v += p_num_vertices; + } } protected: template void calc_shared_memory(const Op op, LaunchBox& launch_box, - const bool is_higher_query, + const void* kernel, const bool oriented = false) const { // Operations that uses matrix transpose needs a template parameter @@ -75,7 +582,8 @@ class RXMeshStatic : public RXMesh "RXMeshStatic::calc_shared_memory() " "TRANSPOSE_ITEM_PER_THREAD = {} needs " "to be increased for op = {}", - TRANSPOSE_ITEM_PER_THREAD, op_to_string(op)); + TRANSPOSE_ITEM_PER_THREAD, + op_to_string(op)); } } else if (op == Op::VE || op == Op::EF || op == Op::FF) { if (3 * this->m_max_faces_per_patch > @@ -84,7 +592,8 @@ class RXMeshStatic : public RXMesh "RXMeshStatic::calc_shared_memory() " "TRANSPOSE_ITEM_PER_THREAD = {} needs " "to be increased for op = {}", - TRANSPOSE_ITEM_PER_THREAD, op_to_string(op)); + TRANSPOSE_ITEM_PER_THREAD, + op_to_string(op)); } } @@ -105,40 +614,92 @@ class RXMeshStatic : public RXMesh launch_box.smem_bytes_dyn = 0; if (op == Op::FE) { - // only faces will be loaded and no extra shared memory is needed + // only FE will be loaded launch_box.smem_bytes_dyn = 3 * this->m_max_faces_per_patch * sizeof(uint16_t); + // to load not-owned edges local and patch id + launch_box.smem_bytes_dyn += + this->m_max_not_owned_edges * + 
(sizeof(uint16_t) + sizeof(uint32_t)) + + sizeof(uint16_t); } else if (op == Op::EV) { - // only edges will be loaded and no extra shared memory is needed + // only EV will be loaded launch_box.smem_bytes_dyn = 2 * this->m_max_edges_per_patch * sizeof(uint16_t); + // to load not-owned vertices local and patch id + launch_box.smem_bytes_dyn += this->m_max_not_owned_vertices * + (sizeof(uint16_t) + sizeof(uint32_t)); } else if (op == Op::FV) { - // We load both faces and edges. We don't change edges. - // faces are updated to contain FV instead of FE by reading from - // edges + // We load both FE and EV. We don't change EV. + // FE are updated to contain FV instead of FE by reading from + // EV launch_box.smem_bytes_dyn = 3 * this->m_max_faces_per_patch * sizeof(uint16_t) + 2 * this->m_max_edges_per_patch * sizeof(uint16_t); + // no need for extra memory to load not-owned vertices local and + // patch id. We load them and overwrite EV. + const uint32_t not_owned_v_bytes = + this->m_max_not_owned_vertices * + (sizeof(uint16_t) + sizeof(uint32_t)); + const uint32_t edges_bytes = + 2 * this->m_max_edges_per_patch * sizeof(uint16_t); + if (not_owned_v_bytes > edges_bytes) { + // launch_box.smem_bytes_dyn += not_owned_v_bytes - edges_bytes; + RXMESH_ERROR( + "RXMeshStatic::calc_shared_memory() FV query might fail!"); + } } else if (op == Op::VE) { - // load edges and then transpose it in place + // load EV and then transpose it in place // The transpose needs two buffer; one for prefix sum and another // for the actual output - // The prefix sum will be stored in place (where edges are loaded) + // The prefix sum will be stored in place (where EV are loaded) // The output will be stored in another buffer with size equal to - // the edges since this output buffer will stored the nnz and the - // nnz of a matrix the same before/after transpose + // the EV (i.e., 2*#edges) since this output buffer will stored the + // nnz and the nnz of a matrix the same before/after 
transpose launch_box.smem_bytes_dyn = - (2 * 2 * this->m_max_edges_per_patch) * sizeof(uint16_t); - } else if (op == Op::EF || op == Op::VF) { - // same as above but with faces + (2 * 2 * this->m_max_edges_per_patch) * sizeof(uint16_t) + + sizeof(uint16_t); + + // to load the not-owned edges local and patch id + launch_box.smem_bytes_dyn += this->m_max_not_owned_edges * + (sizeof(uint16_t) + sizeof(uint32_t)); + } else if (op == Op::EF) { + // same as Op::VE but with faces launch_box.smem_bytes_dyn = (2 * 3 * this->m_max_faces_per_patch) * sizeof(uint16_t) + + sizeof(uint16_t) + sizeof(uint16_t); + + // to load the not-owned faces local and patch id + launch_box.smem_bytes_dyn += this->m_max_not_owned_faces * + (sizeof(uint16_t) + sizeof(uint32_t)); + } else if (op == Op::VF) { + // load EV and FE simultaneously. changes FE to FV using EV. Then + // transpose FV in place and use EV to store the values/output while + // using FV to store the prefix sum. Thus, the space used to store + // EV should be max(3*#faces, 2*#edges) + launch_box.smem_bytes_dyn = + 3 * this->m_max_faces_per_patch * sizeof(uint16_t) + + std::max(3 * this->m_max_faces_per_patch, + 2 * this->m_max_edges_per_patch) * + sizeof(uint16_t) + sizeof(uint16_t); + + // to load the not-owned faces local and patch id + launch_box.smem_bytes_dyn += this->m_max_not_owned_faces * + (sizeof(uint16_t) + sizeof(uint32_t)); } else if (op == Op::VV) { - // similar to VE but we also need to store the edges (EV) even after - // we do the transpose. + // similar to VE but we also need to store the EV even after + // we do the transpose launch_box.smem_bytes_dyn = (3 * 2 * this->m_max_edges_per_patch) * sizeof(uint16_t); + // no need for extra memory to load not-owned local and patch id. 
+ // We load them and overwrite the extra EV + if (this->m_max_not_owned_vertices * + (sizeof(uint16_t) + sizeof(uint32_t)) > + (2 * this->m_max_edges_per_patch) * sizeof(uint16_t)) { + RXMESH_ERROR( + "RXMeshStatic::calc_shared_memory() VV query might fail!"); + } } else if (op == Op::FF) { // FF needs to store FE and EF along side with the output itself // FE needs 3*max_num_faces @@ -153,6 +714,8 @@ class RXMeshStatic : public RXMesh 4 * this->m_max_faces_per_patch // FF ) * sizeof(uint16_t); + // no need for extra memory to load not-owned faces local and + // patch id. We load them and overwrite FE. } if (op == Op::VV && oriented) { @@ -162,160 +725,56 @@ class RXMeshStatic : public RXMesh // Since oriented is only done on manifold, EF needs only // 2*max_num_edges since every edge is neighbor to maximum of two // faces (which we write on the same place as the extra EV) - launch_box.smem_bytes_dyn += (/*2 * this->m_max_edges_per_patch +*/ - 3 * this->m_max_faces_per_patch) * - sizeof(uint16_t); - } - - // to store output ltog map without the need to overlap it with - // where we store mesh edges/faces - // The +1 is for padding - if (op == Op::EV || op == Op::FV /*|| op == Op::VV*/) { - // For VV, we overwrite the extra storage we used above - // to store the mapping which is more than enough to store the - // vertices ltog launch_box.smem_bytes_dyn += - (this->m_max_vertices_per_patch + 1) * sizeof(uint32_t); - - } else if (op == Op::FE || op == Op::VE || op == Op::EE) { - launch_box.smem_bytes_dyn += - (this->m_max_edges_per_patch + 1) * sizeof(uint32_t); - } else if (op == Op::VF || op == Op::EF /*|| op == Op::FF*/) { - launch_box.smem_bytes_dyn += - (this->m_max_faces_per_patch + 1) * sizeof(uint32_t); + (3 * this->m_max_faces_per_patch) * sizeof(uint16_t); } - launch_box.smem_bytes_static = check_shared_memory( - op, launch_box.smem_bytes_dyn, is_higher_query); + check_shared_memory(op, + launch_box.smem_bytes_dyn, + launch_box.smem_bytes_static, + 
launch_box.num_registers_per_thread, + kernel); if (!this->m_quite) { RXMESH_TRACE( - "RXMesh::calc_shared_memory() launching {} blocks with {} " - "threads on the device", - launch_box.blocks, blockThreads); + "RXMeshStatic::calc_shared_memory() launching {} blocks with " + "{} threads on the device", + launch_box.blocks, + blockThreads); } } template - uint32_t check_shared_memory(const Op op, - const uint32_t smem_bytes_dyn, - bool is_higher_query) const + void check_shared_memory(const Op op, + const uint32_t smem_bytes_dyn, + size_t& smem_bytes_static, + uint32_t& num_reg_per_thread, + const void* kernel) const { // check if total shared memory (static + dynamic) consumed by // k_base_query are less than the max shared per block - cudaFuncAttributes func_attr; - switch (op) { - case Op::VV: { - if (is_higher_query) { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, - detail::higher_query_prototype)); - } else { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, detail::query_prototype)); - } - - break; - } - case Op::VE: { - if (is_higher_query) { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, - detail::higher_query_prototype)); - } else { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, detail::query_prototype)); - } - break; - } - case Op::VF: { - if (is_higher_query) { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, - detail::higher_query_prototype)); - } else { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, detail::query_prototype)); - } - break; - } - case Op::EV: { - if (is_higher_query) { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, - detail::higher_query_prototype)); - } else { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, detail::query_prototype)); - } - break; - } - case Op::EE: { - break; - } - case Op::EF: { - if (is_higher_query) { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, - detail::higher_query_prototype)); - } else { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, detail::query_prototype)); - } - break; - } - case 
Op::FV: { - if (is_higher_query) { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, - detail::higher_query_prototype)); - } else { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, detail::query_prototype)); - } - break; - } - case Op::FE: { - if (is_higher_query) { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, - detail::higher_query_prototype)); - } else { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, detail::query_prototype)); - } - break; - } - case Op::FF: { - if (is_higher_query) { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, - detail::higher_query_prototype)); - } else { - CUDA_ERROR(cudaFuncGetAttributes( - &func_attr, detail::query_prototype)); - } - break; - } - } + cudaFuncAttributes func_attr = cudaFuncAttributes(); + CUDA_ERROR(cudaFuncGetAttributes(&func_attr, kernel)); - uint32_t smem_bytes_static = func_attr.sharedSizeBytes; - uint32_t num_regs = func_attr.numRegs; - int device_id; + smem_bytes_static = func_attr.sharedSizeBytes; + num_reg_per_thread = static_cast(func_attr.numRegs); + int device_id; CUDA_ERROR(cudaGetDevice(&device_id)); cudaDeviceProp devProp; CUDA_ERROR(cudaGetDeviceProperties(&devProp, device_id)); if (!this->m_quite) { RXMESH_TRACE( - "RXMeshStatic::check_shared_memory() query_prototype with " - "{} " - "required shared memory = {} (dynamic) + {} (static) = {} " - "(bytes) and {} registers", - op_to_string(op), smem_bytes_dyn, smem_bytes_static, - smem_bytes_dyn + smem_bytes_static, num_regs); + "RXMeshStatic::check_shared_memory() user function with {} " + "requires shared memory = {} (dynamic) + {} (static) = {} " + "(bytes) and {} registers per thread", + op_to_string(op), + smem_bytes_dyn, + smem_bytes_static, + smem_bytes_dyn + smem_bytes_static, + num_reg_per_thread); RXMESH_TRACE( "RXMeshStatic::check_shared_memory() available total shared " @@ -327,12 +786,15 @@ class RXMeshStatic : public RXMesh if (smem_bytes_static + smem_bytes_dyn > devProp.sharedMemPerBlock) { RXMESH_ERROR( " 
RXMeshStatic::check_shared_memory() shared memory needed for" - " query_prototype ({} bytes) exceeds the max shared memory " + " input function ({} bytes) exceeds the max shared memory " "per block on the current device ({} bytes)", - smem_bytes_static + smem_bytes_dyn, devProp.sharedMemPerBlock); + smem_bytes_static + smem_bytes_dyn, + devProp.sharedMemPerBlock); exit(EXIT_FAILURE); } - return static_cast(smem_bytes_static); } + + + std::shared_ptr m_attr_container; }; -} // namespace RXMESH \ No newline at end of file +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/rxmesh_util.h b/include/rxmesh/rxmesh_util.h deleted file mode 100644 index 6a211f79..00000000 --- a/include/rxmesh/rxmesh_util.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include "rxmesh/rxmesh.h" - -namespace RXMESH { - -/** - * io_elements() - */ -void __device__ __host__ __inline__ io_elements(const Op& op, - ELEMENT& source_ele, - ELEMENT& output_ele) -{ - if (op == Op::VV || op == Op::VE || op == Op::VF) { - source_ele = ELEMENT::VERTEX; - } else if (op == Op::EV || op == Op::EE || op == Op::EF) { - source_ele = ELEMENT::EDGE; - } else if (op == Op::FV || op == Op::FE || op == Op::FF) { - source_ele = ELEMENT::FACE; - } - if (op == Op::VV || op == Op::EV || op == Op::FV) { - output_ele = ELEMENT::VERTEX; - } else if (op == Op::VE || op == Op::EE || op == Op::FE) { - output_ele = ELEMENT::EDGE; - } else if (op == Op::VF || op == Op::EF || op == Op::FF) { - output_ele = ELEMENT::FACE; - } -} -} // namespace RXMESH \ No newline at end of file diff --git a/include/rxmesh/types.h b/include/rxmesh/types.h new file mode 100644 index 00000000..a4a84987 --- /dev/null +++ b/include/rxmesh/types.h @@ -0,0 +1,125 @@ +#pragma once +#include +#include +#include "rxmesh/util/macros.h" + +namespace rxmesh { + +/** + * @brief Flags for where data resides. 
Used with Attributes + */ +using locationT = uint32_t; +enum : locationT +{ + LOCATION_NONE = 0x00, + HOST = 0x01, + DEVICE = 0x02, + LOCATION_ALL = 0x0F, +}; + +/** + * @brief convert locationT to string + */ +static std::string location_to_string(const locationT location) +{ + switch (location) { + case LOCATION_NONE: + return "NONE"; + case HOST: + return "HOST"; + case DEVICE: + return "DEVICE"; + case LOCATION_ALL: + return "ALL"; + default: { + RXMESH_ERROR("to_string() unknown location"); + return ""; + } + } +} + +/** + * @brief Memory layout + */ +using layoutT = uint32_t; +enum : layoutT +{ + AoS = 0x00, + SoA = 0x01, +}; +/** + * @brief convert locationT to string + */ +static std::string layout_to_string(const layoutT layout) +{ + switch (layout) { + case AoS: + return "AoS"; + case SoA: + return "SoA"; + default: { + RXMESH_ERROR("to_string() unknown layout"); + return ""; + } + } +} + +/** + * @brief ELEMENT represents the three types of mesh elements + */ +enum class ELEMENT +{ + VERTEX = 0, + EDGE = 1, + FACE = 2 +}; + +/** + * @brief Various query operations supported in RXMesh + */ +enum class Op +{ + VV = 0, + VE = 1, + VF = 2, + FV = 3, + FE = 4, + FF = 5, + EV = 6, + EE = 7, + EF = 8, +}; + +/** + * @brief Convert an operation to string + * @param op a query operation + * @return name of the query operation as a string + */ +static std::string op_to_string(const Op& op) +{ + switch (op) { + case Op::VV: + return "VV"; + case Op::VE: + return "VE"; + case Op::VF: + return "VF"; + case Op::FV: + return "FV"; + case Op::FE: + return "FE"; + case Op::FF: + return "FF"; + case Op::EV: + return "EV"; + case Op::EF: + return "EF"; + case Op::EE: + return "EE"; + default: { + RXMESH_ERROR("to_string() unknown input operation"); + return ""; + } + } +} +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/util/cuda_query.h b/include/rxmesh/util/cuda_query.h index 6290f876..d6df770c 100644 --- a/include/rxmesh/util/cuda_query.h 
+++ b/include/rxmesh/util/cuda_query.h @@ -4,7 +4,7 @@ #include "rxmesh/util/log.h" #include "rxmesh/util/macros.h" -namespace RXMESH { +namespace rxmesh { inline int convert_SMV_to_cores(int major, int minor) { // Taken from Nvidia helper_cuda.h to get the number of SM and cuda cores @@ -29,7 +29,11 @@ inline int convert_SMV_to_cores(int major, int minor) {0x61, 128}, // Pascal Generation (SM 6.1) GP10x class {0x62, 128}, // Pascal Generation (SM 6.2) GP10x class {0x70, 64}, // Volta Generation (SM 7.0) GV100 class - {0x72, 64}, {0x75, 64}, {0x80, 64}, {0x86, 128}, {-1, -1}}; + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {-1, -1}}; int index = 0; @@ -44,7 +48,9 @@ inline int convert_SMV_to_cores(int major, int minor) // properly printf( "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", - major, minor, nGpuArchCoresPerSM[index - 1].Cores); + major, + minor, + nGpuArchCoresPerSM[index - 1].Cores); return nGpuArchCoresPerSM[index - 1].Cores; } @@ -61,37 +67,48 @@ cudaDeviceProp cuda_query(const int dev, bool quite = false) " a CUDA-supported GPU!!!"); } - cudaSetDevice(dev); - cudaDeviceProp devProp; + CUDA_ERROR(cudaSetDevice(dev)); + cudaDeviceProp dev_prop; - CUDA_ERROR(cudaGetDeviceProperties(&devProp, dev)); + CUDA_ERROR(cudaGetDeviceProperties(&dev_prop, dev)); if (!quite) { RXMESH_TRACE("Total number of device: {}", deviceCount); RXMESH_TRACE("Using device Number: {}", dev); - RXMESH_TRACE("Device name: {}", devProp.name); - RXMESH_TRACE("Compute Capability: {}.{}", (int)devProp.major, - (int)devProp.minor); + + RXMESH_TRACE("Device name: {}", dev_prop.name); + RXMESH_TRACE("Compute Capability: {}.{}", + (int)dev_prop.major, + (int)dev_prop.minor); RXMESH_TRACE("Total amount of global memory (MB): {0:.1f}", - (float)devProp.totalGlobalMem / 1048576.0f); + (float)dev_prop.totalGlobalMem / 1048576.0f); RXMESH_TRACE("{} Multiprocessors, {} CUDA Cores/MP: {} CUDA Cores", - devProp.multiProcessorCount, - 
convert_SMV_to_cores(devProp.major, devProp.minor), - convert_SMV_to_cores(devProp.major, devProp.minor) * - devProp.multiProcessorCount); + dev_prop.multiProcessorCount, + convert_SMV_to_cores(dev_prop.major, dev_prop.minor), + convert_SMV_to_cores(dev_prop.major, dev_prop.minor) * + dev_prop.multiProcessorCount); + RXMESH_TRACE("ECC support: {}", + (dev_prop.ECCEnabled ? "Enabled" : "Disabled")); RXMESH_TRACE("GPU Max Clock rate: {0:.1f} MHz ({1:.2f} GHz)", - devProp.clockRate * 1e-3f, devProp.clockRate * 1e-6f); + dev_prop.clockRate * 1e-3f, + dev_prop.clockRate * 1e-6f); RXMESH_TRACE("Memory Clock rate: {0:.1f} Mhz", - devProp.memoryClockRate * 1e-3f); - RXMESH_TRACE("Memory Bus Width: {}-bit", devProp.memoryBusWidth); - const double maxBW = 2.0 * devProp.memoryClockRate * - (devProp.memoryBusWidth / 8.0) / 1.0E6; + dev_prop.memoryClockRate * 1e-3f); + RXMESH_TRACE("Memory Bus Width: {}-bit", dev_prop.memoryBusWidth); + const double maxBW = 2.0 * dev_prop.memoryClockRate * + (dev_prop.memoryBusWidth / 8.0) / 1.0E6; RXMESH_TRACE("Peak Memory Bandwidth: {0:f}(GB/s)", maxBW); RXMESH_TRACE("Kernels compiled for compute capability: {}", cuda_arch()); } - return devProp; + if (!dev_prop.managedMemory) { + RXMESH_ERROR( + "The selected device does not support CUDA unified memory"); + exit(EXIT_FAILURE); + } + + return dev_prop; } -} // namespace RXMESH \ No newline at end of file +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/util/export_tools.h b/include/rxmesh/util/export_tools.h index 43d66334..7f829c27 100644 --- a/include/rxmesh/util/export_tools.h +++ b/include/rxmesh/util/export_tools.h @@ -88,7 +88,7 @@ void export_as_cubes_VTK(std::string filename, CubeY funY, CubeZ funZ, const uint32_t num_att, - bool randomize = 1, + bool randomize = 1, float* randomness = (float*)nullptr) { @@ -281,7 +281,7 @@ void export_attribute_VTK( if (rand_map_it != rand_map.end()) { return rand_map[att[id]]; } else { - double val = double(rand()) / 
double(RAND_MAX); + double val = double(rand()) / double(RAND_MAX); rand_map[att[id]] = val; return val; } diff --git a/include/rxmesh/util/import_obj.h b/include/rxmesh/util/import_obj.h index 47cff4a0..b4601a29 100644 --- a/include/rxmesh/util/import_obj.h +++ b/include/rxmesh/util/import_obj.h @@ -2,58 +2,58 @@ #include #include - #include "rxmesh/util/log.h" -#ifndef MAX_LINE_LENGTH -#define MAX_LINE_LENGTH 2048 -#endif - - -// Read and input mesh from obj file format -// Input: path to the obj file -// Output: Verts = 3d vertices (Num vertices X 3) -// Faces = faces index to the Vert array (Num facex X 3) -// Tex = Tex coordinates (Num texture coordinates X 2) -// Faces = faces index to the Tex array (Num facex X 3) -// Normals = faces index to the Tex array (Num normals X 3) -// Faces = faces index to the Normals array (Num facex X 3) - -template -bool import_obj(const std::string fileName, - std::vector>& Verts, - std::vector>& Faces, - std::vector>& Tex, - std::vector>& FacesTex, - std::vector>& Normal, - std::vector>& FacesNormal, - bool quite = false) +/** + * @brief Read an input mesh from obj file format + * @tparam DataT coordinates type (float/double) + * @tparam IndexT indices type + * @param file_name path to the obj file + * @param vertices 3d vertices (3*#vertices) + * @param faces face index to the Vert array (3*#faces) + * @param tex texture coordinates (2*#texture coordinates) + * @param face_tex faces index to the tex array (3*#faces) + * @param normals face normal coordinates (3*#normal) + * @param face_normal faces index to the Normals array (3*faces) + * @param quite run in quite mode + * @return true if reading the file is successful + */ +template +bool import_obj(const std::string file_name, + std::vector>& vertices, + std::vector>& faces, + std::vector>& tex, + std::vector>& face_tex, + std::vector>& normals, + std::vector>& face_normal, + bool quite = false) { - FILE* Objfile = fopen(fileName.c_str(), "r"); + FILE* Objfile = 
fopen(file_name.c_str(), "r"); if (NULL == Objfile) { - RXMESH_ERROR("importOBJ() can not open {}", fileName); + RXMESH_ERROR("importOBJ() can not open {}", file_name); return false; } else { if (!quite) { - RXMESH_TRACE("Reading {}", fileName); + RXMESH_TRACE("Reading {}", file_name); } } // make sure everything is clean - Verts.clear(); - Faces.clear(); - Tex.clear(); - FacesTex.clear(); - Normal.clear(); - FacesNormal.clear(); + vertices.clear(); + faces.clear(); + tex.clear(); + face_tex.clear(); + normals.clear(); + face_normal.clear(); - char line[MAX_LINE_LENGTH]; - uint32_t lineNum = 1; - while (fgets(line, MAX_LINE_LENGTH, Objfile) != NULL) { + constexpr uint32_t max_line_length = 2048; + char line[max_line_length]; + uint32_t lineNum = 1; + while (fgets(line, max_line_length, Objfile) != NULL) { - char type[MAX_LINE_LENGTH]; + char type[max_line_length]; if (sscanf(line, "%s", type) == 1) { // read only the first letter of the line @@ -61,9 +61,9 @@ bool import_obj(const std::string fileName, char* l = &line[strlen(type)]; // next thing after the type if (strcmp(type, "v") == 0) { // vertex - std::istringstream ls(&line[1]); - std::vector vert{std::istream_iterator(ls), - std::istream_iterator()}; + std::istringstream ls(&line[1]); + std::vector vert{std::istream_iterator(ls), + std::istream_iterator()}; if (vert.size() < 3) { // vertex has less than coordinates RXMESH_ERROR( @@ -73,28 +73,28 @@ bool import_obj(const std::string fileName, fclose(Objfile); return false; } - Verts.push_back(vert); + vertices.push_back(vert); } else if (strcmp(type, "vn") == 0) { // normal - DATA_T x[3]; + DataT x[3]; uint32_t count = sscanf(l, "%f %f %f\n", &x[0], &x[1], &x[2]); if (count != 3) { RXMESH_ERROR( - "importOBJ() normal has less than 3 " - "coordinates Line[{}]\n", + "importOBJ() normals does not have 3 coordinates " + "Line[{}]\n", lineNum); fclose(Objfile); return false; } - std::vector normal_v(3); + std::vector normal_v(3); normal_v[0] = x[0]; normal_v[1] = 
x[1]; normal_v[2] = x[2]; - Normal.push_back(normal_v); + normals.push_back(normal_v); } else if (strcmp(type, "vt") == 0) { // texture - DATA_T x[3]; + DataT x[3]; uint32_t count = sscanf(l, "%f %f %f\n", &x[0], &x[1], &x[2]); if (count != 2 && count != 3) { @@ -105,40 +105,40 @@ bool import_obj(const std::string fileName, fclose(Objfile); return false; } - std::vector tex(count); + std::vector tx(count); for (uint32_t i = 0; i < count; i++) { - tex[i] = x[i]; + tx[i] = x[i]; } - Tex.push_back(tex); + tex.push_back(tx); } else if (strcmp(type, "f") == 0) { // face (read vert id, norm id, tex id) - // const auto & shift = [&Verts](const int i)->int{return i<0 ? - // i+Verts.size():i-1;}; const auto & shift_t = [&Tex](const int - // i)->int{return i<0 ? i+Tex.size():i-1;}; const auto & shift_n - // = [&Normal](const int i)->int{return i<0 ? - // i+Normal.size():i-1;}; - - std::vector f; - std::vector ft; - std::vector fn; - char word[MAX_LINE_LENGTH]; - uint32_t offset; + // const auto & shift = [&vertices](const int i)->int{return i<0 + // ? i+vertices.size():i-1;}; const auto & shift_t = + // [&Tex](const int i)->int{return i<0 ? i+Tex.size():i-1;}; + // const auto & shift_n = [&normals ](const int i)->int{return + // i<0 ? i+normals .size():i-1;}; + + std::vector f; + std::vector ft; + std::vector fn; + char word[max_line_length]; + uint32_t offset; while (sscanf(l, "%s%n", word, &offset) == 1) { l += offset; long int i, it, in; if (sscanf(word, "%ld/%ld/%ld", &i, &it, &in) == 3) { // face, norm, tex - f.push_back(i < 0 ? i + Verts.size() : i - 1); - ft.push_back(i < 0 ? i + Tex.size() : i - 1); - fn.push_back(i < 0 ? i + Normal.size() : i - 1); + f.push_back(i < 0 ? i + vertices.size() : i - 1); + ft.push_back(i < 0 ? i + tex.size() : i - 1); + fn.push_back(i < 0 ? i + normals.size() : i - 1); } else if (sscanf(word, "%ld/%ld", &i, &it) == 2) { // face, tex - f.push_back(i < 0 ? i + Verts.size() : i - 1); - ft.push_back(i < 0 ? 
i + Tex.size() : i - 1); + f.push_back(i < 0 ? i + vertices.size() : i - 1); + ft.push_back(i < 0 ? i + tex.size() : i - 1); } else if (sscanf(word, "%ld", &i) == 1) { // face - f.push_back(i < 0 ? i + Verts.size() : i - 1); + f.push_back(i < 0 ? i + vertices.size() : i - 1); } else { RXMESH_ERROR( "importOBJ() face has wrong format Line[{}]", @@ -153,9 +153,9 @@ bool import_obj(const std::string fileName, (f.size() > 0 && fn.size() == f.size() && ft.size() == f.size())) { - Faces.push_back(f); - FacesTex.push_back(ft); - FacesNormal.push_back(fn); + faces.push_back(f); + face_tex.push_back(ft); + face_normal.push_back(fn); } else { RXMESH_ERROR("importOBJ() face has wrong format Line[{}]", lineNum); @@ -170,8 +170,8 @@ bool import_obj(const std::string fileName, } else { // others - RXMESH_ERROR("importOBJ() invalid Line[{}] File[{}]\n", lineNum, - line); + RXMESH_ERROR( + "importOBJ() invalid Line[{}] File[{}]\n", lineNum, line); fclose(Objfile); return false; } @@ -184,29 +184,38 @@ bool import_obj(const std::string fileName, fclose(Objfile); if (!quite) { - RXMESH_TRACE("import_obj() #Verts= {} ", Verts.size()); - RXMESH_TRACE("import_obj() #Faces= {} ", Faces.size()); - RXMESH_TRACE("import_obj() #Tex= {} ", Tex.size()); - RXMESH_TRACE("import_obj() #FacesTex= {} ", FacesTex.size()); - RXMESH_TRACE("import_obj() #Normal= {} ", Normal.size()); - RXMESH_TRACE("import_obj() #FacesNormal= {} ", FacesNormal.size()); + RXMESH_TRACE("import_obj() #vertices= {} ", vertices.size()); + RXMESH_TRACE("import_obj() #faces= {} ", faces.size()); + RXMESH_TRACE("import_obj() #tex= {} ", tex.size()); + RXMESH_TRACE("import_obj() #face_tex= {} ", face_tex.size()); + RXMESH_TRACE("import_obj() #normals = {} ", normals.size()); + RXMESH_TRACE("import_obj() #face_normal= {} ", face_normal.size()); } return true; } - -template -bool import_obj(const std::string fileName, - std::vector>& Verts, - std::vector>& Faces, - bool quite = false) +/** + * @brief Read an input mesh from obj 
file format + * @tparam DataT coordinates type (float/double) + * @tparam IndexT indices type + * @param file_name path to the obj file + * @param vertices 3d vertices (3*#vertices) + * @param faces face index to the Vert array (3*#faces) + * @param quite run in quite mode + * @return true if reading the file is successful + */ +template +bool import_obj(const std::string file_name, + std::vector>& vertices, + std::vector>& faces, + bool quite = false) { - std::vector> Tex; - std::vector> FacesTex; - std::vector> Normal; - std::vector> FacesNormal; + std::vector> tex; + std::vector> face_tex; + std::vector> normals; + std::vector> face_normal; - return import_obj(fileName, Verts, Faces, Tex, FacesTex, Normal, - FacesNormal, quite); + return import_obj( + file_name, vertices, faces, tex, face_tex, normals, face_normal, quite); } \ No newline at end of file diff --git a/include/rxmesh/util/log.h b/include/rxmesh/util/log.h index 9900558f..fd2b0b17 100644 --- a/include/rxmesh/util/log.h +++ b/include/rxmesh/util/log.h @@ -7,12 +7,12 @@ #include "spdlog/spdlog.h" -namespace RXMESH { +namespace rxmesh { class Log { public: - static void init() + static void init(spdlog::level::level_enum level = spdlog::level::trace) { std::vector sinks; sinks.emplace_back( @@ -23,11 +23,11 @@ class Log sinks[0]->set_pattern("%^[%T] %n: %v%$"); sinks[1]->set_pattern("[%T] [%l] %n: %v"); - m_logger = std::make_shared("RXMesh", begin(sinks), - end(sinks)); + m_logger = std::make_shared( + "RXMesh", begin(sinks), end(sinks)); spdlog::register_logger(m_logger); - m_logger->set_level(spdlog::level::trace); - m_logger->flush_on(spdlog::level::trace); + m_logger->set_level(level); + m_logger->flush_on(level); } inline static std::shared_ptr& get_logger() @@ -39,17 +39,17 @@ class Log private: inline static std::shared_ptr m_logger; }; -} // namespace RXMESH +} // namespace rxmesh -#define RXMESH_TRACE(...) ::RXMESH::Log::get_logger()->trace(__VA_ARGS__) -#define RXMESH_INFO(...) 
::RXMESH::Log::get_logger()->info(__VA_ARGS__) +#define RXMESH_TRACE(...) ::rxmesh::Log::get_logger()->trace(__VA_ARGS__) +#define RXMESH_INFO(...) ::rxmesh::Log::get_logger()->info(__VA_ARGS__) #define RXMESH_WARN(...) \ - ::RXMESH::Log::get_logger()->warn("Line {} File {}", __LINE__, __FILE__); \ - ::RXMESH::Log::get_logger()->warn(__VA_ARGS__) + ::rxmesh::Log::get_logger()->warn("Line {} File {}", __LINE__, __FILE__); \ + ::rxmesh::Log::get_logger()->warn(__VA_ARGS__) #define RXMESH_ERROR(...) \ - ::RXMESH::Log::get_logger()->error("Line {} File {}", __LINE__, __FILE__); \ - ::RXMESH::Log::get_logger()->error(__VA_ARGS__) -#define RXMESH_CRITICAL(...) \ - ::RXMESH::Log::get_logger()->critical("Line {} File {}", __LINE__, \ - __FILE__); \ - ::RXMESH::Log::get_logger()->critical(__VA_ARGS__) + ::rxmesh::Log::get_logger()->error("Line {} File {}", __LINE__, __FILE__); \ + ::rxmesh::Log::get_logger()->error(__VA_ARGS__) +#define RXMESH_CRITICAL(...) \ + ::rxmesh::Log::get_logger()->critical( \ + "Line {} File {}", __LINE__, __FILE__); \ + ::rxmesh::Log::get_logger()->critical(__VA_ARGS__) diff --git a/include/rxmesh/util/macros.h b/include/rxmesh/util/macros.h index 3534b06e..c5a225fc 100644 --- a/include/rxmesh/util/macros.h +++ b/include/rxmesh/util/macros.h @@ -4,10 +4,9 @@ #include #include "rxmesh/util/log.h" -namespace RXMESH { +namespace rxmesh { -typedef uint8_t flag_t; -constexpr uint32_t PATCH_SIZE = 512; +typedef uint8_t flag_t; // TRANSPOSE_ITEM_PER_THREAD constexpr uint32_t TRANSPOSE_ITEM_PER_THREAD = 11; @@ -15,8 +14,8 @@ constexpr uint32_t TRANSPOSE_ITEM_PER_THREAD = 11; // used for integer rounding #define DIVIDE_UP(num, divisor) (num + divisor - 1) / (divisor) -// assuming a 32-bit index -#define FULL_MASK 0xffffffff +// unsigned 64-bit +#define INVALID64 0xFFFFFFFFFFFFFFFFu // unsigned 32-bit #define INVALID32 0xFFFFFFFFu @@ -27,12 +26,6 @@ constexpr uint32_t TRANSPOSE_ITEM_PER_THREAD = 11; // unsigned 8-bit #define INVALID8 0xFFu -// assuming a 
32-bit index -#define SPECIAL 0xFFFFFFFE - -// 32 -#define WARPSIZE 32u - // http://www.decompile.com/cpp/faq/file_and_line_error_string.htm #define STRINGIFY(x) TOSTRING(x) @@ -61,5 +54,20 @@ inline void HandleError(cudaError_t err, const char* file, int line) ptr = nullptr; \ } +// Taken from https://stackoverflow.com/a/12779757/1608232 +#if defined(__CUDACC__) // NVCC +#define ALIGN(n) __align__(n) +#elif defined(__GNUC__) // GCC +#define ALIGN(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) // MSVC +#define ALIGN(n) __declspec(align(n)) +#else +#error "Please provide a definition for MY_ALIGN macro for your host compiler!" +#endif + + +//Taken from https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-traits +#define IS_D_LAMBDA(X) __nv_is_extended_device_lambda_closure_type(X) +#define IS_HD_LAMBDA(X) __nv_is_extended_host_device_lambda_closure_type(X) -} // namespace RXMESH \ No newline at end of file +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/util/math.h b/include/rxmesh/util/math.h deleted file mode 100644 index 16eb2d7e..00000000 --- a/include/rxmesh/util/math.h +++ /dev/null @@ -1,201 +0,0 @@ -#pragma once -#include -#include -#include -#include - -namespace RXMESH { -// 180.0/PI (multiply this by the radian angle to convert to degree) -constexpr float RadToDeg = 57.295779513078550; - -constexpr float PIf = 3.1415927f; - - -/** - * l2_norm() - * TODO remove - */ -template -__host__ __device__ __forceinline__ T l2_norm(const T ax0, - const T ax1, - const T ax2, - const T bx0, - const T bx1, - const T bx2) -{ - // compute sqrt((xa0-xb0)*(xa0-xb0) + (xa1-xb1)*(xa1-xb1) + - //(xa2-xb2)*(xa2-xb2)) - return sqrt(l2_norm_sq(ax0, ax1, ax2, bx0, bx1, bx2)); -} - - -/** - * l2_norm_sq() - * TODO remove - */ -template -__host__ __device__ __forceinline__ T l2_norm_sq(const T ax0, - const T ax1, - const T ax2, - const T bx0, - const T bx1, - const T bx2) -{ - // compute (xa0-xb0)*(xa0-xb0) + 
(xa1-xb1)*(xa1-xb1) + (xa2-xb2)*(xa2-xb2) - T x0 = ax0 - bx0; - T x1 = ax1 - bx1; - T x2 = ax2 - bx2; - return x0 * x0 + x1 * x1 + x2 * x2; -} - -/** - * vector_length() - * TODO remove - */ -__device__ __host__ __forceinline__ float vector_length(const float x, - const float y, - const float z) -{ - return sqrtf(x * x + y * y + z * z); -} - - -/** - * vector_length() - * TODO remove - */ -__device__ __host__ __forceinline__ double vector_length(const double x, - const double y, - const double z) -{ - return sqrt(x * x + y * y + z * z); -} - -/** - * cross_product() - * TODO remove - */ -template -__host__ __device__ __forceinline__ void -cross_product(T xv1, T yv1, T zv1, T xv2, T yv2, T zv2, T& xx, T& yy, T& zz) -{ - xx = yv1 * zv2 - zv1 * yv2; - yy = zv1 * xv2 - xv1 * zv2; - zz = xv1 * yv2 - yv1 * xv2; -} - -/** - * vector_normal() - * TODO remove - */ -template -__device__ __host__ __forceinline__ T vector_normal(const T& vector_x, - const T& vector_y, - const T& vector_z) -{ - return vector_length(vector_x, vector_y, vector_z); -} - -/** - * normalize_vector() - * TODO remove - */ -template -__device__ __host__ __forceinline__ void normalize_vector(T& vector_x, - T& vector_y, - T& vector_z) -{ - T nn = vector_normal(vector_x, vector_y, vector_z); - if (nn == 0) { - vector_x = vector_y = vector_z = 0; - } else { - nn = 1 / nn; - vector_x *= nn; - vector_y *= nn; - vector_z *= nn; - } -} - -/** - * round_up_multiple() - */ -template -__host__ __device__ __forceinline__ T round_up_multiple(const T numToRound, - const T multiple) -{ - - // https://stackoverflow.com/a/3407254/1608232 - // rounding numToRound to the closest number multiple of multiple - // this code meant only for +ve int. 
for -ve, check the reference above - if (multiple == 0) { - return numToRound; - } - - const T remainder = numToRound % multiple; - if (remainder == 0) { - return numToRound; - } - return numToRound + multiple - remainder; -} - -/** - * round_to_next_power_two() - */ -__host__ __device__ __forceinline__ uint32_t -round_to_next_power_two(const uint32_t numToRound) -{ - - // https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - uint32_t res = numToRound; - if (res == 0) { - return 1; - } - res--; - res |= res >> 1; - res |= res >> 2; - res |= res >> 4; - res |= res >> 8; - res |= res >> 16; - res++; - return res; -} - -/** - * dot() - * TODO remove - */ -template -T dot(const std::vector& u, const std::vector& v) -{ - return std::inner_product(std::begin(u), std::end(u), std::begin(v), 0.0); -} - -/** - * scale() - * TODO remove - */ -template -void scale(std::vector& v, const T factor) -{ - std::transform( - v.begin(), v.end(), v.begin(), - std::bind(std::multiplies(), std::placeholders::_1, factor)); -} - -/** - * axpy() - */ -template -void axpy(const std::vector& x, - const T alpha, - const T beta, - std::vector& y) -{ - // y = alpha*x + beta*y - for (uint32_t i = 0; i < x.size(); ++i) { - y[i] *= beta; - y[i] += alpha * x[i]; - } -} - -} // namespace RXMESH \ No newline at end of file diff --git a/include/rxmesh/util/meta.h b/include/rxmesh/util/meta.h new file mode 100644 index 00000000..c42bf54a --- /dev/null +++ b/include/rxmesh/util/meta.h @@ -0,0 +1,51 @@ +#pragma once +#include +namespace rxmesh { +namespace detail { + +/** + * @brief extracting the input parameter type and return type of a lambda + * function. Taken from https://stackoverflow.com/a/7943765/1608232. 
+ * For generic types, directly use the result of the signature of its operator() + */ +template +struct FunctionTraits : public FunctionTraits +{ +}; + +/** + * @brief specialization for pointers to member function + */ +template +struct FunctionTraits +{ + /** + * @brief arity is the number of arguments. + */ + enum + { + arity = sizeof...(Args) + }; + + typedef ReturnType result_type; + + /** + * @brief the i-th argument is equivalent to the i-th tuple element of a + * tuple composed of those arguments. + */ + template + struct arg + { + using type_rc = + typename std::tuple_element>::type; + using type_c = std::conditional_t, + std::remove_reference_t, + type_rc>; + using type = std::conditional_t, + std::remove_const_t, + type_c>; + }; +}; + +} // namespace detail +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/util/report.h b/include/rxmesh/util/report.h index 4363994f..a72abc8f 100644 --- a/include/rxmesh/util/report.h +++ b/include/rxmesh/util/report.h @@ -14,8 +14,8 @@ #include "rxmesh/util/util.h" #ifdef __NVCC__ #include "cuda.h" -#include "rxmesh/util/cuda_query.h" #include "rxmesh/kernels/get_arch.cuh" +#include "rxmesh/util/cuda_query.h" #endif #include "rxmesh/util/git_sha1.h" @@ -26,7 +26,7 @@ #include #endif -namespace RXMESH { +namespace rxmesh { // Most values are signed and initialized to -1 // if any value is not modified, it won't be written @@ -36,12 +36,13 @@ namespace RXMESH { struct TestData { std::vector time_ms; - int32_t num_blocks = -1; + int32_t num_blocks = -1; int32_t num_threads = -1; std::vector passed; - std::string test_name = ""; - float dyn_smem = -1; - float static_smem = -1; + std::string test_name = ""; + int32_t dyn_smem = -1; + int32_t static_smem = -1; + int32_t num_reg = -1; }; struct Report @@ -60,12 +61,12 @@ struct Report m_doc.GetAllocator()); std::string str = g_GIT_SHA1; m_doc.AddMember("git_sha", - rapidjson::Value().SetString(str.c_str(), str.length(), - m_doc.GetAllocator()), + 
rapidjson::Value().SetString( + str.c_str(), str.length(), m_doc.GetAllocator()), m_doc.GetAllocator()); // Time - auto t = std::time(nullptr); + auto t = std::time(nullptr); auto tm = *std::localtime(&t); { std::ostringstream oss; @@ -80,8 +81,8 @@ struct Report m_doc.AddMember( "date", - rapidjson::Value().SetString(str.c_str(), str.length(), - m_doc.GetAllocator()), + rapidjson::Value().SetString( + str.c_str(), str.length(), m_doc.GetAllocator()), m_doc.GetAllocator()); } } @@ -94,8 +95,8 @@ struct Report cmd = cmd + " " + std::string(argv[i]); } m_doc.AddMember("command_line", - rapidjson::Value().SetString(cmd.c_str(), cmd.length(), - m_doc.GetAllocator()), + rapidjson::Value().SetString( + cmd.c_str(), cmd.length(), m_doc.GetAllocator()), m_doc.GetAllocator()); } @@ -145,22 +146,25 @@ struct Report // Memory add_member("Total amount of global memory (MB)", - (float)devProp.totalGlobalMem / 1048576.0f, subdoc); + (float)devProp.totalGlobalMem / 1048576.0f, + subdoc); add_member("Total amount of shared memory per block (Kb)", - (float)devProp.sharedMemPerBlock / 1024.0f, subdoc); + (float)devProp.sharedMemPerBlock / 1024.0f, + subdoc); // SM add_member("Multiprocessors", devProp.multiProcessorCount, subdoc); #ifdef __NVCC__ add_member("CUDA Cores/MP", - convert_SMV_to_cores(devProp.major, devProp.minor), subdoc); + convert_SMV_to_cores(devProp.major, devProp.minor), + subdoc); #endif // Clocks - add_member("GPU Max Clock rate (GHz)", devProp.clockRate * 1e-6f, - subdoc); - add_member("Memory Clock rate (GHz)", devProp.memoryClockRate * 1e-6f, - subdoc); + add_member( + "GPU Max Clock rate (GHz)", devProp.clockRate * 1e-6f, subdoc); + add_member( + "Memory Clock rate (GHz)", devProp.memoryClockRate * 1e-6f, subdoc); add_member("Memory Bus Width (bit)", devProp.memoryBusWidth, subdoc); add_member("Peak Memory Bandwidth (GB/s)", 2.0 * devProp.memoryClockRate * @@ -194,8 +198,8 @@ struct Report #ifdef _MSC_VER - add_member("Microsoft Full Compiler Version", 
int32_t(_MSC_FULL_VER), - subdoc); + add_member( + "Microsoft Full Compiler Version", int32_t(_MSC_FULL_VER), subdoc); add_member("Microsoft Compiler Version", int32_t(_MSC_VER), subdoc); #else @@ -265,9 +269,7 @@ struct Report } // get model data from RXMesh - template - void model_data(const std::string& model_name, - const RXMESH::RXMesh

& rxmesh) + void model_data(const std::string& model_name, const rxmesh::RXMesh& rxmesh) { rapidjson::Document subdoc(&m_doc.GetAllocator()); subdoc.SetObject(); @@ -285,21 +287,19 @@ struct Report add_member("num_lloyd_run", rxmesh.get_num_lloyd_run(), subdoc); add_member("patching_time", rxmesh.get_patching_time(), subdoc); uint32_t min_patch_size(0), max_patch_size(0), avg_patch_size(0); - rxmesh.get_max_min_avg_patch_size(min_patch_size, max_patch_size, - avg_patch_size); + rxmesh.get_max_min_avg_patch_size( + min_patch_size, max_patch_size, avg_patch_size); add_member("min_patch_size", min_patch_size, subdoc); add_member("max_patch_size", max_patch_size, subdoc); add_member("avg_patch_size", avg_patch_size, subdoc); add_member("per_patch_max_vertices", - rxmesh.get_per_patch_max_vertices(), subdoc); - add_member("per_patch_max_edges", rxmesh.get_per_patch_max_edges(), - subdoc); - add_member("per_patch_max_faces", rxmesh.get_per_patch_max_faces(), - subdoc); - add_member("ribbon_overhead (%)", rxmesh.get_ribbon_overhead(), - subdoc); - add_member("total_gpu_storage (mb)", rxmesh.get_gpu_storage_mb(), + rxmesh.get_per_patch_max_vertices(), subdoc); + add_member( + "per_patch_max_edges", rxmesh.get_per_patch_max_edges(), subdoc); + add_member( + "per_patch_max_faces", rxmesh.get_per_patch_max_faces(), subdoc); + add_member("ribbon_overhead (%)", rxmesh.get_ribbon_overhead(), subdoc); m_doc.AddMember("Model", subdoc, m_doc.GetAllocator()); } @@ -322,8 +322,12 @@ struct Report } if (test_data.static_smem != -1) { - add_member("static_shared_memory (b)", test_data.static_smem, - subdoc); + add_member( + "static_shared_memory (b)", test_data.static_smem, subdoc); + } + + if (test_data.num_reg != -1) { + add_member("num_register_per_thread", test_data.num_reg, subdoc); } if (!test_data.passed.empty()) { @@ -359,8 +363,8 @@ struct Report void add_member(std::string member_key, const int32_t member_val, docT& doc) { rapidjson::Value key(member_key.c_str(), 
doc.GetAllocator()); - doc.AddMember(key, rapidjson::Value().SetInt(member_val), - doc.GetAllocator()); + doc.AddMember( + key, rapidjson::Value().SetInt(member_val), doc.GetAllocator()); } template void add_member(std::string member_key, @@ -368,24 +372,24 @@ struct Report docT& doc) { rapidjson::Value key(member_key.c_str(), doc.GetAllocator()); - doc.AddMember(key, rapidjson::Value().SetUint(member_val), - doc.GetAllocator()); + doc.AddMember( + key, rapidjson::Value().SetUint(member_val), doc.GetAllocator()); } template void add_member(std::string member_key, const double member_val, docT& doc) { rapidjson::Value key(member_key.c_str(), doc.GetAllocator()); - doc.AddMember(key, rapidjson::Value().SetDouble(member_val), - doc.GetAllocator()); + doc.AddMember( + key, rapidjson::Value().SetDouble(member_val), doc.GetAllocator()); } template void add_member(std::string member_key, const bool member_val, docT& doc) { rapidjson::Value key(member_key.c_str(), doc.GetAllocator()); - doc.AddMember(key, rapidjson::Value().SetBool(member_val), - doc.GetAllocator()); + doc.AddMember( + key, rapidjson::Value().SetBool(member_val), doc.GetAllocator()); } template @@ -442,4 +446,4 @@ class CustomReport : public Report this->m_doc.AddMember("Model", subdoc, m_doc.GetAllocator()); } }; -} // namespace RXMESH \ No newline at end of file +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/util/timer.h b/include/rxmesh/util/timer.h index 42126c34..c5c06adb 100644 --- a/include/rxmesh/util/timer.h +++ b/include/rxmesh/util/timer.h @@ -3,7 +3,7 @@ #include #include "rxmesh/util/macros.h" -namespace RXMESH { +namespace rxmesh { struct GPUTimer { @@ -65,4 +65,4 @@ struct CPUTimer std::chrono::high_resolution_clock::time_point m_start; std::chrono::high_resolution_clock::time_point m_stop; }; -} // namespace RXMESH \ No newline at end of file +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/util/util.h 
b/include/rxmesh/util/util.h index 014b62c6..bdbd73b7 100644 --- a/include/rxmesh/util/util.h +++ b/include/rxmesh/util/util.h @@ -5,10 +5,10 @@ #include #include "rxmesh/util/macros.h" -namespace RXMESH { +namespace rxmesh { /** - * get_cmd_option() + * @brief Parse for an option. Maninly used to parse user input from CMD */ inline char* get_cmd_option(char** begin, char** end, const std::string& option) { @@ -19,8 +19,10 @@ inline char* get_cmd_option(char** begin, char** end, const std::string& option) } return 0; } + /** - * cmd_option_exists() + * @brief Check if an input string exists. Mainly used to check if input option + * exists in CMD */ inline bool cmd_option_exists(char** begin, char** end, @@ -31,32 +33,38 @@ inline bool cmd_option_exists(char** begin, } /** - * print_device_memory_usage() + * @brief Print current GPU memory usage */ inline void print_device_memory_usage() { // print how much memory is available, used and free on the current device size_t free_t, total_t; CUDA_ERROR(cudaMemGetInfo(&free_t, &total_t)); - double free_m = (double)free_t / (double)1048576.0; + double free_m = (double)free_t / (double)1048576.0; double total_m = (double)total_t / (double)1048576.0; - double used_m = total_m - free_m; - RXMESH_TRACE(" device memory mem total = {} (B) [{} (MB)]", total_t, - total_m); + double used_m = total_m - free_m; + RXMESH_TRACE( + " device memory mem total = {} (B) [{} (MB)]", total_t, total_m); RXMESH_TRACE(" device memory free: {} (B) [{} (MB)]", free_t, free_m); RXMESH_TRACE(" device memory mem used: {} (MB)", used_m); } + /** - * find_index() + * @brief Find the index of an entry in a vector + * @tparam T type of the entry and vector elements + * @param entry to search for + * @param vect input vector to search in + * @return return the index of the entry or std::numeric_limits::max() + * if it is not found */ template -inline uint32_t find_index(const T entery, const std::vector& vect) +inline uint32_t find_index(const T entry, 
const std::vector& vect) { // get index of entry in vector typename std::vector::const_iterator it = - std::find(vect.begin(), vect.end(), entery); + std::find(vect.begin(), vect.end(), entry); if (it == vect.end()) { return std::numeric_limits::max(); } @@ -64,15 +72,21 @@ inline uint32_t find_index(const T entery, const std::vector& vect) } /** - * find_index() + * @brief Find the index of an entry an array given its size + * @tparam T type of the entry and array elements + * @param entry to search for + * @param arr input array to search in + * @param arr_size size of the input array (arr) + * @return return the index of the entry or std::numeric_limits::max() + * if it is not found */ template -inline T find_index(const T* arr, const T arr_size, const T val) +inline T find_index(const T* arr, const T arr_size, const T entry) { // get index of entry in array const T* begin = arr; - const T* end = arr + arr_size; - const T* it = std::find(begin, end, val); + const T* end = arr + arr_size; + const T* it = std::find(begin, end, entry); if (it == end) { return std::numeric_limits::max(); } @@ -80,7 +94,7 @@ inline T find_index(const T* arr, const T arr_size, const T val) } /** - * random_shuffle() + * @brief Shuffle the content of an input array randomly */ template inline void random_shuffle(T* d_in, @@ -93,7 +107,7 @@ inline void random_shuffle(T* d_in, } /** - * fill_with_sequential_numbers() + * @brief Fill in an array with sequential numbers */ template inline void fill_with_sequential_numbers(T* arr, @@ -103,23 +117,26 @@ inline void fill_with_sequential_numbers(T* arr, std::iota(arr, arr + size, start); } + /** - * compare() + * @brief Compare the content of two input arrays */ template bool compare(const dataT* gold, const dataT* arr, const T size, const bool verbose = false, - const dataT tol = 10E-5) + const dataT tol = 10E-5) { bool result = true; for (T i = 0; i < size; i++) { if (std::abs(double(gold[i]) - double(arr[i])) > tol) { if (verbose) { - 
RXMESH_WARN("compare() mismatch at {} gold = {} arr = {} ", i, - gold[i], arr[i]); + RXMESH_WARN("compare() mismatch at {} gold = {} arr = {} ", + i, + gold[i], + arr[i]); result = false; } else { // it is not verbose, don't bother running through all entires @@ -131,7 +148,7 @@ bool compare(const dataT* gold, } /** - * copy() + * @brief Copy the content of one vector to another */ template void copy(const std::vector& src, std::vector& tar, int tar_start = 0) @@ -139,8 +156,9 @@ void copy(const std::vector& src, std::vector& tar, int tar_start = 0) std::copy(src.begin(), src.end(), tar.data() + tar_start); } + /** - * compute_avg_stddev() + * @brief Compute the average and standard deviation of an input array */ template inline void compute_avg_stddev(const T* arr, @@ -149,7 +167,7 @@ inline void compute_avg_stddev(const T* arr, double& stddev) { if (size == 1) { - avg = arr[0]; + avg = arr[0]; stddev = 0; return; } @@ -170,9 +188,8 @@ inline void compute_avg_stddev(const T* arr, return; } /** - * compute_avg_stddev_max_min_rs() - * computes the average and stddev where the input is running sum (output of - * exclusive sum) the input size is actually size + 1 + * @brief computes the average and stddev where the input is running sum (output + * of exclusive sum) the input size is actually size + 1 */ template inline void compute_avg_stddev_max_min_rs(const T* arr_rs, @@ -183,15 +200,15 @@ inline void compute_avg_stddev_max_min_rs(const T* arr_rs, T& min) { uint32_t* arr = (uint32_t*)malloc(size * sizeof(uint32_t)); - max = std::numeric_limits::min(); - min = std::numeric_limits::max(); + max = std::numeric_limits::min(); + min = std::numeric_limits::max(); for (uint32_t i = 0; i < size; i++) { // arr[i] = arr_rs[i + 1] - arr_rs[i]; uint32_t start = (i == 0) ? 
0 : arr_rs[i - 1]; - uint32_t end = arr_rs[i]; - arr[i] = end - start; - max = std::max(max, arr[i]); - min = std::min(min, arr[i]); + uint32_t end = arr_rs[i]; + arr[i] = end - start; + max = std::max(max, arr[i]); + min = std::min(min, arr[i]); } compute_avg_stddev(arr, size, avg, stddev); @@ -200,7 +217,7 @@ inline void compute_avg_stddev_max_min_rs(const T* arr_rs, } /** - * binary_search() + * @brief binary search in a vector (has to be sorted --- not checked) */ template inline size_t binary_search(const std::vector& list, @@ -235,8 +252,7 @@ inline size_t binary_search(const std::vector& list, /** - * inplace_remove_duplicates_sorted() - * in-place remove duplicates from sorted vector + * @brief in-place remove duplicates from sorted vector * requires one pass over all elements in sort_vec * it also resize sort_vec to contain only the unique values */ @@ -249,12 +265,12 @@ inline void inplace_remove_duplicates_sorted(std::vector& sort_vec) // leave the first value uint32_t next_unique_id = 1; - T prev_value = sort_vec.front(); + T prev_value = sort_vec.front(); for (uint32_t i = 1; i < sort_vec.size(); ++i) { T curr_val = sort_vec[i]; if (curr_val != prev_value) { sort_vec[next_unique_id++] = curr_val; - prev_value = curr_val; + prev_value = curr_val; } } @@ -262,7 +278,8 @@ inline void inplace_remove_duplicates_sorted(std::vector& sort_vec) } /** - * shuffle_obj() + * @brief Given the vertex coordinates and face indices, shuffle the input mesh + * randomly --- both vertices and face indices */ template inline void shuffle_obj(std::vector>& Faces, @@ -306,7 +323,7 @@ inline void shuffle_obj(std::vector>& Faces, /** - * remove_extension() + * @brief Remove the extension of an input file path */ inline std::string remove_extension(const std::string& filename) { // https://stackoverflow.com/a/6417908/1608232 @@ -317,44 +334,42 @@ inline std::string remove_extension(const std::string& filename) } /** - * extract_file_name() + * @brief Extract file path given its 
full path */ inline std::string extract_file_name(const std::string& full_path) { // given full path, we extract the file name without extension - std::string filename = remove_extension(full_path); + std::string filename = remove_extension(full_path); size_t lastslash = filename.find_last_of("/\\"); return filename.substr(lastslash + 1); } +namespace detail { + /** - * in_place_matrix_transpose() + * @brief hash function that takes a pair of vertices and returns a unique + * values. Used for storing vertex-edge relation in std map */ -template -void in_place_matrix_transpose(RandomIterator first, - RandomIterator last, - uint64_t m) +struct edge_key_hash { - // in-place matrix transpose represented as row-major format with m - // number for columns - // https://stackoverflow.com/a/9320349/1608232 - const uint64_t mn1 = (last - first - 1); - const uint64_t n = (last - first) / m; - - std::vector visited(last - first, false); - - RandomIterator cycle = first; - while (++cycle != last) { - if (visited[cycle - first]) { - continue; - } - uint64_t a = cycle - first; - do { - a = (a == mn1) ? 
mn1 : (n * a) % mn1; - std::swap(*(first + a), *cycle); - visited[a] = true; - } while ((first + a) != cycle); + // www.techiedelight.com/use-std-pair-key-std-unordered_map-cpp/ + template + inline std::size_t operator()(const std::pair& e_key) const + { + return std::hash()(e_key.first * 8191 + e_key.second * 11003); } +}; + +/** + * @brief return consistent edge key given two vertices + */ +inline std::pair edge_key(const uint32_t v0, + const uint32_t v1) +{ + uint32_t i = std::max(v0, v1); + uint32_t j = std::min(v0, v1); + return std::make_pair(i, j); } -} // namespace RXMESH \ No newline at end of file +} // namespace detail +} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/util/vector.h b/include/rxmesh/util/vector.h index 4b2363d3..ab1e76d4 100644 --- a/include/rxmesh/util/vector.h +++ b/include/rxmesh/util/vector.h @@ -2,7 +2,7 @@ #include #include -namespace RXMESH { +namespace rxmesh { template struct Vector @@ -290,11 +290,11 @@ struct Vector __host__ __device__ __forceinline__ void normalize() { T r = norm(); - if(r == T(0.0)){ + if (r == T(0.0)) { for (uint32_t i = 0; i < N; ++i) { m_v[i] = 0; } - }else{ + } else { r = 1. 
/ r; (*this) *= r; } @@ -440,50 +440,50 @@ inline std::istream& operator>>(std::istream& input, const Vector& v) } // Alias -using Vector2d = Vector<2, double>; -using Vector2f = Vector<2, float>; -using Vector2i = Vector<2, int32_t>; +using Vector2d = Vector<2, double>; +using Vector2f = Vector<2, float>; +using Vector2i = Vector<2, int32_t>; using Vector2ui = Vector<2, uint32_t>; -using Vector2s = Vector<2, int16_t>; +using Vector2s = Vector<2, int16_t>; using Vector2us = Vector<2, uint16_t>; -using Vector2c = Vector<2, int8_t>; +using Vector2c = Vector<2, int8_t>; using Vector2uc = Vector<2, uint8_t>; -using Vector3d = Vector<3, double>; -using Vector3f = Vector<3, float>; -using Vector3i = Vector<3, int32_t>; +using Vector3d = Vector<3, double>; +using Vector3f = Vector<3, float>; +using Vector3i = Vector<3, int32_t>; using Vector3ui = Vector<3, uint32_t>; -using Vector3s = Vector<3, int16_t>; +using Vector3s = Vector<3, int16_t>; using Vector3us = Vector<3, uint16_t>; -using Vector3c = Vector<3, int8_t>; +using Vector3c = Vector<3, int8_t>; using Vector3uc = Vector<3, uint8_t>; -using Vector4d = Vector<4, double>; -using Vector4f = Vector<4, float>; -using Vector4i = Vector<4, int32_t>; +using Vector4d = Vector<4, double>; +using Vector4f = Vector<4, float>; +using Vector4i = Vector<4, int32_t>; using Vector4ui = Vector<4, uint32_t>; -using Vector4s = Vector<4, int16_t>; +using Vector4s = Vector<4, int16_t>; using Vector4us = Vector<4, uint16_t>; -using Vector4c = Vector<4, int8_t>; +using Vector4c = Vector<4, int8_t>; using Vector4uc = Vector<4, uint8_t>; -using Vector6d = Vector<6, double>; -using Vector6f = Vector<6, float>; -using Vector6i = Vector<6, int32_t>; +using Vector6d = Vector<6, double>; +using Vector6f = Vector<6, float>; +using Vector6i = Vector<6, int32_t>; using Vector6ui = Vector<6, uint32_t>; -using Vector6s = Vector<6, int16_t>; +using Vector6s = Vector<6, int16_t>; using Vector6us = Vector<6, uint16_t>; -using Vector6c = Vector<6, 
int8_t>; +using Vector6c = Vector<6, int8_t>; using Vector6uc = Vector<6, uint8_t>; -} // namespace RXMESH +} // namespace rxmesh // Hash namespace std { template -struct hash> +struct hash> { - std::size_t operator()(const RXMESH::Vector& v) const + std::size_t operator()(const rxmesh::Vector& v) const { std::size_t h = 0; for (int i = 0; i < N; i++) { diff --git a/tests/RXMesh_test/CMakeLists.txt b/tests/RXMesh_test/CMakeLists.txt index f76c7b35..e5716cff 100644 --- a/tests/RXMesh_test/CMakeLists.txt +++ b/tests/RXMesh_test/CMakeLists.txt @@ -3,7 +3,7 @@ add_executable( RXMesh_test ) set( SOURCE_LIST rxmesh_test_main.cu rxmesh_test.h - test_attribute.cu + test_attribute.cuh test_vector.cu test_util.cu test_iterator.cu @@ -11,6 +11,7 @@ set( SOURCE_LIST test_higher_queries.h query.cuh higher_query.cuh + test_for_each.h ) target_sources( RXMesh_test @@ -20,7 +21,7 @@ target_sources( RXMesh_test set_target_properties( RXMesh_test PROPERTIES FOLDER "tests") -#set_property(TARGET RXMesh_test PROPERTY CUDA_SEPARABLE_COMPILATION ON) +set_property(TARGET RXMesh_test PROPERTY CUDA_SEPARABLE_COMPILATION ON) source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "RXMesh_test" FILES ${SOURCE_LIST}) diff --git a/tests/RXMesh_test/benchmark.sh b/tests/RXMesh_test/benchmark.sh old mode 100644 new mode 100755 index ec11eba8..c83ae5f6 --- a/tests/RXMesh_test/benchmark.sh +++ b/tests/RXMesh_test/benchmark.sh @@ -1,5 +1,4 @@ #!/bin/bash -echo "This script re-generates RXMesh data in Figure 6 in the paper." echo "Please make sure to first compile the source code and then enter the input OBJ files directory." 
read -p "OBJ files directory (no trailing slash): " input_dir @@ -16,13 +15,7 @@ device_id=0 for file in $input_dir/*.obj; do if [ -f "$file" ]; then - echo $exe --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id - $exe --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id - - echo $exe -s --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id - $exe -s --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id - - echo $exe -p --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id - $exe -p --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id + echo $exe --gtest_filter=RXMeshStatic.Queries -input "$file" -num_run $num_run -device_id $device_id + $exe --gtest_filter=RXMeshStatic.Queries -input "$file" -num_run $num_run -device_id $device_id fi done \ No newline at end of file diff --git a/tests/RXMesh_test/higher_query.cuh b/tests/RXMesh_test/higher_query.cuh index fbf00d33..909543dc 100644 --- a/tests/RXMesh_test/higher_query.cuh +++ b/tests/RXMesh_test/higher_query.cuh @@ -3,107 +3,88 @@ #include #include -#include "rxmesh/kernels/rxmesh_iterator.cuh" -#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh" -#include "rxmesh/rxmesh.h" -#include "rxmesh/rxmesh_attribute.h" -#include "rxmesh/rxmesh_context.h" +#include "rxmesh/attribute.h" +#include "rxmesh/context.h" +#include "rxmesh/iterator.cuh" +#include "rxmesh/kernels/query_dispatcher.cuh" + /** - * higher_query() + * @brief perform 2-ring VV query */ -template -__launch_bounds__(blockThreads) __global__ - static void higher_query(const RXMESH::RXMeshContext context, - RXMESH::RXMeshAttribute d_src, - RXMESH::RXMeshAttribute output_container, - const bool oriented = false) +template +__global__ static void higher_query( + const rxmesh::Context context, + rxmesh::VertexAttribute input, + rxmesh::VertexAttribute output) { - using 
namespace RXMESH; - uint32_t block_offset = 0; - if constexpr (op == Op::EV || op == Op::EF) { - block_offset = context.get_edge_distribution()[blockIdx.x]; - } else if constexpr (op == Op::FV || op == Op::FE || op == Op::FF) { - block_offset = context.get_face_distribution()[blockIdx.x]; - } else if constexpr (op == Op::VV || op == Op::VE || op == Op::VF) { - block_offset = context.get_vertex_distribution()[blockIdx.x]; - } + using namespace rxmesh; // the mesh element that this thread is assigned to - uint32_t thread_element = INVALID32; - - // the location where thread_element will store its output - uint32_t element_offset; + VertexHandle thread_vertex; // number of vertices in the first ring uint32_t num_vv_1st_ring(0), num_vv(0); // computation done on the first ring/level // this is similar to the lambda function for query_block_dispatcher() - auto first_level_lambda = [&](uint32_t id, RXMeshIterator& iter) { - assert(iter.size() < output_container.get_num_attribute_per_element()); + auto first_ring_lambda = [&](VertexHandle id, + Iterator& iter) { + assert(iter.size() < output.get_num_attributes()); num_vv_1st_ring = iter.size(); - num_vv = num_vv_1st_ring; + num_vv = num_vv_1st_ring; // record the mesh element that this thread is assigned to - thread_element = id; - element_offset = block_offset + iter.local_id(); - - d_src(element_offset) = id; + thread_vertex = id; + input(thread_vertex) = thread_vertex; - output_container(element_offset, 0) = iter.size(); for (uint32_t i = 0; i < iter.size(); ++i) { - output_container(element_offset, i + 1) = iter[i]; + output(thread_vertex, i) = iter[i]; } }; + query_block_dispatcher(context, first_ring_lambda); - query_block_dispatcher(context, first_level_lambda, - oriented); - - uint32_t next_id = 1; + uint32_t next_id = 0; while (true) { - uint32_t next_vertex = INVALID32; + VertexHandle next_vertex; - if (thread_element != INVALID32 && next_id <= num_vv_1st_ring) { - next_vertex = output_container(element_offset, 
next_id); + if (thread_vertex.is_valid() && next_id < num_vv_1st_ring) { + next_vertex = output(thread_vertex, next_id); } - auto second_level_lambda = [&](uint32_t id, RXMeshIterator& iter) { + auto higher_rings_lambda = [&](const VertexHandle& id, + const VertexIterator& iter) { assert(id == next_vertex); for (uint32_t i = 0; i < iter.size(); ++i) { - if (iter[i] != thread_element) { + if (iter[i] != thread_vertex) { // make sure that we don't store duplicate outputs bool duplicate = false; - for (uint32_t j = 1; j <= num_vv; ++j) { - if (iter[i] == output_container(element_offset, j)) { + for (uint32_t j = 0; j < num_vv; ++j) { + if (iter[i] == output(thread_vertex, j)) { duplicate = true; break; } } if (!duplicate) { + output(thread_vertex, num_vv) = iter[i]; num_vv++; - output_container(element_offset, num_vv) = iter[i]; } } } }; - query_block_dispatcher(context, next_vertex, - second_level_lambda); + higher_query_block_dispatcher( + context, next_vertex, higher_rings_lambda); bool is_done = - (next_id > num_vv_1st_ring) || (thread_element == INVALID32); + (next_id >= num_vv_1st_ring) || !thread_vertex.is_valid(); if (__syncthreads_and(is_done)) { break; } next_id++; } - - if (thread_element != INVALID32) { - output_container(element_offset, 0) = num_vv; - } } \ No newline at end of file diff --git a/tests/RXMesh_test/query.cuh b/tests/RXMesh_test/query.cuh index 3b70b345..e4896c38 100644 --- a/tests/RXMesh_test/query.cuh +++ b/tests/RXMesh_test/query.cuh @@ -3,47 +3,34 @@ #include #include -#include "rxmesh/kernels/rxmesh_iterator.cuh" -#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh" -#include "rxmesh/rxmesh.h" -#include "rxmesh/rxmesh_attribute.h" -#include "rxmesh/rxmesh_context.h" +#include "rxmesh/attribute.h" +#include "rxmesh/context.h" +#include "rxmesh/iterator.cuh" +#include "rxmesh/kernels/query_dispatcher.cuh" + /** - * query() + * @brief perform query of type of and store the output as well as the + * corresponding input */ -template 
-__launch_bounds__(blockThreads) __global__ - static void query(const RXMESH::RXMeshContext context, - RXMESH::RXMeshAttribute d_src, - RXMESH::RXMeshAttribute output_container, - const bool oriented = false) +template +__global__ static void query_kernel(const rxmesh::Context context, + InputAttributeT input, + OutputAttributeT output, + const bool oriented = false) { - using namespace RXMESH; - - static_assert(op != Op::EE, "Op::EE is not supported!"); - - assert(output_container.is_device_allocated()); - - uint32_t block_offset = 0; - if constexpr (op == Op::EV || op == Op::EF) { - block_offset = context.get_edge_distribution()[blockIdx.x]; - } else if constexpr (op == Op::FV || op == Op::FE || op == Op::FF) { - block_offset = context.get_face_distribution()[blockIdx.x]; - } else if constexpr (op == Op::VV || op == Op::VE || op == Op::VF) { - block_offset = context.get_vertex_distribution()[blockIdx.x]; - } - - auto store_lambda = [&](uint32_t id, RXMeshIterator& iter) { - assert(iter.size() < output_container.get_num_attribute_per_element()); - - uint32_t id_offset = block_offset + iter.local_id(); - d_src(id_offset) = id; + using namespace rxmesh; - output_container(id_offset, 0) = iter.size(); + auto store_lambda = [&](InputHandleT& id, Iterator& iter) { + input(id) = id; for (uint32_t i = 0; i < iter.size(); ++i) { - output_container(id_offset, i + 1) = iter[i]; + output(id, i) = iter[i]; } }; diff --git a/tests/RXMesh_test/rxmesh_test.h b/tests/RXMesh_test/rxmesh_test.h index 751db854..1fb28835 100644 --- a/tests/RXMesh_test/rxmesh_test.h +++ b/tests/RXMesh_test/rxmesh_test.h @@ -2,97 +2,54 @@ #include #include -#include "rxmesh/rxmesh_attribute.h" -#include "rxmesh/rxmesh_context.h" +#include +#include "rxmesh/attribute.h" +#include "rxmesh/context.h" #include "rxmesh/rxmesh_static.h" #include "rxmesh/util/util.h" class RXMeshTest { + private: + bool m_quite; + std::vector> m_h_FE; + public: RXMeshTest(const RXMeshTest&) = delete; - RXMeshTest(bool quite = 
true) : m_quite(quite){}; - - /** - * run_query_verifier() - */ - template - bool run_query_verifier( - const RXMESH::RXMeshStatic& rxmesh, - const RXMESH::Op op, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) + RXMeshTest(const rxmesh::RXMeshStatic& rxmesh, + const std::vector>& fv, + bool quite = true) + : m_quite(quite) { + assert(rxmesh.m_edges_map.size() != 0); - // run test on specific query operation on an instance of rxmesh. this - // does not account for patching so works only on big matrices data- - // structure - - populate_FE(rxmesh); - switch (op) { - case RXMESH::Op::VV: - return test_VV(rxmesh, input_container, output_container); - break; - - case RXMESH::Op::VE: - return test_VE(rxmesh, input_container, output_container); - break; - - case RXMESH::Op::VF: - return test_VF(rxmesh, input_container, output_container); - break; - - case RXMESH::Op::FV: - return test_FV(rxmesh, input_container, output_container); - break; - - case RXMESH::Op::FE: - return test_FE(rxmesh, input_container, output_container); - break; - - case RXMESH::Op::FF: - return test_FF(rxmesh, input_container, output_container); - break; - - case RXMESH::Op::EV: - return test_EV(rxmesh, input_container, output_container); - break; - case RXMESH::Op::EF: - return test_EF(rxmesh, input_container, output_container); - break; - - default: - RXMESH_ERROR("RXMeshTest::run_test() Op is not supported!!"); - break; - } - return false; - } + for (uint32_t f = 0; f < rxmesh.m_num_faces; ++f) { + uint32_t i = f; - /** - * run_higher_query_verifier() - */ - template - bool run_higher_query_verifier( - const RXMESH::RXMeshStatic& rxmesh, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) - { - populate_FE(rxmesh); - return test_VVV(rxmesh, input_container, output_container); + std::vector fe(3); + + for (uint32_t j = 0; j < 3; ++j) { + + uint32_t v0 = fv[i][j]; + uint32_t v1 = fv[i][(j + 1) % 
3]; + + std::pair my_edge = + rxmesh::detail::edge_key(v0, v1); + uint32_t edge_id = rxmesh.get_edge_id(my_edge); + fe[j] = edge_id; + } + m_h_FE.push_back(fe); + } } - /** - * run_ltog_mapping_test() - */ - template - bool run_ltog_mapping_test(const RXMESH::RXMesh& rxmesh) + bool run_ltog_mapping_test(const rxmesh::RXMesh& rxmesh, + const std::vector>& fv) { // check if the mapping created for each patch is consistent // i.e., what you have in the local index space represents the global // space - populate_FE(rxmesh); for (uint32_t p = 0; p < rxmesh.m_num_patches; ++p) { bool edges_ok(true), faces_ok(true); check_mapping(rxmesh, p, edges_ok, faces_ok); @@ -103,59 +60,21 @@ class RXMeshTest return true; } - private: - bool m_quite; - std::vector> m_h_FE; - - template - void populate_FE(const RXMESH::RXMesh& rxmesh) - { - - // populate m_h_FE (in global space) with global edge numbers - // m_h_FE should be uninitialized - // should call this only if verification is needed. - - if (m_h_FE.size() > 0) { - return; - } - m_h_FE.clear(); - - if (rxmesh.m_edges_map.size() == 0) { - RXMESH_ERROR( - "RXMeshTest::populate_FE() can not call me before" - " populating m_edges_map"); - } - - for (uint32_t f = 0; f < rxmesh.m_num_faces; ++f) { - uint32_t i = f; - - std::vector ff(3); - - for (uint32_t j = 0; j < 3; ++j) { - uint32_t v0 = rxmesh.m_fvn[i][j]; - uint32_t v1 = - (j != 2) ? rxmesh.m_fvn[i][j + 1] : rxmesh.m_fvn[i][0]; - std::pair my_edge = rxmesh.edge_key(v0, v1); - uint32_t edge_id = rxmesh.get_edge_id(my_edge); - ff[j] = edge_id; - } - m_h_FE.push_back(ff); - } - } - - private: - template - bool test_VVV(const RXMESH::RXMeshStatic& rxmesh, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) + /** + * @brief verify VV query. 
If is_higher_query is true, it verifies 2-ring + * queries + */ + bool run_test(const rxmesh::RXMeshStatic& rxmesh, + const std::vector>& fv, + const rxmesh::VertexAttribute& input, + const rxmesh::VertexAttribute& output, + const bool is_higher_query = false) { - - // construct VV - std::vector> v_v(rxmesh.m_num_vertices, + std::vector> v_v(rxmesh.get_num_vertices(), std::vector(0)); - auto e_it = rxmesh.m_edges_map.begin(); + auto e_it = rxmesh.m_edges_map.begin(); auto e_end = rxmesh.m_edges_map.end(); for (; e_it != e_end; e_it++) { @@ -165,184 +84,219 @@ class RXMeshTest v_v[vertices.second].push_back(vertices.first); } - // use VV to construct VVV - std::vector> v_v_v = v_v; - for (uint32_t v = 0; v < v_v_v.size(); ++v) { + if (is_higher_query) { + // use VV to construct VVV + std::vector> v_v_v = v_v; + for (uint32_t v = 0; v < v_v_v.size(); ++v) { - // loop over the v_v list of the vertex v - for (uint32_t i = 0; i < v_v[v].size(); ++i) { + // loop over the v_v list of the vertex v + for (uint32_t i = 0; i < v_v[v].size(); ++i) { - // this is a vertex in the 1-ring (v_v) of v - uint32_t n = v_v_v[v][i]; + // this is a vertex in the 1-ring (v_v) of v + uint32_t n = v_v_v[v][i]; - // loop over the v_v list (1-ring) of n - for (uint32_t j = 0; j < v_v[n].size(); ++j) { + // loop over the v_v list (1-ring) of n + for (uint32_t j = 0; j < v_v[n].size(); ++j) { - // a candidate to be added to the 2-ring of v - uint32_t candid = v_v[n][j]; + // a candidate to be added to the 2-ring of v + uint32_t candid = v_v[n][j]; - // but we need to check first if it is not duplicate and - // it is not v itself - if (candid != v && - RXMESH::find_index(candid, v_v_v[v]) == - std::numeric_limits::max()) { + // but we need to check first if it is not duplicate and + // it is not v itself + if (candid != v && + rxmesh::find_index(candid, v_v_v[v]) == + std::numeric_limits::max()) { - v_v_v[v].push_back(candid); + v_v_v[v].push_back(candid); + } } } } - } - - - // two-way 
verification - return verifier(rxmesh.get_patcher()->get_vertex_patch().data(), v_v_v, - input_container, output_container); - } - - template - bool test_VV(const RXMESH::RXMeshStatic& rxmesh, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) - { - - // construct VV - - std::vector> v_v(rxmesh.m_num_vertices, - std::vector(0)); - - auto e_it = rxmesh.m_edges_map.begin(); - auto e_end = rxmesh.m_edges_map.end(); - for (; e_it != e_end; e_it++) { - std::pair vertices = e_it->first; - - v_v[vertices.first].push_back(vertices.second); - v_v[vertices.second].push_back(vertices.first); + return verifier( + v_v_v, + rxmesh, + rxmesh.m_h_num_owned_v, + rxmesh.m_h_patches_ltog_v, + rxmesh.m_h_patches_ltog_v, + input, + output); } - // two-way verification - return verifier(rxmesh.get_patcher()->get_vertex_patch().data(), v_v, - input_container, output_container); + return verifier( + v_v, + rxmesh, + rxmesh.m_h_num_owned_v, + rxmesh.m_h_patches_ltog_v, + rxmesh.m_h_patches_ltog_v, + input, + output); } - template - bool test_VE(const RXMESH::RXMeshStatic& rxmesh, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) + /** + * @brief verify VE query + */ + bool run_test(const rxmesh::RXMeshStatic& rxmesh, + const std::vector>& fv, + const rxmesh::VertexAttribute& input, + const rxmesh::VertexAttribute& output) { - - // construct VE - std::vector> v_e(rxmesh.m_num_vertices, std::vector(0)); - auto e_it = rxmesh.m_edges_map.begin(); + auto e_it = rxmesh.m_edges_map.begin(); auto e_end = rxmesh.m_edges_map.end(); for (; e_it != e_end; e_it++) { std::pair vertices = e_it->first; - uint32_t edge = e_it->second; + uint32_t edge = e_it->second; v_e[vertices.first].push_back(edge); v_e[vertices.second].push_back(edge); } - // two-way verification - return verifier(rxmesh.get_patcher()->get_vertex_patch().data(), v_e, - input_container, output_container); + return verifier( + v_e, + 
rxmesh, + rxmesh.m_h_num_owned_v, + rxmesh.m_h_patches_ltog_v, + rxmesh.m_h_patches_ltog_e, + input, + output); } - template - bool test_VF(const RXMESH::RXMeshStatic& rxmesh, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) + /** + * @brief verify VF query + */ + bool run_test(const rxmesh::RXMeshStatic& rxmesh, + const std::vector>& fv, + const rxmesh::VertexAttribute& input, + const rxmesh::VertexAttribute& output) { - // construct FV - + if (rxmesh.m_num_faces != fv.size()) { + return false; + } std::vector> v_f(rxmesh.m_num_vertices, std::vector(0)); - // TODO this depends on m_fvn which does not record any changes - // but it is what the user has passed. Should compute v_f based on - // m_edge_map for consistency - uint32_t f_deg = rxmesh.m_face_degree; - for (uint32_t f = 0; f < rxmesh.m_num_faces; f++) { - for (uint32_t v = 0; v < f_deg; v++) { - uint32_t vert = rxmesh.m_fvn[f][v]; + for (uint32_t f = 0; f < fv.size(); f++) { + for (uint32_t v = 0; v < 3; v++) { + uint32_t vert = fv[f][v]; v_f[vert].push_back(f); } } - // two-way verification - return verifier(rxmesh.get_patcher()->get_vertex_patch().data(), v_f, - input_container, output_container); + return verifier( + v_f, + rxmesh, + rxmesh.m_h_num_owned_v, + rxmesh.m_h_patches_ltog_v, + rxmesh.m_h_patches_ltog_f, + input, + output); } - template - bool test_FV(const RXMESH::RXMeshStatic& rxmesh, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) + /** + * @brief verify EV query + */ + bool run_test(const rxmesh::RXMeshStatic& rxmesh, + const std::vector>& fv, + const rxmesh::EdgeAttribute& input, + const rxmesh::EdgeAttribute& output) { - // construct FV - - uint32_t f_deg = rxmesh.m_face_degree; - - std::vector> f_v(rxmesh.m_num_faces, - std::vector(f_deg)); - - for (uint32_t f = 0; f < rxmesh.m_num_faces; f++) { + std::vector> e_v(rxmesh.m_num_edges, + std::vector(2)); - std::memcpy(f_v[f].data(), 
rxmesh.m_fvn[f].data(), - f_deg * sizeof(uint32_t)); + auto e_it = rxmesh.m_edges_map.begin(); + while (e_it != rxmesh.m_edges_map.end()) { + e_v[e_it->second][0] = (e_it->first).first; + e_v[e_it->second][1] = (e_it->first).second; + e_it++; } - // two-way verification - return verifier(rxmesh.get_patcher()->get_face_patch().data(), f_v, - input_container, output_container); + return verifier( + e_v, + rxmesh, + rxmesh.m_h_num_owned_e, + rxmesh.m_h_patches_ltog_e, + rxmesh.m_h_patches_ltog_v, + input, + output); } - template - bool test_FE(const RXMESH::RXMeshStatic& rxmesh, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) + /** + * @brief verify EF query + */ + bool run_test(const rxmesh::RXMeshStatic& rxmesh, + const std::vector>& fv, + const rxmesh::EdgeAttribute& input, + const rxmesh::EdgeAttribute& output) { - - // construct FE - - uint32_t f_deg = rxmesh.m_face_degree; - - std::vector> f_e(rxmesh.m_num_faces, + std::vector> e_f(rxmesh.m_num_edges, std::vector(0)); for (uint32_t f = 0; f < rxmesh.m_num_faces; f++) { - f_e[f].reserve(f_deg); - } - - for (uint32_t f = 0; f < rxmesh.m_num_faces; f++) { - uint32_t e0 = m_h_FE[f][0]; - uint32_t e1 = m_h_FE[f][1]; - uint32_t e2 = m_h_FE[f][2]; - - f_e[f].push_back(e0); - f_e[f].push_back(e1); - f_e[f].push_back(e2); + for (uint32_t e = 0; e < 3; e++) { + uint32_t edge = m_h_FE[f][e]; + e_f[edge].push_back(f); + } } - - // two-way verification - return verifier(rxmesh.get_patcher()->get_face_patch().data(), f_e, - input_container, output_container); + return verifier( + e_f, + rxmesh, + rxmesh.m_h_num_owned_e, + rxmesh.m_h_patches_ltog_e, + rxmesh.m_h_patches_ltog_f, + input, + output); } - template - bool test_FF(const RXMESH::RXMeshStatic& rxmesh, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) + /** + * @brief verify FV query + */ + bool run_test(const rxmesh::RXMeshStatic& rxmesh, + const std::vector>& fv, + 
const rxmesh::FaceAttribute& input, + const rxmesh::FaceAttribute& output) { + return verifier( + fv, + rxmesh, + rxmesh.m_h_num_owned_f, + rxmesh.m_h_patches_ltog_f, + rxmesh.m_h_patches_ltog_v, + input, + output); + } - // construct FF + /** + * @brief verify FE query + */ + bool run_test(const rxmesh::RXMeshStatic& rxmesh, + const std::vector>& fv, + const rxmesh::FaceAttribute& input, + const rxmesh::FaceAttribute& output) + { + return verifier( + m_h_FE, + rxmesh, + rxmesh.m_h_num_owned_f, + rxmesh.m_h_patches_ltog_f, + rxmesh.m_h_patches_ltog_e, + input, + output); + } + /** + * @brief verify FF query + */ + bool run_test(const rxmesh::RXMeshStatic& rxmesh, + const std::vector>& fv, + const rxmesh::FaceAttribute& input, + const rxmesh::FaceAttribute& output) + { std::vector> f_f(rxmesh.m_num_faces, std::vector(0)); @@ -376,137 +330,108 @@ class RXMeshTest } } - // two-way verification - return verifier(rxmesh.get_patcher()->get_face_patch().data(), f_f, - input_container, output_container); - } - - template - bool test_EV(const RXMESH::RXMeshStatic& rxmesh, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) - { - - // construct EV - std::vector> e_v(rxmesh.m_num_edges, - std::vector(2)); - - auto e_it = rxmesh.m_edges_map.begin(); - while (e_it != rxmesh.m_edges_map.end()) { - e_v[e_it->second][0] = (e_it->first).first; - e_v[e_it->second][1] = (e_it->first).second; - e_it++; - } - - - // two-way verification - return verifier(rxmesh.get_patcher()->get_edge_patch().data(), e_v, - input_container, output_container); + return verifier( + f_f, + rxmesh, + rxmesh.m_h_num_owned_f, + rxmesh.m_h_patches_ltog_f, + rxmesh.m_h_patches_ltog_f, + input, + output); } - template - bool test_EF(const RXMESH::RXMeshStatic& rxmesh, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) - { - - // construct EF - - std::vector> e_f(rxmesh.m_num_edges, - std::vector(0)); - - uint32_t 
f_deg = rxmesh.m_face_degree; - for (uint32_t f = 0; f < rxmesh.m_num_faces; f++) { - for (uint32_t e = 0; e < f_deg; e++) { - uint32_t edge = m_h_FE[f][e]; - e_f[edge].push_back(f); - } - } - - // two-way verification - return verifier(rxmesh.get_patcher()->get_edge_patch().data(), e_f, - input_container, output_container); - } - bool verifier(const uint32_t* element_patch, - const std::vector>& mesh_ele, - const RXMESH::RXMeshAttribute& input_container, - const RXMESH::RXMeshAttribute& output_container) + private: + template + bool verifier(const std::vector>& gt, + const rxmesh::RXMeshStatic& rxmesh, + const std::vector& num_owned_input_elements, + const std::vector>& input_ltog, + const std::vector>& output_ltog, + const InputAttributeT& input, + const OutputAttributeT& output) { + auto global_id_from_handle = [&](OutputHandleT xxh) -> uint32_t { + auto pl = xxh.unpack(); + return output_ltog[pl.first][pl.second]; + }; + + for (uint32_t p = 0; p < rxmesh.get_num_patches(); ++p) { + for (uint32_t e = 0; e < num_owned_input_elements[p]; ++e) { + InputHandleT eh(p, e); + if (input(eh) != eh) { + return false; + } - bool results = true; - - const uint32_t input_size = input_container.get_num_mesh_elements(); - - assert(input_size == output_container.get_num_mesh_elements()); + uint32_t e_global = input_ltog[p][e]; - for (uint32_t v = 0; v < input_size; v++) { + // Check correctness + // check if all output XX are correct + uint32_t num_xx = 0; + for (uint32_t i = 0; i < output.get_num_attributes(); ++i) { + OutputHandleT xxh = output(eh, i); + if (xxh.is_valid()) { + num_xx++; - const uint32_t src_ele = input_container(v); + // extract local id from xxh's unique id + uint32_t xx_global = global_id_from_handle(xxh); - if (src_ele == INVALID32) { - // means it is isolated element so don't bother - continue; - } - // check for correctness (e.g, all edges in h_output are actually - // edges incident to the vertex v) - for (uint32_t i = 1; i <= output_container(v, 0); 
++i) { + uint32_t id = + rxmesh::find_index(xx_global, gt[e_global]); - uint32_t id = RXMESH::find_index(output_container(v, i), - mesh_ele[src_ele]); - - if (id == std::numeric_limits::max()) { - if (!m_quite) { - RXMESH_ERROR( - "RXMeshTest::verifier() element {} is not incident " - "to {}", - output_container(v, i), src_ele); + if (id == std::numeric_limits::max()) { + return false; + } } - results = false; } - } - // check for completeness (e.g, that all edges incident to the - // vertex v are actually returned in output_container) - for (uint32_t i = 0; i < mesh_ele[src_ele].size(); i++) { - uint32_t e = mesh_ele[src_ele][i]; - bool found = false; - for (uint32_t j = 1; j <= output_container(v, 0); j++) { - if (output_container(v, j) == e) { - found = true; - break; - } + if (num_xx != gt[e_global].size()) { + return false; } - if (!found) { - if (!m_quite) { - RXMESH_ERROR( - "RXMeshTest::verifier() element {} is not incident " - "to {}", - e, src_ele); + // Check completeness + // check if all ground truth XX are in the output + for (uint32_t i = 0; i < gt[e_global].size(); ++i) { + uint32_t xx = gt[e_global][i]; + + bool found = false; + for (uint32_t j = 0; j < output.get_num_attributes(); ++j) { + OutputHandleT xxh = output(eh, j); + if (xxh.is_valid()) { + uint32_t xx_global = global_id_from_handle(xxh); + if (xx_global == xx) { + found = true; + break; + } + } + } + + if (!found) { + return false; } - results = false; } } } - return results; + return true; } - template - void check_mapping(const RXMESH::RXMesh& rxmesh, - const uint32_t patch_id, - bool& is_edges_ok, - bool& is_faces_ok) + + void check_mapping(const rxmesh::RXMesh& rxmesh, + const uint32_t patch_id, + bool& is_edges_ok, + bool& is_faces_ok) { // check if the mapping is consistent i.e., going from local to // global gives the same results as from global to local // Number of edges and faces in this patch - uint32_t num_p_edges = rxmesh.m_h_ad_size[patch_id].y >> 1; - uint32_t num_p_faces 
= static_cast( - static_cast(rxmesh.m_h_ad_size[patch_id].w) / 3.0f); + uint32_t num_p_edges = rxmesh.m_h_patches_info[patch_id].num_edges; + uint32_t num_p_faces = rxmesh.m_h_patches_info[patch_id].num_faces; assert(num_p_edges <= std::numeric_limits::max()); assert(num_p_faces <= std::numeric_limits::max()); @@ -516,15 +441,14 @@ class RXMeshTest is_faces_ok = check_mapping_faces(rxmesh, patch_id, num_p_faces); } - template - bool check_mapping_edges(const RXMESH::RXMesh& rxmesh, - const uint32_t patch_id, - const uint32_t num_p_edges) + bool check_mapping_edges(const rxmesh::RXMesh& rxmesh, + const uint32_t patch_id, + const uint32_t num_p_edges) { // 1) For each local edge in the patch, get its global id using the // mapping (using m_h_patches_ltog_e) - // 2) get the local edge's local vertices (using m_h_patches_edges) + // 2) get the local edge's local vertices (using m_h_patches_ev) // 3) map the local vertices to their global id (using // m_h_patches_ltog_v) @@ -539,26 +463,22 @@ class RXMeshTest // 1) // convert the local edge to global one - uint32_t e_ltog = - (rxmesh.m_h_patches_ltog_e.at(patch_id).at(e_l) >> 1); + uint32_t e_ltog = rxmesh.m_h_patches_ltog_e.at(patch_id).at(e_l); // 2) // get the local vertices - uint16_t v0_l = rxmesh.m_h_patches_edges.at(patch_id).at(e_l * 2); - uint16_t v1_l = - rxmesh.m_h_patches_edges.at(patch_id).at(e_l * 2 + 1); + uint16_t v0_l = rxmesh.m_h_patches_ev.at(patch_id).at(e_l * 2); + uint16_t v1_l = rxmesh.m_h_patches_ev.at(patch_id).at(e_l * 2 + 1); // 3) // convert the local vertices to global - uint32_t v0_ltog = - (rxmesh.m_h_patches_ltog_v.at(patch_id).at(v0_l) >> 1); - uint32_t v1_ltog = - (rxmesh.m_h_patches_ltog_v.at(patch_id).at(v1_l) >> 1); + uint32_t v0_ltog = rxmesh.m_h_patches_ltog_v.at(patch_id).at(v0_l); + uint32_t v1_ltog = rxmesh.m_h_patches_ltog_v.at(patch_id).at(v1_l); // 4) // use the convered vertices to look for the edge global id - auto my_edge = rxmesh.edge_key(v0_ltog, v1_ltog); + auto my_edge 
= rxmesh::detail::edge_key(v0_ltog, v1_ltog); uint32_t e_g; try { @@ -570,7 +490,11 @@ class RXMeshTest "find the corresponding edge between global vertices " "{} and {} with local id {} and in patch {} of " "converted to global vertices", - v0_ltog, v1_ltog, v0_l, v1_l, patch_id); + v0_ltog, + v1_ltog, + v0_l, + v1_l, + patch_id); } return false; } @@ -588,7 +512,13 @@ class RXMeshTest "{}, local edge id = {}, mapped to = {}, local " "vertices id = ({}, {}) mapped to= ({}, {}), global " "edge connecting the mapped global vertices = {}", - patch_id, e_l, e_ltog, v0_l, v1_l, v0_ltog, v1_ltog, + patch_id, + e_l, + e_ltog, + v0_l, + v1_l, + v0_ltog, + v1_ltog, e_g); } return false; @@ -597,64 +527,59 @@ class RXMeshTest return true; } - template - bool check_mapping_faces(const RXMESH::RXMesh& rxmesh, - const uint32_t patch_id, - const uint32_t num_p_faces) + bool check_mapping_faces(const rxmesh::RXMesh& rxmesh, + const uint32_t patch_id, + const uint32_t num_p_faces) { - using namespace RXMESH; + using namespace rxmesh; // 1) for each local face in the patch, get its global id using the // mapping (using m_h_patches_ltog_f) - // 2) get the local face's local edges (using m_h_patches_faces) + // 2) get the local face's local edges (using m_h_patches_fe) // 3) map the local edges to their global id //(using m_h_patches_ltog_v) // 4) use the converted edges to get their global face id (using - // m_h_patches_faces) + // m_h_patches_fe) // 5) check if the resulting global face id in 4) matches that // obtained in 1) - uint32_t deg = rxmesh.m_face_degree; - std::vector e_l(deg); - std::vector e_g(deg); - std::vector e_ltog(deg); + std::vector e_l(3); + std::vector e_g(3); + std::vector e_ltog(3); for (uint16_t f_l = 0; f_l < num_p_faces; ++f_l) { // 1) // convert the local face to global one - uint32_t f_ltog = - (rxmesh.m_h_patches_ltog_f.at(patch_id).at(f_l) >> 1); + uint32_t f_ltog = rxmesh.m_h_patches_ltog_f.at(patch_id).at(f_l); // 2) // get the local edges - for 
(uint32_t i = 0; i < deg; ++i) { - e_l[i] = - rxmesh.m_h_patches_faces.at(patch_id).at(f_l * deg + i); + for (uint32_t i = 0; i < 3; ++i) { + e_l[i] = rxmesh.m_h_patches_fe.at(patch_id).at(f_l * 3 + i); // shift right because the first bit is reserved for edge // direction flag_t dir(0); - RXMeshContext::unpack_edge_dir(e_l[i], e_l[i], dir); + Context::unpack_edge_dir(e_l[i], e_l[i], dir); } // 3) // convert the local edges to global - for (uint32_t i = 0; i < deg; ++i) { - e_ltog[i] = - (rxmesh.m_h_patches_ltog_e.at(patch_id).at(e_l[i]) >> 1); + for (uint32_t i = 0; i < 3; ++i) { + e_ltog[i] = rxmesh.m_h_patches_ltog_e.at(patch_id).at(e_l[i]); } // 4) // from the mapped face (f_ltog) get its global edges - for (uint32_t i = 0; i < deg; ++i) { + for (uint32_t i = 0; i < 3; ++i) { e_g[i] = m_h_FE[f_ltog][i]; } // 5) // check if the global edges matches the mapping edges - for (uint32_t i = 0; i < deg; ++i) { + for (uint32_t i = 0; i < 3; ++i) { if (e_g[i] != e_ltog[i]) { if (!m_quite) { RXMESH_ERROR( @@ -664,9 +589,18 @@ class RXMeshTest "edges id = ({}, {}, {}), mapped to = ({}, {}, " "{}), global edges obtained from the mapped global " "face= ({}, {}, {})", - patch_id, f_l, f_ltog, e_l[0], e_l[1], e_l[2], - e_ltog[0], e_ltog[1], e_ltog[2], e_ltog[0], - e_ltog[1], e_ltog[2]); + patch_id, + f_l, + f_ltog, + e_l[0], + e_l[1], + e_l[2], + e_ltog[0], + e_ltog[1], + e_ltog[2], + e_ltog[0], + e_ltog[1], + e_ltog[2]); } return false; } diff --git a/tests/RXMesh_test/rxmesh_test_main.cu b/tests/RXMesh_test/rxmesh_test_main.cu index d6ae3274..a8634c69 100644 --- a/tests/RXMesh_test/rxmesh_test_main.cu +++ b/tests/RXMesh_test/rxmesh_test_main.cu @@ -5,28 +5,26 @@ #include "rxmesh/util/vector.h" using dataT = float; -std::vector> Verts; struct RXMeshTestArg { - uint32_t num_run = 1; - uint32_t device_id = 0; + uint32_t num_run = 1; + uint32_t device_id = 0; std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj"; std::string output_folder = STRINGIFY(OUTPUT_DIR); - 
bool quite = false; - bool shuffle = false; - bool sort = false; - int argc = argc; - char** argv = argv; + bool quite = false; + int argc = argc; + char** argv = argv; } rxmesh_args; #include "test_higher_queries.h" #include "test_queries.h" - +#include "test_attribute.cuh" +#include "test_for_each.h" int main(int argc, char** argv) { - using namespace RXMESH; + using namespace rxmesh; Log::init(); ::testing::InitGoogleTest(&argc, argv); @@ -41,10 +39,8 @@ int main(int argc, char** argv) " Default is {} \n" " Hint: Only accepts OBJ files\n" " -o: JSON file output folder. Default is {} \n" - " -num_run: Number of iterations for performance testing. Default is {} \n" - " -q: Run in quite mode.\n" - " -s: Shuffle input. Default is false.\n" - " -p: Sort input using patching output. Default is false.\n" + " -num_run: Number of iterations for performance testing. Default is {} \n" + " -q: Run in quite mode. Default is false\n" " -device_id: GPU device ID. Default is {}", rxmesh_args.obj_file_name, rxmesh_args.output_folder ,rxmesh_args.num_run,rxmesh_args.device_id); // clang-format on @@ -72,18 +68,11 @@ int main(int argc, char** argv) if (cmd_option_exists(argv, argc + argv, "-q")) { rxmesh_args.quite = true; } - if (cmd_option_exists(argv, argc + argv, "-s")) { - rxmesh_args.shuffle = true; - } - if (cmd_option_exists(argv, argc + argv, "-p")) { - rxmesh_args.sort = true; - } } if (!rxmesh_args.quite) { RXMESH_TRACE("input= {}", rxmesh_args.obj_file_name); RXMESH_TRACE("output_folder= {}", rxmesh_args.output_folder); - RXMESH_TRACE("PATCH_SIZE= {}", PATCH_SIZE); RXMESH_TRACE("num_run= {}", rxmesh_args.num_run); RXMESH_TRACE("device_id= {}", rxmesh_args.device_id); } diff --git a/tests/RXMesh_test/test_attribute.cu b/tests/RXMesh_test/test_attribute.cu deleted file mode 100644 index 049c9139..00000000 --- a/tests/RXMesh_test/test_attribute.cu +++ /dev/null @@ -1,458 +0,0 @@ -#include "gtest/gtest.h" -#include "rxmesh/rxmesh_attribute.h" -#include 
"rxmesh/util/macros.h" -#include "rxmesh/util/vector.h" - -/** - * test_vector() - */ -__global__ static void test_vector( - RXMESH::RXMeshAttribute mesh_attr, - uint32_t* suceess) -{ - - if (threadIdx.x == 0 && blockIdx.x == 0) { - *suceess = 1; - - assert((mesh_attr.get_allocated() & RXMESH::DEVICE) == RXMESH::DEVICE); - uint32_t num_mesh_elements = mesh_attr.get_num_mesh_elements(); - for (uint32_t i = 0; i < num_mesh_elements; ++i) { - const auto& vec = mesh_attr(i); - if (vec[0] != i + 0 || vec[1] != i + 1 || vec[2] != i + 2) { - *suceess = 0; - return; - } - } - } -} - -/** - * test_values() - */ -template -__global__ static void test_values(RXMESH::RXMeshAttribute mesh_attr, - uint32_t* suceess) -{ - - if (threadIdx.x == 0 && blockIdx.x == 0) { - *suceess = 1; - - assert((mesh_attr.get_allocated() & RXMESH::DEVICE) == RXMESH::DEVICE); - uint32_t num_mesh_elements = mesh_attr.get_num_mesh_elements(); - for (uint32_t i = 0; i < num_mesh_elements; ++i) { - for (uint32_t j = 0; j < mesh_attr.get_num_attribute_per_element(); - ++j) { - if (mesh_attr(i, j) != i + j) { - - *suceess = 0; - return; - } - } - } - } -} - -/** - * generate_values() - */ -template -__global__ static void generate_values(RXMESH::RXMeshAttribute mesh_attr) -{ - - if (threadIdx.x == 0 && blockIdx.x == 0) { - assert((mesh_attr.get_allocated() & RXMESH::DEVICE) == RXMESH::DEVICE); - - uint32_t num_mesh_elements = mesh_attr.get_num_mesh_elements(); - for (uint32_t i = 0; i < num_mesh_elements; ++i) { - for (uint32_t j = 0; j < mesh_attr.get_num_attribute_per_element(); - ++j) { - mesh_attr(i, j) = i + j; - } - } - } -} - - -bool test_host(uint32_t attributes_per_element) -{ - using namespace RXMESH; - // mesh attr on host - uint32_t num_mesh_elements = 2048; - RXMESH::RXMeshAttribute rxmesh_attr; - - rxmesh_attr.set_name("float_attr"); - rxmesh_attr.init(num_mesh_elements, attributes_per_element, RXMESH::HOST, - RXMESH::AoS); - - // generate some numbers as AoS - for (uint32_t i = 0; i < 
num_mesh_elements; ++i) { - for (uint32_t j = 0; j < attributes_per_element; ++j) { - rxmesh_attr(i, j) = i + j; - } - } - - // change the layout to SoA (good for gpu) - rxmesh_attr.change_layout(RXMESH::HOST); - - // move memory to device - rxmesh_attr.move(RXMESH::HOST, RXMESH::DEVICE); - - - // device success variable - uint32_t* d_success = nullptr; - CUDA_ERROR(cudaMalloc((void**)&d_success, sizeof(uint32_t))); - - - // actual testing - test_values<<<1, 1>>>(rxmesh_attr, d_success); - - CUDA_ERROR(cudaPeekAtLastError()); - CUDA_ERROR(cudaGetLastError()); - CUDA_ERROR(cudaDeviceSynchronize()); - - // host success variable - uint32_t h_success(0); - CUDA_ERROR(cudaMemcpy(&h_success, d_success, sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - - // free device - GPU_FREE(d_success); - - // release rxmesh_attribute memory on host and device - rxmesh_attr.release(); - - // reporting - return h_success == 1; -} - - -bool test_device(uint32_t attributes_per_element) -{ - using namespace RXMESH; - // Test generating values on device and processing it on host - - // mesh attr on host (but allocated on device) - uint32_t num_mesh_elements = 2048; - RXMESH::RXMeshAttribute rxmesh_attr; - rxmesh_attr.set_name("int_attr"); - rxmesh_attr.init(num_mesh_elements, attributes_per_element, RXMESH::DEVICE); - - - // generate some numbers on device - generate_values<<<1, 1>>>(rxmesh_attr); - - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaGetLastError()); - - - // move the generate values to host - rxmesh_attr.move(RXMESH::DEVICE, RXMESH::HOST); - - // change the layout to SoA - rxmesh_attr.change_layout(RXMESH::HOST); - - // testing - bool suceess = true; - assert((rxmesh_attr.get_allocated() & RXMESH::HOST) == RXMESH::HOST); - num_mesh_elements = rxmesh_attr.get_num_mesh_elements(); - - for (uint32_t i = 0; i < num_mesh_elements; ++i) { - for (uint32_t j = 0; j < attributes_per_element; ++j) { - if (rxmesh_attr(i, j) != i + j) { - suceess = false; - break; - } - } - if 
(!suceess) { - break; - } - } - - // release rxmesh_attribute memory on host and device - rxmesh_attr.release(); - - return suceess; -} - -/*bool test_vector() -{ - using namespace RXMESH; - // mesh attr on host - uint32_t num_mesh_elements = 2048; - RXMESH::RXMeshAttribute rxmesh_attr; - - rxmesh_attr.set_name("vector3f_attr"); - rxmesh_attr.init(num_mesh_elements, 1, RXMESH::HOST, RXMESH::AoS); - - // generate some numbers as AoS - for (uint32_t i = 0; i < num_mesh_elements; ++i) { - auto& vec = rxmesh_attr(i); - vec[0] = i + 0; - vec[1] = i + 1; - vec[2] = i + 2; - } - - // move memory to device - rxmesh_attr.move(RXMESH::HOST, RXMESH::DEVICE); - - - // device success variable - uint32_t* d_success = nullptr; - CUDA_ERROR(cudaMalloc((void**)&d_success, sizeof(uint32_t))); - - - // actual testing - test_vector<<<1, 1>>>(rxmesh_attr, d_success); - - CUDA_ERROR(cudaPeekAtLastError()); - CUDA_ERROR(cudaGetLastError()); - CUDA_ERROR(cudaDeviceSynchronize()); - - // host success variable - uint32_t h_success(0); - CUDA_ERROR(cudaMemcpy(&h_success, d_success, sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - - // free device - GPU_FREE(d_success); - - // release rxmesh_attribute memory on host and device - rxmesh_attr.release(); - - // reporting - return h_success == 1; -}*/ - -bool test_axpy(uint32_t attributes_per_element) -{ - using namespace RXMESH; - - float x_val(1.0), y_val(3.0), alpha_val(5.0), beta_val(7.0); - - uint32_t num_mesh_elements = 2048; - RXMESH::RXMeshAttribute X; - RXMESH::RXMeshAttribute Y; - - X.set_name("X"); - Y.set_name("Y"); - X.init(num_mesh_elements, attributes_per_element, RXMESH::HOST, - RXMESH::AoS); - Y.init(num_mesh_elements, attributes_per_element, RXMESH::HOST, - RXMESH::AoS); - - // generate some numbers as AoS - for (uint32_t i = 0; i < num_mesh_elements; ++i) { - for (uint32_t j = 0; j < attributes_per_element; ++j) { - X(i, j) = x_val; - Y(i, j) = y_val; - } - } - - X.change_layout(RXMESH::HOST); - Y.change_layout(RXMESH::HOST); - 
X.move(RXMESH::HOST, RXMESH::DEVICE); - Y.move(RXMESH::HOST, RXMESH::DEVICE); - - // call axpy - Vector<3, float> alpha(alpha_val); - Vector<3, float> beta(beta_val); - Y.axpy(X, alpha, beta); - - // sync - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaPeekAtLastError()); - CUDA_ERROR(cudaGetLastError()); - - - // move to host (don't need to move X - Y.move(RXMESH::DEVICE, RXMESH::HOST); - - // check results - bool is_passed = true; - for (uint32_t i = 0; i < num_mesh_elements; ++i) { - for (uint32_t j = 0; j < attributes_per_element; ++j) { - if (std::abs(Y(i, j) - (alpha_val * x_val + beta_val * y_val)) > - 0.0001) { - is_passed = false; - break; - } - } - if (!is_passed) { - break; - } - } - - // release rxmesh_attribute memory on host and device - X.release(); - Y.release(); - - - return is_passed; -} - - -bool test_reduce() -{ - using namespace RXMESH; - constexpr uint32_t attributes_per_element = 3; - uint32_t num_mesh_elements = 2048; - RXMESH::RXMeshAttribute X; - - X.set_name("X"); - X.init(num_mesh_elements, attributes_per_element, RXMESH::HOST, - RXMESH::AoS); - - // generate some numbers as AoS - for (uint32_t i = 0; i < num_mesh_elements; ++i) { - for (uint32_t j = 0; j < attributes_per_element; ++j) { - X(i, j) = j + 1; - } - } - - X.change_layout(RXMESH::HOST); - X.move(RXMESH::HOST, RXMESH::DEVICE); - Vector output; - - // call reduce - X.reduce(output, RXMESH::SUM); - - - // sync - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaPeekAtLastError()); - CUDA_ERROR(cudaGetLastError()); - - bool is_passed = true; - - for (uint32_t j = 0; j < attributes_per_element; ++j) { - if (output[j] != num_mesh_elements * (j + 1)) { - is_passed = false; - break; - } - } - - // release rxmesh_attribute memory on host and device - X.release(); - - - return is_passed; -} - - -bool test_norm2() -{ - using namespace RXMESH; - constexpr uint32_t attributes_per_element = 3; - uint32_t num_mesh_elements = 2048; - RXMESH::RXMeshAttribute X; - - 
X.set_name("X"); - X.init(num_mesh_elements, attributes_per_element, RXMESH::HOST, - RXMESH::AoS); - - // generate some numbers as AoS - for (uint32_t i = 0; i < num_mesh_elements; ++i) { - for (uint32_t j = 0; j < attributes_per_element; ++j) { - X(i, j) = 2; - } - } - - X.change_layout(RXMESH::HOST); - X.move(RXMESH::HOST, RXMESH::DEVICE); - Vector output; - - // call reduce - X.reduce(output, RXMESH::NORM2); - - - // sync - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaPeekAtLastError()); - CUDA_ERROR(cudaGetLastError()); - - bool is_passed = true; - - for (uint32_t j = 0; j < attributes_per_element; ++j) { - if (output[j] != 4 * num_mesh_elements) { - is_passed = false; - break; - } - } - - // release rxmesh_attribute memory on host and device - X.release(); - - - return is_passed; -} - - -bool test_dot() -{ - using namespace RXMESH; - constexpr uint32_t attributes_per_element = 3; - uint32_t num_mesh_elements = 2048; - RXMESH::RXMeshAttribute X; - RXMESH::RXMeshAttribute Y; - - X.set_name("X"); - Y.set_name("Y"); - X.init(num_mesh_elements, attributes_per_element, RXMESH::HOST, - RXMESH::AoS); - Y.init(num_mesh_elements, attributes_per_element, RXMESH::HOST, - RXMESH::AoS); - - // generate some numbers as AoS - for (uint32_t i = 0; i < num_mesh_elements; ++i) { - for (uint32_t j = 0; j < attributes_per_element; ++j) { - X(i, j) = 2; - Y(i, j) = 3; - } - } - - X.change_layout(RXMESH::HOST); - X.move(RXMESH::HOST, RXMESH::DEVICE); - Y.change_layout(RXMESH::HOST); - Y.move(RXMESH::HOST, RXMESH::DEVICE); - Vector output; - - // call reduce - X.reduce(output, RXMESH::DOT, &Y); - - - // sync - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaPeekAtLastError()); - CUDA_ERROR(cudaGetLastError()); - - bool is_passed = true; - - for (uint32_t j = 0; j < attributes_per_element; ++j) { - if (output[j] != 6 * num_mesh_elements) { - is_passed = false; - break; - } - } - - // release rxmesh_attribute memory on host and device - X.release(); - Y.release(); - - - 
return is_passed; -} - - -TEST(RXMesh, Attributes) -{ - using namespace RXMESH; - EXPECT_TRUE(test_host(3u)) << " TestAttributes::tes_host failed"; - EXPECT_TRUE(test_device(3u)) << " TestAttributes::tes_device failed"; - // EXPECT_TRUE(test_vector()) << " TestAttributes::test_vector failed"; - EXPECT_TRUE(test_axpy(3u)) << " TestAttributes::test_axpy failed"; - EXPECT_TRUE(test_reduce()) << " TestAttributes::test_reduce failed"; - EXPECT_TRUE(test_norm2()) << " TestAttributes::test_norm2 failed"; - EXPECT_TRUE(test_dot()) << " TestAttributes::test_dot failed"; - - CUDA_ERROR(cudaDeviceSynchronize()); -} \ No newline at end of file diff --git a/tests/RXMesh_test/test_attribute.cuh b/tests/RXMesh_test/test_attribute.cuh new file mode 100644 index 00000000..4804e12e --- /dev/null +++ b/tests/RXMesh_test/test_attribute.cuh @@ -0,0 +1,170 @@ +#include "gtest/gtest.h" +#include "rxmesh/attribute.h" +#include "rxmesh/reduce_handle.h" +#include "rxmesh/util/macros.h" + +template +void populate(rxmesh::RXMeshStatic& rxmesh, + rxmesh::VertexAttribute& v, + T val) +{ + rxmesh.for_each_vertex( + rxmesh::DEVICE, + [v, val] __device__(const rxmesh::VertexHandle vh) { v(vh) = val; }); + + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); +} + + +template +void populate(rxmesh::RXMeshStatic& rxmesh, rxmesh::FaceAttribute& f, T val) +{ + rxmesh.for_each_face( + rxmesh::DEVICE, + [f, val] __device__(const rxmesh::FaceHandle fh) { f(fh) = val; }); + + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); +} + +template +void populate(rxmesh::RXMeshStatic& rxmesh, + rxmesh::VertexAttribute& v1, + rxmesh::VertexAttribute& v2, + T v1_val, + T v2_val) +{ + rxmesh.for_each_vertex( + rxmesh::DEVICE, + [v1, v2, v1_val, v2_val] __device__(const rxmesh::VertexHandle vh) { + v1(vh) = v1_val; + v2(vh) = v2_val; + }); + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); +} + +TEST(Attribute, Norm2) +{ + using namespace rxmesh; + + CUDA_ERROR(cudaDeviceReset()); + + cuda_query(rxmesh_args.device_id, 
rxmesh_args.quite); + + std::vector> Verts; + std::vector> Faces; + + ASSERT_TRUE( + import_obj(STRINGIFY(INPUT_DIR) "sphere3.obj", Verts, Faces, true)); + + RXMeshStatic rxmesh(Faces, rxmesh_args.quite); + + auto attr = rxmesh.add_vertex_attribute("v", 3, rxmesh::DEVICE); + + const float val(2.0); + + populate(rxmesh, *attr, val); + + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + ReduceHandle reduce(*attr); + + float output = reduce.norm2(*attr); + + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + EXPECT_FLOAT_EQ(output, std::sqrt(val * val * rxmesh.get_num_vertices())); +} + + +TEST(Attribute, Dot) +{ + using namespace rxmesh; + + cuda_query(rxmesh_args.device_id, rxmesh_args.quite); + + std::vector> Verts; + std::vector> Faces; + + ASSERT_TRUE( + import_obj(STRINGIFY(INPUT_DIR) "sphere3.obj", Verts, Faces, true)); + + RXMeshStatic rxmesh(Faces, rxmesh_args.quite); + + auto v1_attr = rxmesh.add_vertex_attribute("v1", 3, rxmesh::DEVICE); + auto v2_attr = rxmesh.add_vertex_attribute("v2", 3, rxmesh::DEVICE); + + const float v1_val(2.0); + const float v2_val(3.0); + + populate(rxmesh, *v1_attr, *v2_attr, v1_val, v2_val); + + ReduceHandle reduce(*v1_attr); + + float output = reduce.dot(*v1_attr, *v2_attr); + + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + EXPECT_FLOAT_EQ(output, v1_val * v2_val * rxmesh.get_num_vertices()); +} + + +TEST(Attribute, CopyFrom) +{ + using namespace rxmesh; + + cuda_query(rxmesh_args.device_id, rxmesh_args.quite); + + std::vector> Verts; + std::vector> Faces; + + ASSERT_TRUE( + import_obj(STRINGIFY(INPUT_DIR) "sphere3.obj", Verts, Faces, true)); + + + RXMeshStatic rxmesh(Faces, rxmesh_args.quite); + + auto f_device = rxmesh.add_face_attribute("d", 3, DEVICE); + + auto f_host = rxmesh.add_face_attribute("h", 3, HOST); + + uint32_t val = 99; + + populate(rxmesh, *f_device, val); + + f_host->copy_from(*f_device, DEVICE, HOST); + + rxmesh.for_each_face( + HOST, [&](const FaceHandle fh) { EXPECT_EQ((*f_host)(fh), val); }); 
+} + +TEST(Attribute, AddingAndRemoving) +{ + using namespace rxmesh; + + cuda_query(rxmesh_args.device_id, rxmesh_args.quite); + + std::vector> Verts; + std::vector> Faces; + + ASSERT_TRUE( + import_obj(STRINGIFY(INPUT_DIR) "sphere3.obj", Verts, Faces, true)); + + + RXMeshStatic rxmesh(Faces, rxmesh_args.quite); + + std::string attr_name = "v_attr"; + + auto vertex_attr = + rxmesh.add_vertex_attribute(attr_name, 3, rxmesh::LOCATION_ALL); + + EXPECT_TRUE(rxmesh.does_attribute_exist(attr_name)); + + + vertex_attr->move(rxmesh::HOST, rxmesh::DEVICE); + + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + // this is not neccessary in general but we are just testing the + // functionality here + rxmesh.remove_attribute(attr_name); +} \ No newline at end of file diff --git a/tests/RXMesh_test/test_for_each.h b/tests/RXMesh_test/test_for_each.h new file mode 100644 index 00000000..e097c625 --- /dev/null +++ b/tests/RXMesh_test/test_for_each.h @@ -0,0 +1,37 @@ +#include "gtest/gtest.h" + +#include "rxmesh/util/cuda_query.h" +#include "rxmesh/util/import_obj.h" + +TEST(RXMeshStatic, ForEach) +{ + using namespace rxmesh; + + cuda_query(rxmesh_args.device_id, rxmesh_args.quite); + + std::vector> Verts; + std::vector> Faces; + + ASSERT_TRUE( + import_obj(STRINGIFY(INPUT_DIR) "cube.obj", Verts, Faces, true)); + + + RXMeshStatic rxmesh_static(Faces, rxmesh_args.quite); + + std::atomic_uint32_t num_v = 0; + std::atomic_uint32_t num_e = 0; + std::atomic_uint32_t num_f = 0; + + rxmesh_static.for_each_vertex(HOST, + [&](const VertexHandle vh) { num_v++; }); + + rxmesh_static.for_each_edge(HOST, [&](const EdgeHandle eh) { num_e++; }); + + rxmesh_static.for_each_face(HOST, [&](const FaceHandle fh) { num_f++; }); + + EXPECT_EQ(num_v, rxmesh_static.get_num_vertices()); + + EXPECT_EQ(num_e, rxmesh_static.get_num_edges()); + + EXPECT_EQ(num_f, rxmesh_static.get_num_faces()); +} \ No newline at end of file diff --git a/tests/RXMesh_test/test_higher_queries.h 
b/tests/RXMesh_test/test_higher_queries.h index 7fb94b24..1cc5ffd5 100644 --- a/tests/RXMesh_test/test_higher_queries.h +++ b/tests/RXMesh_test/test_higher_queries.h @@ -1,63 +1,57 @@ #include "gtest/gtest.h" #include "higher_query.cuh" -#include "rxmesh/rxmesh_attribute.h" +#include "rxmesh/attribute.h" #include "rxmesh/rxmesh_static.h" #include "rxmesh/util/import_obj.h" #include "rxmesh_test.h" -using namespace RXMESH; - -TEST(RXMesh, HigherQueries) +TEST(RXMeshStatic, HigherQueries) { + using namespace rxmesh; + // Select device cuda_query(rxmesh_args.device_id, rxmesh_args.quite); + std::vector> Verts; std::vector> Faces; - if (!import_obj(rxmesh_args.obj_file_name, Verts, Faces, - rxmesh_args.quite)) { - exit(EXIT_FAILURE); - } + ASSERT_TRUE(import_obj( + STRINGIFY(INPUT_DIR) "sphere3.obj", Verts, Faces, rxmesh_args.quite)); // RXMesh - RXMeshStatic rxmesh_static(Faces, Verts, false, - rxmesh_args.quite); + RXMeshStatic rxmesh(Faces, rxmesh_args.quite); - uint32_t input_size = rxmesh_static.get_num_vertices(); // input/output container - RXMeshAttribute input_container; - input_container.init(input_size, 1u, RXMESH::DEVICE, RXMESH::AoS, false, - false); + auto input = rxmesh.add_vertex_attribute("input", 1); + input->reset(VertexHandle(), rxmesh::DEVICE); - RXMeshAttribute output_container; - output_container.init(input_size, - input_size, // that is a bit excessive - RXMESH::DEVICE, RXMESH::SoA, false, false); + // we assume that every vertex could store up to num_vertices as its + // neighbor vertices which is a bit excessive + auto output = rxmesh.add_vertex_attribute( + "output", rxmesh.get_num_vertices()); + output->reset(VertexHandle(), rxmesh::DEVICE); // launch box - constexpr uint32_t blockThreads = 512; + constexpr uint32_t blockThreads = 256; LaunchBox launch_box; - rxmesh_static.prepare_launch_box(Op::VV, launch_box, true, false); + rxmesh.prepare_launch_box( + Op::VV, launch_box, (void*)higher_query, false); - output_container.reset(INVALID32, 
RXMESH::DEVICE); - input_container.reset(INVALID32, RXMESH::DEVICE); - ::RXMeshTest tester(true); + RXMeshTest tester(rxmesh, Faces, true); // launch - higher_query + higher_query <<>>( - rxmesh_static.get_context(), input_container, output_container); + rxmesh.get_context(), *input, *output); + + CUDA_ERROR(cudaGetLastError()); + CUDA_ERROR(cudaDeviceSynchronize()); // move containers to the CPU for testing - output_container.move(RXMESH::DEVICE, RXMESH::HOST); - input_container.move(RXMESH::DEVICE, RXMESH::HOST); + output->move(rxmesh::DEVICE, rxmesh::HOST); + input->move(rxmesh::DEVICE, rxmesh::HOST); // verify - EXPECT_TRUE(tester.run_higher_query_verifier(rxmesh_static, input_container, - output_container)); - - - input_container.release(); - output_container.release(); + EXPECT_TRUE(tester.run_test(rxmesh, Faces, *input, *output, true)); } \ No newline at end of file diff --git a/tests/RXMesh_test/test_iterator.cu b/tests/RXMesh_test/test_iterator.cu index 33dae66d..13259624 100644 --- a/tests/RXMesh_test/test_iterator.cu +++ b/tests/RXMesh_test/test_iterator.cu @@ -1,99 +1,105 @@ #include "gtest/gtest.h" -#include "rxmesh/kernels/rxmesh_iterator.cuh" +#include "rxmesh/iterator.cuh" #include "rxmesh/util/util.h" -template -__global__ static void test_iterator(uint32_t* suceess, - uint32_t* ltog_map, - uint16_t* patch_output, - uint32_t num_elements) -{ - using namespace RXMESH; - uint32_t local_id = threadIdx.x; - RXMeshIterator iter(local_id, patch_output, patch_output, ltog_map, - fixedOffset, 0); - - if (iter.local_id() != local_id) { - atomicAdd(suceess, 1u); - return; - } - - if (iter.size() != fixedOffset) { - atomicAdd(suceess, 1u); - return; - } - uint32_t truth = num_elements - threadIdx.x - 1; - if (iter[0] != truth || iter[1] != truth || iter[2] != truth || - iter.back() != truth || iter.front() != truth) { - atomicAdd(suceess, 1u); - return; - } +template +__global__ static void test_iterator(uint32_t* suceess, + const uint16_t* patch_output, + 
const uint32_t num_elements, + const uint32_t offset_size, + const uint32_t patch_id) +{ + using namespace rxmesh; + uint16_t local_id = threadIdx.x; + const HandleT truth(patch_id, {local_id}); + + if (local_id >= num_elements) { + + Iterator iter( + local_id, + reinterpret_cast(patch_output), + nullptr, + offset_size, + patch_id, + num_elements, + nullptr, + nullptr); + + if (iter.size() != offset_size) { + atomicAdd(suceess, 1u); + return; + } + if (iter.front() != truth) { + atomicAdd(suceess, 1u); + return; + } - for (uint32_t i = 0; i < iter.size(); ++i) { - if (*iter != truth) { + if (iter.back() != truth) { atomicAdd(suceess, 1u); return; } - ++iter; + + for (uint32_t i = 0; i < iter.size(); ++i) { + if (iter[i] != truth) { + atomicAdd(suceess, 1u); + return; + } + } + + for (uint32_t i = 0; i < iter.size(); ++i) { + if (*iter != truth) { + atomicAdd(suceess, 1u); + return; + } + ++iter; + } } } TEST(RXMesh, Iterator) { - // patch_output: - // 0 0 0 | 1 1 1 | 2 2 2 | ...... - - // ltog_map: - // n-1 n-2 n-3 ..... 3 2 1 0 - - // and so the patch_output in global index space should be - // n-1 n-1 n-1 | n-2 n-2 n-2 | ...... | 1 1 1 | 0 0 0 + // The patch contains 32 elements and the patch_id is 1 + // and patch_output: + // 0 0 0 | 1 1 1 | 2 2 2 | ...... 
+ // i.e., fixed_offset = 3 + using namespace rxmesh; + constexpr uint32_t offset_size = 3; + const uint32_t num_elements = 32; + const uint32_t patch_id = 1; - using namespace RXMESH; - constexpr uint32_t fixedOffset = 3; - const uint32_t N = 32; - - std::vector h_patch_output(fixedOffset * N); + std::vector h_patch_output(offset_size * num_elements); for (uint32_t i = 0; i < h_patch_output.size(); ++i) { - h_patch_output[i] = i / fixedOffset; - } - - std::vector h_ltog_map(N); - for (uint32_t i = 0; i < h_ltog_map.size(); ++i) { - h_ltog_map[i] = N - i - 1; + h_patch_output[i] = i / offset_size; } - uint32_t *d_ltog_map(nullptr), *d_suceess(nullptr); + uint32_t* d_suceess(nullptr); uint16_t* d_patch_output(nullptr); - CUDA_ERROR( - cudaMalloc((void**)&d_ltog_map, h_ltog_map.size() * sizeof(uint32_t))); CUDA_ERROR(cudaMalloc((void**)&d_patch_output, h_patch_output.size() * sizeof(uint32_t))); - CUDA_ERROR(cudaMemcpy(d_ltog_map, h_ltog_map.data(), - h_ltog_map.size() * sizeof(uint32_t), - cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy(d_patch_output, h_patch_output.data(), + + CUDA_ERROR(cudaMemcpy(d_patch_output, + h_patch_output.data(), h_patch_output.size() * sizeof(uint16_t), cudaMemcpyHostToDevice)); CUDA_ERROR(cudaMalloc((void**)&d_suceess, sizeof(uint32_t))); CUDA_ERROR(cudaMemset(d_suceess, 0, sizeof(uint32_t))); - test_iterator<3u><<<1, N>>>(d_suceess, d_ltog_map, d_patch_output, N); + test_iterator<<<1, num_elements>>>( + d_suceess, d_patch_output, num_elements, offset_size, patch_id); CUDA_ERROR(cudaDeviceSynchronize()); uint32_t h_success = 0; - CUDA_ERROR(cudaMemcpy(&h_success, d_suceess, sizeof(uint32_t), - cudaMemcpyDeviceToHost)); + CUDA_ERROR(cudaMemcpy( + &h_success, d_suceess, sizeof(uint32_t), cudaMemcpyDeviceToHost)); EXPECT_EQ(h_success, 0); CUDA_ERROR(cudaFree(d_patch_output)); CUDA_ERROR(cudaFree(d_suceess)); - CUDA_ERROR(cudaFree(d_ltog_map)); CUDA_ERROR(cudaDeviceSynchronize()); CUDA_ERROR(cudaDeviceReset()); } \ No newline at end of 
file diff --git a/tests/RXMesh_test/test_queries.h b/tests/RXMesh_test/test_queries.h index b37c344b..f2aca2cd 100644 --- a/tests/RXMesh_test/test_queries.h +++ b/tests/RXMesh_test/test_queries.h @@ -1,224 +1,216 @@ +#include +#include #include + #include "gtest/gtest.h" -#include "query.cuh" -#include "rxmesh/rxmesh_attribute.h" + #include "rxmesh/rxmesh_static.h" -#include "rxmesh/rxmesh_util.h" #include "rxmesh/util/import_obj.h" -#include "rxmesh/util/math.h" #include "rxmesh/util/report.h" #include "rxmesh_test.h" -using namespace RXMESH; - -/** - * launcher() - */ -template -float launcher(const RXMeshContext& context, - const Op op, - RXMeshAttribute& input_container, - RXMeshAttribute& output_container, - LaunchBox& launch_box, - const bool oriented = false) -{ - CUDA_ERROR(cudaProfilerStart()); - GPUTimer timer; - timer.start(); - - switch (op) { - case Op::VV: - query - <<>>(context, input_container, - output_container, oriented); - break; - case Op::VE: - query - <<>>(context, input_container, - output_container); - break; - case Op::VF: - query - <<>>(context, input_container, - output_container); - break; - case Op::EV: - query - <<>>(context, input_container, - output_container); - break; - case Op::EE: - RXMESH_ERROR( - "RXMeshStatic::launcher_no_src() Op::EE is not " - "supported!!"); - break; - case Op::EF: - query - <<>>(context, input_container, - output_container); - break; - case Op::FV: - query - <<>>(context, input_container, - output_container); - break; - case Op::FE: - query - <<>>(context, input_container, - output_container); - break; - case Op::FF: - query - <<>>(context, input_container, - output_container); - break; - } - timer.stop(); - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaGetLastError()); - CUDA_ERROR(cudaProfilerStop()); - return timer.elapsed_millis(); -} - -/** - * calc_fixed_offset() - */ -template -inline uint32_t max_output_per_element(const RXMeshStatic& rxmesh, - const Op& op) -{ - if (op == Op::EV) { - 
return 2; - } else if (op == Op::EF) { - return rxmesh.get_max_edge_incident_faces(); - } else if (op == Op::FV || op == Op::FE) { - return rxmesh.get_face_degree(); - } else if (op == Op::FF) { - return rxmesh.get_max_edge_adjacent_faces(); - } else if (op == Op::VV || op == Op::VE || op == Op::VF) { - return rxmesh.get_max_valence(); - } else { - RXMESH_ERROR("calc_fixed_offset() Invalid op " + op_to_string(op)); - return -1u; - } -} +#include "query.cuh" -TEST(RXMesh, Oriented_VV) +TEST(RXMeshStatic, Oriented_VV) { + using namespace rxmesh; // Select device cuda_query(rxmesh_args.device_id, rxmesh_args.quite); + std::vector> Verts; std::vector> Faces; - ASSERT_TRUE(import_obj(STRINGIFY(INPUT_DIR) "cube.obj", Verts, Faces, true)); + ASSERT_TRUE( + import_obj(STRINGIFY(INPUT_DIR) "cube.obj", Verts, Faces, true)); - // Instantiate RXMesh Static - RXMeshStatic rxmesh_static(Faces, Verts, false, rxmesh_args.quite); + // RXMesh + RXMeshStatic rxmesh(Faces, rxmesh_args.quite); - EXPECT_TRUE(rxmesh_static.is_closed()) + EXPECT_TRUE(rxmesh.is_closed()) << " Can't generate oriented VV for input with boundaries"; + auto coordinates = rxmesh.add_vertex_attribute(Verts, "coordinates"); + // input/output container - RXMeshAttribute input_container; - input_container.init(rxmesh_static.get_num_vertices(), 1u, RXMESH::DEVICE, - RXMESH::AoS, false, false); + auto input = rxmesh.add_vertex_attribute("input", 1); + auto output = rxmesh.add_vertex_attribute( + "output", rxmesh.get_max_valence()); - RXMeshAttribute output_container; - output_container.init(rxmesh_static.get_num_vertices(), - max_output_per_element(rxmesh_static, Op::VV) + 1, - RXMESH::DEVICE, RXMESH::SoA, false, false); + input->reset(VertexHandle(), rxmesh::DEVICE); + output->reset(VertexHandle(), rxmesh::DEVICE); // launch box - LaunchBox<256> launch_box; - rxmesh_static.prepare_launch_box(Op::VV, launch_box, false, true); - - // launch query - float tt = launcher(rxmesh_static.get_context(), Op::VV, 
input_container, - output_container, launch_box, true); + constexpr uint32_t blockThreads = 256; + LaunchBox launch_box; + rxmesh.prepare_launch_box( + Op::VV, + launch_box, + (void*)query_kernel, + VertexAttribute>, + true); + + // query + query_kernel + <<>>( + rxmesh.get_context(), *input, *output, true); + CUDA_ERROR(cudaDeviceSynchronize()); // move containers to the CPU for testing - output_container.move(RXMESH::DEVICE, RXMESH::HOST); - input_container.move(RXMESH::DEVICE, RXMESH::HOST); + output->move(rxmesh::DEVICE, rxmesh::HOST); + input->move(rxmesh::DEVICE, rxmesh::HOST); + + RXMeshTest tester(rxmesh, Faces, rxmesh_args.quite); + EXPECT_TRUE(tester.run_test(rxmesh, Faces, *input, *output)); - RXMeshTest tester(true); - EXPECT_TRUE(tester.run_query_verifier(rxmesh_static, Op::VV, - input_container, output_container)); // Make sure orientation is accurate // for the cube, all angle are either 45 or 90 - for (uint32_t v = 0; v < rxmesh_static.get_num_vertices(); ++v) { + auto vector_length = [](const dataT x, const dataT y, const dataT z) { + return std::sqrt(x * x + y * y + z * z); + }; - uint32_t vertex = input_container(v); + auto dot = [](const std::vector& u, const std::vector& v) { + return std::inner_product( + std::begin(u), std::end(u), std::begin(v), 0.0); + }; - uint32_t v_0 = output_container(v, output_container(v, 0)); - for (uint32_t i = 1; i < output_container(v, 0); ++i) { + rxmesh.for_each_vertex(HOST, [&](const VertexHandle& vertex) { + for (uint32_t i = 0; i < (*output).get_num_attributes(); ++i) { - uint32_t v_1 = output_container(v, i); + uint32_t j = (i + 1) % output->get_num_attributes(); - std::vector p1{Verts[vertex][0] - Verts[v_0][0], - Verts[vertex][1] - Verts[v_0][1], - Verts[vertex][2] - Verts[v_0][2]}; + auto v_0 = (*output)(vertex, i); + auto v_1 = (*output)(vertex, j); - std::vector p2{Verts[vertex][0] - Verts[v_1][0], - Verts[vertex][1] - Verts[v_1][1], - Verts[vertex][2] - Verts[v_1][2]}; - dataT dot_pro = dot(p1, p2); - 
dataT theta = - std::acos(dot_pro / (vector_length(p1[0], p1[1], p1[2]) * - vector_length(p2[0], p2[1], p2[2]))); - theta = (theta * 180) / 3.14159265; - EXPECT_TRUE(std::abs(theta - 90) < 0.0001 || - std::abs(theta - 45) < 0.0001); - v_0 = v_1; - } - } + if (v_1.is_valid() && v_0.is_valid()) { + std::vector p1{ + (*coordinates)(vertex, 0) - (*coordinates)(v_0, 0), + (*coordinates)(vertex, 1) - (*coordinates)(v_0, 1), + (*coordinates)(vertex, 2) - (*coordinates)(v_0, 2)}; - input_container.release(); - output_container.release(); -} + std::vector p2{ + (*coordinates)(vertex, 0) - (*coordinates)(v_1, 0), + (*coordinates)(vertex, 1) - (*coordinates)(v_1, 1), + (*coordinates)(vertex, 2) - (*coordinates)(v_1, 2)}; + dataT dot_pro = dot(p1, p2); + dataT theta = + std::acos(dot_pro / (vector_length(p1[0], p1[1], p1[2]) * + vector_length(p2[0], p2[1], p2[2]))); + theta = (theta * 180) / 3.14159265; + EXPECT_TRUE(std::abs(theta - 90) < 0.0001 || + std::abs(theta - 45) < 0.0001); + } + } + }); +} -TEST(RXMesh, Queries) +template +void launcher(const std::vector>& Faces, + rxmesh::RXMeshStatic& rxmesh, + InputAttributeT& input, + OutputAttributeT& output, + RXMeshTest& tester, + rxmesh::Report& report, + bool oriented) { - if (rxmesh_args.shuffle) { - ASSERT_FALSE(rxmesh_args.sort) - << " cannot shuffle and sort at the same time!"; + using namespace rxmesh; + + // launch box + constexpr uint32_t blockThreads = 256; + LaunchBox launch_box; + rxmesh.prepare_launch_box(op, + launch_box, + (void*)query_kernel, + oriented); + + // test data + TestData td; + td.test_name = op_to_string(op); + td.num_threads = launch_box.num_threads; + td.num_blocks = launch_box.blocks; + td.dyn_smem = launch_box.smem_bytes_dyn; + td.static_smem = launch_box.smem_bytes_static; + td.num_reg = launch_box.num_registers_per_thread; + + float total_time = 0; + + + for (uint32_t itr = 0; itr < rxmesh_args.num_run; itr++) { + // Reset input/output + input.reset(InputHandleT(), rxmesh::DEVICE); + 
output.reset(OutputHandleT(), rxmesh::DEVICE); + CUDA_ERROR(cudaDeviceSynchronize()); + + CUDA_ERROR(cudaProfilerStart()); + GPUTimer timer; + timer.start(); + query_kernel + <<>>( + rxmesh.get_context(), input, output, oriented); + + timer.stop(); + CUDA_ERROR(cudaDeviceSynchronize()); + CUDA_ERROR(cudaGetLastError()); + CUDA_ERROR(cudaProfilerStop()); + + total_time += timer.elapsed_millis(); + td.time_ms.push_back(timer.elapsed_millis()); } - if (rxmesh_args.sort) { - ASSERT_FALSE(rxmesh_args.shuffle) - << " cannot shuffle and sort at the same time!"; + + // move containers to the CPU for testing + output.move(rxmesh::DEVICE, rxmesh::HOST); + input.move(rxmesh::DEVICE, rxmesh::HOST); + + // verify + bool passed = tester.run_test(rxmesh, Faces, input, output); + + td.passed.push_back(passed); + EXPECT_TRUE(passed) << "Testing: " << td.test_name; + + report.add_test(td); + if (!rxmesh_args.quite) { + RXMESH_TRACE(" {} {} time = {} (ms)", + td.test_name.c_str(), + (passed ? " passed " : " failed "), + total_time / float(rxmesh_args.num_run)); } +} +TEST(RXMeshStatic, Queries) +{ + using namespace rxmesh; bool oriented = false; // Select device cuda_query(rxmesh_args.device_id, rxmesh_args.quite); + std::vector> Verts; std::vector> Faces; - ASSERT_TRUE(import_obj(rxmesh_args.obj_file_name, Verts, Faces, - rxmesh_args.quite)); - - if (rxmesh_args.shuffle) { - shuffle_obj(Faces, Verts); - } + ASSERT_TRUE( + import_obj(rxmesh_args.obj_file_name, Verts, Faces, rxmesh_args.quite)); // RXMesh - RXMeshStatic rxmesh_static(Faces, Verts, rxmesh_args.sort, - rxmesh_args.quite); + RXMeshStatic rxmesh(Faces, rxmesh_args.quite); // Report @@ -227,108 +219,104 @@ TEST(RXMesh, Queries) report.command_line(rxmesh_args.argc, rxmesh_args.argv); report.device(); report.system(); - report.model_data(rxmesh_args.obj_file_name, rxmesh_static); + report.model_data(rxmesh_args.obj_file_name, rxmesh); report.add_member("method", std::string("RXMesh")); - std::string order = "default"; - if 
(rxmesh_args.shuffle) { - order = "shuffle"; - } else if (rxmesh_args.sort) { - order = "sorted"; - } - report.add_member("input_order", order); // Tester to verify all queries - ::RXMeshTest tester(true); - EXPECT_TRUE(tester.run_ltog_mapping_test(rxmesh_static)) + ::RXMeshTest tester(rxmesh, Faces, rxmesh_args.quite); + EXPECT_TRUE(tester.run_ltog_mapping_test(rxmesh, Faces)) << "Local-to-global mapping test failed"; - // adding query that we want to test - std::vector ops = {Op::VV, Op::VE, Op::VF, // - Op::FV, Op::FE, Op::FF, // - Op::EV, Op::EF}; - - - for (auto& ops_it : ops) { - - // Input and output element type - ELEMENT source_ele(ELEMENT::VERTEX), output_ele(ELEMENT::VERTEX); - io_elements(ops_it, source_ele, output_ele); - - // Input size - uint32_t input_size = - (source_ele == ELEMENT::VERTEX) ? - rxmesh_static.get_num_vertices() : - ((source_ele == ELEMENT::EDGE) ? rxmesh_static.get_num_edges() : - rxmesh_static.get_num_faces()); - - // input/output container - RXMeshAttribute input_container; - input_container.init(input_size, 1u, RXMESH::DEVICE, RXMESH::AoS, false, - false); - - // allocate output container - // for each mesh element, we reserve the maximum possible output based - // on the operation (ops_it). 
The +1 is used to store the size of the - // output for operations that output variable outputs per elements - // (e.g., VV) - RXMeshAttribute output_container; - output_container.init(input_size, - max_output_per_element(rxmesh_static, ops_it) + 1, - RXMESH::DEVICE, RXMESH::SoA, false, false); - - // launch box - LaunchBox<256> launch_box; - rxmesh_static.prepare_launch_box(ops_it, launch_box, false, oriented); - - // test data - TestData td; - td.test_name = op_to_string(ops_it); - td.num_threads = launch_box.num_threads; - td.num_blocks = launch_box.blocks; - td.dyn_smem = launch_box.smem_bytes_dyn; - td.static_smem = launch_box.smem_bytes_static; - - - float total_time = 0; - for (uint32_t itr = 0; itr < rxmesh_args.num_run; itr++) { - - output_container.reset(INVALID32, RXMESH::DEVICE); - input_container.reset(INVALID32, RXMESH::DEVICE); - - // launch query - float tt = - launcher(rxmesh_static.get_context(), ops_it, input_container, - output_container, launch_box, oriented); - total_time += tt; - td.time_ms.push_back(tt); - } + { + // VV + auto input = rxmesh.add_vertex_attribute("input", 1); + auto output = rxmesh.add_vertex_attribute( + "output", rxmesh.get_max_valence()); + launcher( + Faces, rxmesh, *input, *output, tester, report, oriented); + rxmesh.remove_attribute("input"); + rxmesh.remove_attribute("output"); + } - // move containers to the CPU for testing - output_container.move(RXMESH::DEVICE, RXMESH::HOST); - input_container.move(RXMESH::DEVICE, RXMESH::HOST); + { + // VE + auto input = rxmesh.add_vertex_attribute("input", 1); + auto output = rxmesh.add_vertex_attribute( + "output", rxmesh.get_max_valence()); + launcher( + Faces, rxmesh, *input, *output, tester, report, oriented); + rxmesh.remove_attribute("input"); + rxmesh.remove_attribute("output"); + } - // verify - bool passed = tester.run_query_verifier( - rxmesh_static, ops_it, input_container, output_container); + { + // VF + auto input = rxmesh.add_vertex_attribute("input", 1); + auto 
output = rxmesh.add_vertex_attribute( + "output", rxmesh.get_max_valence()); + launcher( + Faces, rxmesh, *input, *output, tester, report, oriented); + rxmesh.remove_attribute("input"); + rxmesh.remove_attribute("output"); + } - td.passed.push_back(passed); - EXPECT_TRUE(passed) << "Testing: " << td.test_name; - report.add_test(td); - if (!rxmesh_args.quite) { - RXMESH_TRACE(" {} {} time = {} (ms)", td.test_name.c_str(), - (passed ? " passed " : " failed "), - total_time / float(rxmesh_args.num_run)); - } + { + // EV + auto input = rxmesh.add_edge_attribute("input", 1); + auto output = rxmesh.add_edge_attribute("output", 2); + launcher( + Faces, rxmesh, *input, *output, tester, report, oriented); + rxmesh.remove_attribute("input"); + rxmesh.remove_attribute("output"); + } - input_container.release(); - output_container.release(); + { + // EF + auto input = rxmesh.add_edge_attribute("input", 1); + auto output = rxmesh.add_edge_attribute( + "output", rxmesh.get_max_edge_incident_faces()); + launcher( + Faces, rxmesh, *input, *output, tester, report, oriented); + rxmesh.remove_attribute("input"); + rxmesh.remove_attribute("output"); } + { + // FV + auto input = rxmesh.add_face_attribute("input", 1); + auto output = rxmesh.add_face_attribute("output", 3); + launcher( + Faces, rxmesh, *input, *output, tester, report, oriented); + rxmesh.remove_attribute("input"); + rxmesh.remove_attribute("output"); + } + + { + // FE + auto input = rxmesh.add_face_attribute("input", 1); + auto output = rxmesh.add_face_attribute("output", 3); + launcher( + Faces, rxmesh, *input, *output, tester, report, oriented); + rxmesh.remove_attribute("input"); + rxmesh.remove_attribute("output"); + } + + { + // FF + auto input = rxmesh.add_face_attribute("input", 1); + auto output = rxmesh.add_face_attribute( + "output", rxmesh.get_max_face_adjacent_faces() + 2); + launcher( + Faces, rxmesh, *input, *output, tester, report, oriented); + rxmesh.remove_attribute("input"); + 
rxmesh.remove_attribute("output"); + } + // Write the report report.write( - rxmesh_args.output_folder + "/rxmesh/" + order, + rxmesh_args.output_folder + "/rxmesh", "QueryTest_RXMesh_" + extract_file_name(rxmesh_args.obj_file_name)); } diff --git a/tests/RXMesh_test/test_util.cu b/tests/RXMesh_test/test_util.cu index 036f0453..b3a143c1 100644 --- a/tests/RXMesh_test/test_util.cu +++ b/tests/RXMesh_test/test_util.cu @@ -5,7 +5,6 @@ #include "rxmesh/util/macros.h" #include "rxmesh/util/util.h" -//********************** Mat transpose kernel template __global__ static void k_test_block_mat_transpose(uint16_t* d_src, const uint32_t num_rows, @@ -13,25 +12,20 @@ __global__ static void k_test_block_mat_transpose(uint16_t* d_src, uint16_t* d_output) { - RXMESH::block_mat_transpose( + rxmesh::block_mat_transpose( num_rows, num_cols, d_src, d_output); } -//************************************************************************** -//********************** block scan inplace kernel template __global__ static void k_test_block_exclusive_sum(T* d_src, const uint32_t size) { - RXMESH::cub_block_exclusive_sum(d_src, size); + rxmesh::cub_block_exclusive_sum(d_src, size); } -//************************************************************************** - -//********************** atomicAdd kernel template __global__ static void k_test_atomicAdd(T* d_val) { - RXMESH::atomicAdd(d_val, 1); + rxmesh::atomicAdd(d_val, 1); /*__half* as_half = (__half*)(d_val); ::atomicAdd(as_half,1); __syncthreads(); @@ -40,238 +34,198 @@ __global__ static void k_test_atomicAdd(T* d_val) d_val[0] = val; }*/ } -//************************************************************************** -class TestUtil +TEST(Util, Scan) { + using namespace rxmesh; - public: - TestUtil(){}; + constexpr uint32_t blockThreads = 128; + uint32_t size = 8144; + std::vector h_src(size, 1); + uint32_t* d_src = nullptr; + CUDA_ERROR(cudaMalloc((void**)&d_src, size * sizeof(uint32_t))); + CUDA_ERROR(cudaMemcpy( + d_src, 
h_src.data(), size * sizeof(uint32_t), cudaMemcpyHostToDevice)); - void test_all() - { - test_scan(); + k_test_block_exclusive_sum + <<<1, blockThreads>>>(d_src, size); - test_block_mat_transpose<542, 847, 3>(); + CUDA_ERROR(cudaDeviceSynchronize()); + CUDA_ERROR(cudaGetLastError()); - test_atomicAdd(); + CUDA_ERROR(cudaMemcpy( + h_src.data(), d_src, size * sizeof(uint32_t), cudaMemcpyDeviceToHost)); - test_atomicAdd(); + for (uint32_t i = 0; i < h_src.size(); ++i) { + EXPECT_EQ(h_src[i], i); } + GPU_FREE(d_src); +} - bool test_scan() - { - using namespace RXMESH; +template +bool test_atomicAdd(const uint32_t threads = 1024) +{ + using namespace rxmesh; - constexpr uint32_t blockThreads = 128; - uint32_t size = 8144; - std::vector h_src(size, 1); - uint32_t* d_src = nullptr; - CUDA_ERROR(cudaMalloc((void**)&d_src, size * sizeof(uint32_t))); - CUDA_ERROR(cudaMemcpy(d_src, h_src.data(), size * sizeof(uint32_t), - cudaMemcpyHostToDevice)); + T h_val = 0; + T* d_val; - k_test_block_exclusive_sum - <<<1, blockThreads>>>(d_src, size); + CUDA_ERROR(cudaMalloc((void**)&d_val, sizeof(T))); + CUDA_ERROR(cudaMemcpy(d_val, &h_val, sizeof(T), cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaGetLastError()); - CUDA_ERROR(cudaMemcpy(h_src.data(), d_src, size * sizeof(uint32_t), - cudaMemcpyDeviceToHost)); - bool passed = true; - for (uint32_t i = 0; i < h_src.size(); ++i) { - if (h_src[i] != i) { - passed = false; - break; - } - } + k_test_atomicAdd<<<1, threads>>>(d_val); - GPU_FREE(d_src); + CUDA_ERROR(cudaDeviceSynchronize()); + CUDA_ERROR(cudaGetLastError()); - return passed; - } - //************************************************************************** - - - //********************** Test matrix transpose - template - bool test_block_mat_transpose() - { - using namespace RXMESH; - // The matrix is numRows X numCols where every rows has rowOffset - // non-zero elements. 
The matrix passed to the kernel contains the - // column ids only and we also pass the rowOffset. The transposed matrix - // is stored in the source as row ids and the offset is stored in the - // h_res_offset. - - const uint32_t arr_size = numRows * rowOffset; - std::vector h_src(arr_size); - std::vector row(numCols); - fill_with_sequential_numbers(row.data(), - static_cast(row.size())); - random_shuffle(row.data(), static_cast(row.size())); + CUDA_ERROR(cudaMemcpy(&h_val, d_val, sizeof(T), cudaMemcpyDeviceToHost)); - for (uint32_t s = 0; s < h_src.size(); s += rowOffset) { - // prevent duplication in the same row - for (uint32_t i = 0; i < rowOffset; ++i) { - h_src[s + i] = row[i]; - } - random_shuffle(row.data(), static_cast(row.size())); - } + // check + bool passed = true; + if (h_val != static_cast(threads)) { + passed = false; + } + GPU_FREE(d_val); - // const uint32_t threads = numRows*rowOffset; - // We try to divide the number of non-zero elements equally between - // threads. However, it may not aligned perfectly. So we need to pad - // h_src with INVALID32 since this will be part of the sorting in - // the transpose kernel. Also, d_offset should be large enough to - // align with the padding. - - const uint32_t threads = 256; - const uint32_t item_per_thread = - DIVIDE_UP(numRows * rowOffset, threads); - const uint32_t blocks = 1; + return passed; +} +TEST(Util, AtomicAdd) +{ + EXPECT_TRUE(test_atomicAdd()) << "uint16_t failed"; + EXPECT_TRUE(test_atomicAdd()) << "uint8_t failed"; +} - if (item_per_thread * threads > numRows * rowOffset) { - for (uint32_t i = numRows * rowOffset; - i < item_per_thread * threads; ++i) { - h_src.push_back(INVALID16); - } +TEST(Util, BlockMatrixTranspose) +{ + constexpr uint32_t numRows = 542; + constexpr uint32_t numCols = 847; + constexpr uint32_t rowOffset = 3; + + using namespace rxmesh; + // The matrix is numRows X numCols where every rows has rowOffset + // non-zero elements. 
The matrix passed to the kernel contains the + // column ids only and we also pass the rowOffset. The transposed matrix + // is stored in the source as row ids and the offset is stored in the + // h_res_offset. + + const uint32_t arr_size = numRows * rowOffset; + std::vector h_src(arr_size); + std::vector row(numCols); + fill_with_sequential_numbers(row.data(), static_cast(row.size())); + random_shuffle(row.data(), static_cast(row.size())); + + for (uint32_t s = 0; s < h_src.size(); s += rowOffset) { + // prevent duplication in the same row + for (uint32_t i = 0; i < rowOffset; ++i) { + h_src[s + i] = row[i]; } + random_shuffle(row.data(), static_cast(row.size())); + } - uint16_t *d_src, *d_offset; - CUDA_ERROR(cudaMalloc((void**)&d_src, h_src.size() * sizeof(uint16_t))); - CUDA_ERROR( - cudaMalloc((void**)&d_offset, h_src.size() * sizeof(uint16_t))); - CUDA_ERROR(cudaMemcpy(d_src, h_src.data(), - h_src.size() * sizeof(uint16_t), - cudaMemcpyHostToDevice)); - - - k_test_block_mat_transpose - <<>>( - d_src, numRows, numCols, d_offset); - - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaGetLastError()); - std::vector h_res(arr_size); - std::vector h_res_offset(numCols); + // const uint32_t threads = numRows*rowOffset; + // We try to divide the number of non-zero elements equally between + // threads. However, it may not aligned perfectly. So we need to pad + // h_src with INVALID32 since this will be part of the sorting in + // the transpose kernel. Also, d_offset should be large enough to + // align with the padding. 
- CUDA_ERROR(cudaMemcpy(h_res.data(), d_offset, - arr_size * sizeof(uint16_t), - cudaMemcpyDeviceToHost)); + const uint32_t threads = 256; + const uint32_t item_per_thread = DIVIDE_UP(numRows * rowOffset, threads); + const uint32_t blocks = 1; - CUDA_ERROR(cudaMemcpy(h_res_offset.data(), d_src, - numCols * sizeof(uint16_t), - cudaMemcpyDeviceToHost)); - std::vector gold_res(arr_size); - std::vector gold_res_offset(arr_size); - std::fill_n(gold_res_offset.data(), numCols, 0); - std::fill_n(gold_res.data(), numRows * rowOffset, INVALID16); - // count - for (uint32_t i = 0; i < arr_size; ++i) { - gold_res_offset[h_src[i]]++; - } - // offset - uint32_t prv = gold_res_offset[0]; - gold_res_offset[0] = 0; - for (uint32_t i = 1; i < numCols; ++i) { - uint16_t cur = gold_res_offset[i]; - gold_res_offset[i] = gold_res_offset[i - 1] + prv; - prv = cur; - } - // fill in - for (uint32_t i = 0; i < arr_size; ++i) { - uint16_t col = h_src[i]; - uint32_t row = i / rowOffset; - uint16_t start = gold_res_offset[col]; - uint16_t end = (col == numCols - 1) ? numRows * rowOffset : - gold_res_offset[col + 1]; - for (uint32_t j = start; j < end; ++j) { - if (gold_res[j] == INVALID16) { - gold_res[j] = row; - break; - } - } + if (item_per_thread * threads > numRows * rowOffset) { + for (uint32_t i = numRows * rowOffset; i < item_per_thread * threads; + ++i) { + h_src.push_back(INVALID16); } + } + uint16_t *d_src, *d_offset; + CUDA_ERROR(cudaMalloc((void**)&d_src, h_src.size() * sizeof(uint16_t))); + CUDA_ERROR(cudaMalloc((void**)&d_offset, h_src.size() * sizeof(uint16_t))); + CUDA_ERROR(cudaMemcpy(d_src, + h_src.data(), + h_src.size() * sizeof(uint16_t), + cudaMemcpyHostToDevice)); - for (uint32_t i = 0; i < numCols; ++i) { - uint32_t start = h_res_offset[i]; - uint32_t end = - (i == numCols - 1) ? 
numRows * rowOffset : h_res_offset[i + 1]; - std::sort(h_res.data() + start, h_res.data() + end); - } + k_test_block_mat_transpose + <<>>( + d_src, numRows, numCols, d_offset); - // compare - bool passed = true; - if (!compare(h_res.data(), gold_res.data(), - arr_size, false) || - !compare( - h_res_offset.data(), gold_res_offset.data(), numCols, false)) { - passed = false; + CUDA_ERROR(cudaDeviceSynchronize()); + CUDA_ERROR(cudaGetLastError()); + + std::vector h_res(arr_size); + std::vector h_res_offset(numCols); + + CUDA_ERROR(cudaMemcpy(h_res.data(), + d_offset, + arr_size * sizeof(uint16_t), + cudaMemcpyDeviceToHost)); + + CUDA_ERROR(cudaMemcpy(h_res_offset.data(), + d_src, + numCols * sizeof(uint16_t), + cudaMemcpyDeviceToHost)); + + std::vector gold_res(arr_size); + std::vector gold_res_offset(arr_size); + std::fill_n(gold_res_offset.data(), numCols, 0); + std::fill_n(gold_res.data(), numRows * rowOffset, INVALID16); + // count + for (uint32_t i = 0; i < arr_size; ++i) { + gold_res_offset[h_src[i]]++; + } + // offset + uint32_t prv = gold_res_offset[0]; + gold_res_offset[0] = 0; + for (uint32_t i = 1; i < numCols; ++i) { + uint16_t cur = gold_res_offset[i]; + gold_res_offset[i] = gold_res_offset[i - 1] + prv; + prv = cur; + } + // fill in + for (uint32_t i = 0; i < arr_size; ++i) { + uint16_t col = h_src[i]; + uint32_t row = i / rowOffset; + uint16_t start = gold_res_offset[col]; + uint16_t end = (col == numCols - 1) ? 
numRows * rowOffset : + gold_res_offset[col + 1]; + for (uint32_t j = start; j < end; ++j) { + if (gold_res[j] == INVALID16) { + gold_res[j] = row; + break; + } } - - GPU_FREE(d_src); - GPU_FREE(d_offset); - - return passed; } - //************************************************************************** - - template - bool test_atomicAdd(const uint32_t threads = 1024) - { - using namespace RXMESH; - T h_val = 0; - T* d_val; - - CUDA_ERROR(cudaMalloc((void**)&d_val, sizeof(T))); - CUDA_ERROR( - cudaMemcpy(d_val, &h_val, sizeof(T), cudaMemcpyHostToDevice)); - - - k_test_atomicAdd<<<1, threads>>>(d_val); - - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaGetLastError()); - - CUDA_ERROR( - cudaMemcpy(&h_val, d_val, sizeof(T), cudaMemcpyDeviceToHost)); - - - // check - bool passed = true; - if (h_val != static_cast(threads)) { - passed = false; - } - GPU_FREE(d_val); - - return passed; + for (uint32_t i = 0; i < numCols; ++i) { + uint32_t start = h_res_offset[i]; + uint32_t end = + (i == numCols - 1) ? 
numRows * rowOffset : h_res_offset[i + 1]; + std::sort(h_res.data() + start, h_res.data() + end); } - ~TestUtil(){}; -}; -TEST(RXMesh, Util) -{ - using namespace RXMESH; - TestUtil tc; + // compare + bool passed = true; + if (!compare( + h_res.data(), gold_res.data(), arr_size, false) || + !compare( + h_res_offset.data(), gold_res_offset.data(), numCols, false)) { + passed = false; + } - EXPECT_TRUE(tc.test_scan()); - bool mat_trans = tc.template test_block_mat_transpose<542, 847, 3>(); - EXPECT_TRUE(mat_trans); - EXPECT_TRUE(tc.test_atomicAdd()); - EXPECT_TRUE(tc.test_atomicAdd()); + GPU_FREE(d_src); + GPU_FREE(d_offset); - CUDA_ERROR(cudaDeviceSynchronize()); - CUDA_ERROR(cudaDeviceReset()); + EXPECT_TRUE(passed); } \ No newline at end of file diff --git a/tests/RXMesh_test/test_vector.cu b/tests/RXMesh_test/test_vector.cu index c39c6719..42322431 100644 --- a/tests/RXMesh_test/test_vector.cu +++ b/tests/RXMesh_test/test_vector.cu @@ -1,9 +1,9 @@ #include "gtest/gtest.h" #include "rxmesh/util/vector.h" -TEST(RXMESH, Vector) +TEST(RXMesh, Vector) { - using namespace RXMESH; + using namespace rxmesh; // constrctors Vector3f v0(0.5f);