diff --git a/.clang-format b/.clang-format
index 45af25e2..dd42a951 100644
--- a/.clang-format
+++ b/.clang-format
@@ -4,14 +4,16 @@ BasedOnStyle:  Chromium
 TabWidth: 4
 UseTab: Never
 
+AlignConsecutiveAssignments: true
 AllowShortFunctionsOnASingleLine: false
 AllowShortIfStatementsOnASingleLine: false
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakTemplateDeclarations: true
 AlignTrailingComments: true
+BinPackArguments: false
 BinPackParameters: false
 BreakBeforeTernaryOperators: false
-ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
 Cpp11BracedListStyle: true
 IndentCaseLabels: true
 IndentWidth:     4
diff --git a/.github/workflows/Ubuntu.yml b/.github/workflows/Ubuntu.yml
index dfa4823c..c618db96 100644
--- a/.github/workflows/Ubuntu.yml
+++ b/.github/workflows/Ubuntu.yml
@@ -1,5 +1,5 @@
 name: Ubuntu
-on: [push, pull_request]
+on: [push, pull_request, workflow_dispatch]
 jobs:
   UbuntuRun:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/Windows.yml b/.github/workflows/Windows.yml
index 4ea07595..575fbc68 100644
--- a/.github/workflows/Windows.yml
+++ b/.github/workflows/Windows.yml
@@ -1,5 +1,5 @@
 name: Windows
-on: [push, pull_request]
+on: [push, pull_request, workflow_dispatch]
 jobs:
   WindowsRun:
     runs-on: windows-latest
diff --git a/.gitignore b/.gitignore
index cda600a3..6b9aad91 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 output/
+input/
 build/
 include/rxmesh/util/git_sha1.cpp
 .vscode/
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f41a40e9..ea45e707 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,11 +1,11 @@
-cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 
 if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
   cmake_policy(SET CMP0104 OLD)
 endif()
 
 project(RXMesh 
-        VERSION 0.1.0 
+        VERSION 0.2.0 
         LANGUAGES C CXX CUDA)
 
 set(CMAKE_CXX_STANDARD 17)
@@ -86,15 +86,16 @@ target_sources(RXMesh_header_lib
 
 # CUDA and C++ compiler flags
 set(cxx_flags 
-	$<$<CXX_COMPILER_ID:MSVC>:-D_SCL_SECURE_NO_WARNINGS /openmp /std:c++17> #Add MSVC-specific compiler flags here
-	$<$<CXX_COMPILER_ID:GNU>:-Wall -m64 -fopenmp -O3 -std=c++17>            #Add GCC/Clang-specific compiler flags here
+	$<$<CXX_COMPILER_ID:MSVC>:-D_SCL_SECURE_NO_WARNINGS /openmp /std:c++17>           #Add MSVC-specific compiler flags here
+	$<$<CXX_COMPILER_ID:GNU>:-Wall -m64 -fopenmp -O3 -std=c++17 -Wno-unused-function> #Add GCC/Clang-specific compiler flags here
 	)
 set(cuda_flags
-    -Xcompiler=$<$<CXX_COMPILER_ID:GNU>:-Wall -fopenmp -O3>
+    -Xcompiler=$<$<CXX_COMPILER_ID:GNU>:-Wall -fopenmp -O3 -Wno-unused-function>
     #Disables warning
     #177-D "function XXX was declared but never referenced"
     -Xcudafe "--display_error_number --diag_suppress=177"
     ${CUDA_ARCHS}
+    -rdc=true
 	-lineinfo	
 	--expt-extended-lambda	
 	-use_fast_math	
@@ -111,7 +112,6 @@ target_compile_options(developer_flags INTERFACE
     $<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>
 )
 
-
 target_link_libraries(RXMesh_header_lib INTERFACE $<BUILD_INTERFACE:developer_flags>)
 
 #OpenMP
diff --git a/LICENSE b/LICENSE
index e1826355..ed791707 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 BSD 2-Clause License
 
-Copyright (c) 2021, owensgroup
+Copyright (c) 2022, owensgroup
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/apps/Filtering/CMakeLists.txt b/apps/Filtering/CMakeLists.txt
index 57b2309f..8090e5d3 100644
--- a/apps/Filtering/CMakeLists.txt
+++ b/apps/Filtering/CMakeLists.txt
@@ -27,6 +27,8 @@ endif()
 
 set_target_properties(Filtering PROPERTIES FOLDER "apps")
 
+set_property(TARGET Filtering PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+
 source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "Filtering" FILES ${SOURCE_LIST})
 
 target_link_libraries(Filtering 
diff --git a/apps/Filtering/benchmark.sh b/apps/Filtering/benchmark.sh
index 2cb5fcb9..cb437324 100644
--- a/apps/Filtering/benchmark.sh
+++ b/apps/Filtering/benchmark.sh
@@ -1,5 +1,4 @@
 #!/bin/bash
-echo "This script re-generates RXMesh data in Figure 8(c) in the paper."
 echo "Please make sure to first compile the source code and then enter the input OBJ files directory."
 read -p "OBJ files directory (no trailing slash): " input_dir
 
@@ -16,7 +15,7 @@ device_id=0
 
 for file in $input_dir/*.obj; do 	 
     if [ -f "$file" ]; then
-		echo $exe -p -input "$file" -num_filter_iter 5 -device_id $device_id
-             $exe -p -input "$file" -num_filter_iter 5 -device_id $device_id
+		echo $exe -input "$file" -num_filter_iter 5 -device_id $device_id
+         $exe -input "$file" -num_filter_iter 5 -device_id $device_id
     fi 
 done
\ No newline at end of file
diff --git a/apps/Filtering/filtering.cu b/apps/Filtering/filtering.cu
index 5e016041..3a85ee2a 100644
--- a/apps/Filtering/filtering.cu
+++ b/apps/Filtering/filtering.cu
@@ -13,15 +13,12 @@
 
 struct arg
 {
-    std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj";
-    std::string output_folder = STRINGIFY(OUTPUT_DIR);
-    uint32_t    device_id = 0;
+    std::string obj_file_name   = STRINGIFY(INPUT_DIR) "sphere3.obj";
+    std::string output_folder   = STRINGIFY(OUTPUT_DIR);
+    uint32_t    device_id       = 0;
     uint32_t    num_filter_iter = 5;
     char**      argv;
     int         argc;
-    bool        shuffle = false;
-    bool        sort = false;
-
 } Arg;
 
 #include "filtering_openmesh.h"
@@ -29,18 +26,9 @@ struct arg
 
 TEST(App, Filtering)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     using dataT = float;
 
-
-    if (Arg.shuffle) {
-        ASSERT_FALSE(Arg.sort) << " cannot shuffle and sort at the same time!";
-    }
-    if (Arg.sort) {
-        ASSERT_FALSE(Arg.shuffle)
-            << " cannot shuffle and sort at the same time!";
-    }
-
     // Select device
     cuda_query(Arg.device_id);
 
@@ -50,42 +38,27 @@ TEST(App, Filtering)
     std::vector<std::vector<dataT>>    Verts;
     ASSERT_TRUE(import_obj(Arg.obj_file_name, Verts, Faces));
 
-    if (Arg.shuffle) {
-        shuffle_obj(Faces, Verts);
-    }
 
-    // Create RXMeshStatic instance. If Arg.sort is true, Faces and Verts will
-    // be sorted based on the patching happening inside RXMesh
-    RXMeshStatic<PATCH_SIZE> rxmesh_static(Faces, Verts, Arg.sort, false);
+    TriMesh input_mesh;
+    ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name));
 
+    ASSERT_EQ(input_mesh.n_vertices(), Verts.size());
 
-    // Since OpenMesh only accepts input as obj files, if the input mesh is
-    // shuffled or sorted, we have to write it to a temp file so that OpenMesh
-    // can pick it up
-    TriMesh input_mesh;
-    if (Arg.sort || Arg.shuffle) {
-        export_obj(Faces, Verts, "temp.obj", false);
-        ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, "temp.obj"));
-    } else {
-        ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name));
-    }
+    // OpenMesh Impl
+    std::vector<std::vector<dataT>> ground_truth(Verts);
+    size_t                          max_neighbour_size = 0;
+    filtering_openmesh(
+        omp_get_max_threads(), input_mesh, ground_truth, max_neighbour_size);
 
-    //*** OpenMesh Impl
-    RXMESH::RXMeshAttribute<dataT> ground_truth;
-    size_t max_neighbour_size = 0;
-    filtering_openmesh(omp_get_max_threads(), input_mesh, ground_truth, max_neighbour_size);
-    
 
-    //*** RXMesh Impl
-    filtering_rxmesh(rxmesh_static, Verts, ground_truth, max_neighbour_size);
+    // RXMesh Impl
+    filtering_rxmesh(Faces, Verts, ground_truth, max_neighbour_size);
 
-    // Release allocation
-    ground_truth.release();
 }
 
 int main(int argc, char** argv)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     Log::init();
 
     ::testing::InitGoogleTest(&argc, argv);
@@ -101,9 +74,7 @@ int main(int argc, char** argv)
                         "                    Default is {} \n"
                         "                    Hint: Only accepts OBJ files\n"
                         " -o:                JSON file output folder. Default is {} \n"
-                        " -num_filter_iter:  Iteration count. Default is {} \n"                        
-                        " -s:                Shuffle input. Default is false.\n"
-                        " -p:                Sort input using patching output. Default is false.\n"
+                        " -num_filter_iter:  Iteration count. Default is {} \n"
                         " -device_id:        GPU device ID. Default is {}",
              Arg.obj_file_name, Arg.output_folder ,Arg.num_filter_iter ,Arg.device_id);
             // clang-format on
@@ -123,12 +94,6 @@ int main(int argc, char** argv)
             Arg.output_folder =
                 std::string(get_cmd_option(argv, argv + argc, "-o"));
         }
-        if (cmd_option_exists(argv, argc + argv, "-s")) {
-            Arg.shuffle = true;
-        }
-        if (cmd_option_exists(argv, argc + argv, "-p")) {
-            Arg.sort = true;
-        }
         if (cmd_option_exists(argv, argc + argv, "-device_id")) {
             Arg.device_id =
                 atoi(get_cmd_option(argv, argv + argc, "-device_id"));
diff --git a/apps/Filtering/filtering_openmesh.h b/apps/Filtering/filtering_openmesh.h
index dae4481e..5540ace5 100644
--- a/apps/Filtering/filtering_openmesh.h
+++ b/apps/Filtering/filtering_openmesh.h
@@ -4,7 +4,6 @@
 #include <queue>
 #include "../common/openmesh_report.h"
 #include "../common/openmesh_trimesh.h"
-#include "rxmesh/rxmesh_attribute.h"
 
 /**
  *computeSigma_s()
@@ -17,19 +16,19 @@ double computeSigma_s(
 {
 
 
-    float  offset = 0;
-    float  sum = 0;
+    float  offset  = 0;
+    float  sum     = 0;
     float  sum_sqs = 0;
-    size_t count = vertex_neighbour.size();
+    size_t count   = vertex_neighbour.size();
     for (size_t i = 0; i < count; ++i) {
         TriMesh::Point pj = mesh.point(vertex_neighbour[i]);
-        float          t = (pj - pi) | ni;
-        t = sqrt(t * t);
+        float          t  = (pj - pi) | ni;
+        t                 = sqrt(t * t);
         sum += t;
         sum_sqs += t * t;
     }
     float c = static_cast<float>(count);
-    offset = (sum_sqs / c) - ((sum * sum) / (c * c));
+    offset  = (sum_sqs / c) - ((sum * sum) / (c * c));
 
     float sigma_s =
         (sqrt(offset) < 1.0e-12) ? (sqrt(offset) + 1.0e-12) : sqrt(offset);
@@ -50,17 +49,18 @@ void getAdaptiveVertexNeighbor(
     mark[vh.idx()] = true;
     queue_vertex_handle.push(vh);
     float          radius = 2.0 * sigma_c;
-    TriMesh::Point ci = mesh.point(vh);
+    TriMesh::Point ci     = mesh.point(vh);
 
     while (!queue_vertex_handle.empty()) {
         TriMesh::VertexHandle vh = queue_vertex_handle.front();
         vertex_neighbor.push_back(vh);
         queue_vertex_handle.pop();
         for (TriMesh::VertexVertexIter vv_it = mesh.vv_iter(vh);
-             vv_it.is_valid(); ++vv_it) {
+             vv_it.is_valid();
+             ++vv_it) {
             TriMesh::VertexHandle vh_neighbor = *vv_it;
             if (mark[vh_neighbor.idx()] == false) {
-                TriMesh::Point cj = mesh.point(vh_neighbor);
+                TriMesh::Point cj     = mesh.point(vh_neighbor);
                 float          length = (cj - ci).length();
                 if (length <= radius)
                     queue_vertex_handle.push(vh_neighbor);
@@ -71,10 +71,10 @@ void getAdaptiveVertexNeighbor(
 }
 
 template <typename T>
-void filtering_openmesh(const int                   num_omp_threads,
-                        TriMesh&                    input_mesh,
-                        RXMESH::RXMeshAttribute<T>& filtered_coord,
-                        size_t&                     max_neighbour_size)
+void filtering_openmesh(const int                    num_omp_threads,
+                        TriMesh&                     input_mesh,
+                        std::vector<std::vector<T>>& filtered_coord,
+                        size_t&                      max_neighbour_size)
 {
     // Report
     OpenMeshReport report("Filtering_OpenMesh");
@@ -84,18 +84,8 @@ void filtering_openmesh(const int                   num_omp_threads,
     std::string method =
         "OpenMesh " + std::to_string(num_omp_threads) + " Core";
     report.add_member("method", method);
-    std::string order = "default";
-    if (Arg.shuffle) {
-        order = "shuffle";
-    } else if (Arg.sort) {
-        order = "sorted";
-    }
-    report.add_member("input_order", order);
     report.add_member("num_filter_iter", Arg.num_filter_iter);
 
-    // Allocate space for the filtered output coordinates
-    filtered_coord.init(input_mesh.n_vertices(), 3u, RXMESH::HOST);
-    filtered_coord.reset(0.0, RXMESH::HOST);
 
     // this is where each thread will store its neighbour vertices
     // we allocate enough space such that each thread can store as much
@@ -109,7 +99,7 @@ void filtering_openmesh(const int                   num_omp_threads,
 
     max_neighbour_size = 0;
 
-    RXMESH::CPUTimer timer;
+    rxmesh::CPUTimer timer;
     timer.start();
 
     for (uint32_t itr = 0; itr < Arg.num_filter_iter; ++itr) {
@@ -127,12 +117,13 @@ void filtering_openmesh(const int                   num_omp_threads,
             int tid = omp_get_thread_num();
 
             // calculate sigma_c
-            TriMesh::Point  pi = input_mesh.point(*v_it);
-            TriMesh::Normal ni = input_mesh.normal(*v_it);
+            TriMesh::Point  pi      = input_mesh.point(*v_it);
+            TriMesh::Normal ni      = input_mesh.normal(*v_it);
             float           sigma_c = 1e10;
             for (TriMesh::VertexVertexIter vv_it = input_mesh.vv_iter(*v_it);
-                 vv_it.is_valid(); vv_it++) {
-                TriMesh::Point pj = input_mesh.point(*vv_it);
+                 vv_it.is_valid();
+                 vv_it++) {
+                TriMesh::Point pj     = input_mesh.point(*vv_it);
                 float          length = (pi - pj).length();
                 if (length < sigma_c) {
                     sigma_c = length;
@@ -141,8 +132,8 @@ void filtering_openmesh(const int                   num_omp_threads,
 
             // get the neighbor vertices
             vertex_neighbour[tid].clear();
-            getAdaptiveVertexNeighbor(input_mesh, *v_it, sigma_c,
-                                      vertex_neighbour[tid]);
+            getAdaptiveVertexNeighbor(
+                input_mesh, *v_it, sigma_c, vertex_neighbour[tid]);
 
             max_neighbour_size =
                 max(max_neighbour_size, vertex_neighbour[tid].size());
@@ -150,24 +141,24 @@ void filtering_openmesh(const int                   num_omp_threads,
             float sigma_s =
                 computeSigma_s(vertex_neighbour[tid], input_mesh, pi, ni);
 
-            float sum = 0;
+            float sum        = 0;
             float normalizer = 0;
 
             // calculate new vertex position
             for (int iv = 0; iv < (int)vertex_neighbour[tid].size(); iv++) {
                 TriMesh::Point pj = input_mesh.point(vertex_neighbour[tid][iv]);
 
-                float t = (pi - pj).length();
-                float h = (pj - pi) | ni;
+                float t  = (pi - pj).length();
+                float h  = (pj - pi) | ni;
                 float wc = std::exp(-0.5 * t * t / (sigma_c * sigma_c));
                 float ws = std::exp(-0.5 * h * h / (sigma_s * sigma_s));
                 sum += wc * ws * h;
                 normalizer += wc * ws;
             }
-            auto updated_point = pi + ni * (sum / normalizer);
-            filtered_coord(vert, 0) = updated_point[0];
-            filtered_coord(vert, 1) = updated_point[1];
-            filtered_coord(vert, 2) = updated_point[2];
+            auto updated_point      = pi + ni * (sum / normalizer);
+            filtered_coord[vert][0] = updated_point[0];
+            filtered_coord[vert][1] = updated_point[1];
+            filtered_coord[vert][2] = updated_point[2];
         }
 
         // update the mesh for the next iterations (needed to update the
@@ -176,9 +167,9 @@ void filtering_openmesh(const int                   num_omp_threads,
         for (int vert = 0; vert < num_vertrices; vert++) {
             TriMesh::VertexIter v_it = input_mesh.vertices_begin() + vert;
             TriMesh::Point      p;
-            p[0] = filtered_coord(vert, 0);
-            p[1] = filtered_coord(vert, 1);
-            p[2] = filtered_coord(vert, 2);
+            p[0] = filtered_coord[vert][0];
+            p[1] = filtered_coord[vert][1];
+            p[2] = filtered_coord[vert][2];
             input_mesh.set_point(*v_it, p);
         }
     }
@@ -202,13 +193,13 @@ void filtering_openmesh(const int                   num_omp_threads,
 
     // Finalize report
     report.add_member("total_time (ms)", timer.elapsed_millis());
-    RXMESH::TestData td;
-    td.test_name = "MCF";
+    rxmesh::TestData td;
+    td.test_name   = "MCF";
     td.num_threads = num_omp_threads;
     td.time_ms.push_back(timer.elapsed_millis());
     td.passed.push_back(true);
     report.add_test(td);
     report.write(
         Arg.output_folder + "/openmesh",
-        "MCF_OpenMesh_" + RXMESH::extract_file_name(Arg.obj_file_name));
+        "MCF_OpenMesh_" + rxmesh::extract_file_name(Arg.obj_file_name));
 }
\ No newline at end of file
diff --git a/apps/Filtering/filtering_rxmesh.cuh b/apps/Filtering/filtering_rxmesh.cuh
index b9301112..ff528279 100644
--- a/apps/Filtering/filtering_rxmesh.cuh
+++ b/apps/Filtering/filtering_rxmesh.cuh
@@ -3,20 +3,20 @@
 #include <cuda_profiler_api.h>
 
 #include "filtering_rxmesh_kernel.cuh"
-#include "rxmesh/rxmesh_attribute.h"
+#include "rxmesh/attribute.h"
 #include "rxmesh/util/report.h"
 #include "rxmesh/util/timer.h"
 
 /**
  * filtering_rxmesh()
  */
-template <typename T, uint32_t patchSize>
-void filtering_rxmesh(RXMESH::RXMeshStatic<patchSize>&  rxmesh_static,
-                      std::vector<std::vector<T>>&      Verts,
-                      const RXMESH::RXMeshAttribute<T>& ground_truth,
-                      const size_t                      max_neighbour_size)
+template <typename T>
+void filtering_rxmesh(std::vector<std::vector<uint32_t>>& Faces,
+                      const std::vector<std::vector<T>>&  Verts,
+                      const std::vector<std::vector<T>>&  ground_truth,
+                      const size_t                        max_neighbour_size)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
 
     constexpr uint32_t maxVVSize = 20 * 4;
 
@@ -25,132 +25,115 @@ void filtering_rxmesh(RXMESH::RXMeshStatic<patchSize>&  rxmesh_static,
            "greater than maxVVSize. Should increase maxVVSize to "
         << max_neighbour_size << " to avoid illegal memory access";
 
+    RXMeshStatic rxmesh(Faces, false);
+
     // Report
     Report report("Filtering_RXMesh");
     report.command_line(Arg.argc, Arg.argv);
     report.device();
     report.system();
-    report.model_data(Arg.obj_file_name, rxmesh_static);
+    report.model_data(Arg.obj_file_name, rxmesh);
     report.add_member("method", std::string("RXMesh"));
-    std::string order = "default";
-    if (Arg.shuffle) {
-        order = "shuffle";
-    } else if (Arg.sort) {
-        order = "sorted";
-    }
-    report.add_member("input_order", order);
     report.add_member("num_filter_iter", Arg.num_filter_iter);
 
 
     // input coords
-    RXMeshAttribute<T> coords;
-    coords.set_name("coords");
-    coords.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::LOCATION_ALL);
-    for (uint32_t i = 0; i < Verts.size(); ++i) {
-        for (uint32_t j = 0; j < Verts[i].size(); ++j) {
-            coords(i, j) = Verts[i][j];
-        }
-    }
-    coords.move(RXMESH::HOST, RXMESH::DEVICE);
+    auto coords = rxmesh.add_vertex_attribute(Verts, "coords");
 
     // Vertex normals (only on device)
-    RXMeshAttribute<T> vertex_normal;
-    vertex_normal.set_name("vertex_normal");
-    vertex_normal.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::DEVICE);
-    vertex_normal.reset(0.0, RXMESH::DEVICE);
+    auto vertex_normal = rxmesh.add_vertex_attribute<T>("vn", 3, DEVICE);
+    vertex_normal->reset(0, DEVICE);
 
 
     // Filtered coordinates
-    RXMeshAttribute<T> filtered_coord;
-    filtered_coord.set_name("filtered_coord");
-    filtered_coord.init(rxmesh_static.get_num_vertices(), 3u,
-                        RXMESH::LOCATION_ALL);
-    filtered_coord.reset(0.0, RXMESH::LOCATION_ALL);
-    filtered_coord.move(RXMESH::HOST, RXMESH::DEVICE);
+    auto filtered_coord =
+        rxmesh.add_vertex_attribute<T>("filtered", 3, LOCATION_ALL);
+    filtered_coord->reset(0, LOCATION_ALL);
 
     // vertex normal launch box
     constexpr uint32_t          vn_block_threads = 256;
     LaunchBox<vn_block_threads> vn_launch_box;
-    rxmesh_static.prepare_launch_box(RXMESH::Op::FV, vn_launch_box);
+    rxmesh.prepare_launch_box(rxmesh::Op::FV,
+                              vn_launch_box,
+                              (void*)compute_vertex_normal<T, vn_block_threads>);
 
     // filter launch box
     constexpr uint32_t              filter_block_threads = 512;
     LaunchBox<filter_block_threads> filter_launch_box;
-    rxmesh_static.prepare_launch_box(RXMESH::Op::VV, filter_launch_box, true);
+    rxmesh.prepare_launch_box(
+        rxmesh::Op::VV,
+        filter_launch_box,
+        (void*)bilateral_filtering<T, filter_block_threads, maxVVSize>);
 
     // double buffer
-    RXMeshAttribute<T>* double_buffer[2] = {&coords, &filtered_coord};
+    VertexAttribute<T>* double_buffer[2] = {coords.get(), filtered_coord.get()};
 
-    cudaStream_t stream;
-    CUDA_ERROR(cudaStreamCreate(&stream));
     CUDA_ERROR(cudaProfilerStart());
     GPUTimer timer;
     timer.start();
     uint32_t d = 0;
 
     for (uint32_t itr = 0; itr < Arg.num_filter_iter; ++itr) {
-        vertex_normal.reset(0, RXMESH::DEVICE, stream);
+        vertex_normal->reset(0, rxmesh::DEVICE);
 
         // update vertex normal before filtering
         compute_vertex_normal<T, vn_block_threads>
-            <<<vn_launch_box.blocks, vn_block_threads,
-               vn_launch_box.smem_bytes_dyn, stream>>>(
-                rxmesh_static.get_context(), *double_buffer[d], vertex_normal);
+            <<<vn_launch_box.blocks,
+               vn_block_threads,
+               vn_launch_box.smem_bytes_dyn>>>(
+                rxmesh.get_context(), *double_buffer[d], *vertex_normal);
 
         bilateral_filtering<T, filter_block_threads, maxVVSize>
-            <<<filter_launch_box.blocks, filter_block_threads,
-               filter_launch_box.smem_bytes_dyn, stream>>>(
-                rxmesh_static.get_context(), *double_buffer[d],
-                *double_buffer[!d], vertex_normal);
+            <<<filter_launch_box.blocks,
+               filter_block_threads,
+               filter_launch_box.smem_bytes_dyn>>>(rxmesh.get_context(),
+                                                   *double_buffer[d],
+                                                   *double_buffer[!d],
+                                                   *vertex_normal);
 
         d = !d;
-        CUDA_ERROR(cudaStreamSynchronize(stream));
+        CUDA_ERROR(cudaDeviceSynchronize());
     }
 
     timer.stop();
-    CUDA_ERROR(cudaDeviceSynchronize());
     CUDA_ERROR(cudaGetLastError());
     CUDA_ERROR(cudaProfilerStop());
-    CUDA_ERROR(cudaStreamDestroy(stream));
     RXMESH_TRACE("filtering_rxmesh() took {} (ms) (i.e., {} ms/iter) ",
                  timer.elapsed_millis(),
                  timer.elapsed_millis() / float(Arg.num_filter_iter));
 
     // move output to host
-    coords.copy(*double_buffer[d], RXMESH::DEVICE, RXMESH::HOST);
+    coords->copy_from(*double_buffer[d], rxmesh::DEVICE, rxmesh::HOST);
 
     // output to obj
-    // rxmesh_static.exportOBJ(
-    //   "output_rxmesh" + std::to_string(Arg.num_filter_iter) + ".obj",
-    //   [&](uint32_t i, uint32_t j) { return coords(i, j); });
+    // rxmesh.export_obj(STRINGIFY(OUTPUT_DIR) "output_rxmesh" +
+    //                      std::to_string(Arg.num_filter_iter) + ".obj",
+    //                  *coords);
 
 
     // Verify
-    bool    passed = true;
     const T tol = 0.01;
-    for (uint32_t v = 0; v < coords.get_num_mesh_elements(); ++v) {
-        const Vector<3, T> gt(ground_truth(v, 0), ground_truth(v, 1),
-                              ground_truth(v, 2));
-        const Vector<3, T> co(coords(v, 0), coords(v, 1), coords(v, 2));
-
-        if (std::fabs(co[0] - gt[0]) > tol || std::fabs(co[1] - gt[1]) > tol ||
-            std::fabs(co[2] - gt[2]) > tol) {
-            passed = false;
-            break;
-        }
-    }
-
-    EXPECT_TRUE(passed);
-
-    // Release allocation
-    filtered_coord.release();
-    coords.release();
-    vertex_normal.release();
+    rxmesh.for_each_vertex(HOST, [&](const VertexHandle& vh) {
+        uint32_t           v_id = rxmesh.map_to_global(vh);
+        const Vector<3, T> gt(ground_truth[v_id][0],
+                              ground_truth[v_id][1],
+                              ground_truth[v_id][2]);
+        const Vector<3, T> co(
+            (*coords)(vh, 0), (*coords)(vh, 1), (*coords)(vh, 2));
+
+        EXPECT_LT(std::fabs((*coords)(vh, 0) - ground_truth[v_id][0]), tol);
+        EXPECT_LT(std::fabs((*coords)(vh, 1) - ground_truth[v_id][1]), tol);
+        EXPECT_LT(std::fabs((*coords)(vh, 2) - ground_truth[v_id][2]), tol);
+    });
 
     // Finalize report
     TestData td;
-    td.test_name = "Filtering";
-    td.passed.push_back(passed);
+    td.test_name   = "Filtering";
+    td.num_threads = filter_launch_box.num_threads;
+    td.num_blocks  = filter_launch_box.blocks;
+    td.dyn_smem    = filter_launch_box.smem_bytes_dyn;
+    td.static_smem = filter_launch_box.smem_bytes_static;
+    td.num_reg     = filter_launch_box.num_registers_per_thread;
     td.time_ms.push_back(timer.elapsed_millis());
     report.add_test(td);
     report.write(Arg.output_folder + "/rxmesh",
diff --git a/apps/Filtering/filtering_rxmesh_kernel.cuh b/apps/Filtering/filtering_rxmesh_kernel.cuh
index 39ddd968..b8a53300 100644
--- a/apps/Filtering/filtering_rxmesh_kernel.cuh
+++ b/apps/Filtering/filtering_rxmesh_kernel.cuh
@@ -4,10 +4,9 @@
 #include <cub/block/block_radix_sort.cuh>
 
 #include "filtering_util.h"
-#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh"
-#include "rxmesh/rxmesh_attribute.h"
-#include "rxmesh/rxmesh_context.h"
-#include "rxmesh/util/math.h"
+#include "rxmesh/attribute.h"
+#include "rxmesh/context.h"
+#include "rxmesh/kernels/query_dispatcher.cuh"
 #include "rxmesh/util/vector.h"
 
 constexpr float EPS = 10e-6;
@@ -17,35 +16,30 @@ constexpr float EPS = 10e-6;
  * compute_vertex_normal()
  */
 template <typename T, uint32_t blockThreads>
-__launch_bounds__(blockThreads, 6) __global__
-    static void compute_vertex_normal(const RXMESH::RXMeshContext context,
-                                      RXMESH::RXMeshAttribute<T>  coords,
-                                      RXMESH::RXMeshAttribute<T>  normals)
+__global__ static void compute_vertex_normal(const rxmesh::Context      context,
+                                             rxmesh::VertexAttribute<T> coords,
+                                             rxmesh::VertexAttribute<T> normals)
 {
-    using namespace RXMESH;
-    auto vn_lambda = [&](uint32_t face_id, RXMeshIterator& iter) {
+    using namespace rxmesh;
+    auto vn_lambda = [&](FaceHandle face_id, VertexIterator& fv) {
         // this face's three vertices
-        uint32_t v0(iter[0]), v1(iter[1]), v2(iter[2]);
+        VertexHandle v0(fv[0]), v1(fv[1]), v2(fv[2]);
+
+        // get the face's three vertices' coordinates
+        Vector<3, T> c0(coords(fv[0], 0), coords(fv[0], 1), coords(fv[0], 2));
+        Vector<3, T> c1(coords(fv[1], 0), coords(fv[1], 1), coords(fv[1], 2));
+        Vector<3, T> c2(coords(fv[2], 0), coords(fv[2], 1), coords(fv[2], 2));
 
         // compute the face normal
-        const Vector<3, T> v0c(coords(v0, 0), coords(v0, 1), coords(v0, 2));
-        const Vector<3, T> v1c(coords(v1, 0), coords(v1, 1), coords(v1, 2));
-        const Vector<3, T> v2c(coords(v2, 0), coords(v2, 1), coords(v2, 2));
-        Vector<3, T>       n = cross(v1c - v0c, v2c - v0c);
+        Vector<3, T> n = cross(c1 - c0, c2 - c0);
         n.normalize();
 
         // add the face's normal to its vertices
-        atomicAdd(&normals(v0, 0), n[0]);
-        atomicAdd(&normals(v0, 1), n[1]);
-        atomicAdd(&normals(v0, 2), n[2]);
-
-        atomicAdd(&normals(v1, 0), n[0]);
-        atomicAdd(&normals(v1, 1), n[1]);
-        atomicAdd(&normals(v1, 2), n[2]);
-
-        atomicAdd(&normals(v2, 0), n[0]);
-        atomicAdd(&normals(v2, 1), n[1]);
-        atomicAdd(&normals(v2, 2), n[2]);
+        for (uint32_t v = 0; v < 3; ++v) {      // for every vertex in this face
+            for (uint32_t i = 0; i < 3; ++i) {  // for the vertex's 3 coordinates
+                atomicAdd(&normals(fv[v], i), n[i]);
+            }
+        }
     };
 
     query_block_dispatcher<Op::FV, blockThreads>(context, vn_lambda);
@@ -57,26 +51,26 @@ __launch_bounds__(blockThreads, 6) __global__
  */
 template <typename T>
 __device__ __inline__ void compute_new_coordinates(
-    const uint32_t                    v_id,
-    const uint32_t                    vv[],
+    const rxmesh::VertexHandle&       v_id,
+    const rxmesh::VertexHandle        vv[],
     const uint8_t                     num_vv,
-    RXMESH::Vector<3, T>&             v,
-    const RXMESH::Vector<3, T>&       n,
+    rxmesh::Vector<3, T>&             v,
+    const rxmesh::Vector<3, T>&       n,
     const T                           sigma_c_sq,
-    const RXMESH::RXMeshAttribute<T>& input_coords,
-    RXMESH::RXMeshAttribute<T>&       filtered_coords)
+    const rxmesh::VertexAttribute<T>& input_coords,
+    rxmesh::VertexAttribute<T>&       filtered_coords)
 {
-    T sigma_s_sq =
-        compute_sigma_s_sq(v_id, vv, num_vv, v_id, v, n, input_coords);
+    T sigma_s_sq = compute_sigma_s_sq(v_id, vv, num_vv, v, n, input_coords);
 
-    T sum = 0;
+    T sum        = 0;
     T normalizer = 0;
     for (uint8_t i = 0; i < num_vv; ++i) {
-        RXMESH::Vector<3, T> q(input_coords(vv[i], 0), input_coords(vv[i], 1),
+        rxmesh::Vector<3, T> q(input_coords(vv[i], 0),
+                               input_coords(vv[i], 1),
                                input_coords(vv[i], 2));
         q -= v;
-        T t = q.norm();
-        T h = dot(q, n);
+        T t  = q.norm();
+        T h  = dot(q, n);
         T wc = exp(-0.5 * t * t / sigma_c_sq);
         T ws = exp(-0.5 * h * h / sigma_s_sq);
 
@@ -91,24 +85,27 @@ __device__ __inline__ void compute_new_coordinates(
 }
 
 /**
- * bilateral_filtering()
+ * bilateral_filtering_low_level_API()
+ * TODO refactor this to use handles
  */
-template <typename T, uint32_t blockThreads, uint32_t maxVVSize>
+/*template <typename T, uint32_t blockThreads, uint32_t maxVVSize>
 __launch_bounds__(blockThreads) __global__
     static void bilateral_filtering_low_level_API(
-        const RXMESH::RXMeshContext context,
-        RXMESH::RXMeshAttribute<T>  input_coords,
-        RXMESH::RXMeshAttribute<T>  filtered_coords,
-        RXMESH::RXMeshAttribute<T>  vertex_normals)
+        const rxmesh::Context context,
+        rxmesh::Attribute<T>  input_coords,
+        rxmesh::Attribute<T>  filtered_coords,
+        rxmesh::Attribute<T>  vertex_normals)
 {
-    using namespace RXMESH;
+    constexpr uint32_t special = 0xFFFFFFFE;
+
+    using namespace rxmesh;
     uint32_t vv[maxVVSize];
     uint32_t vv_patch[maxVVSize];
     uint16_t vv_local[maxVVSize];
 
-    uint8_t      num_vv = 0;
+    uint8_t      num_vv     = 0;
     T            sigma_c_sq = 0;
-    T            radius = 0;
+    T            radius     = 0;
     Vector<3, T> vertex, normal;
     uint32_t     v_id = INVALID32;
 
@@ -119,7 +116,7 @@ __launch_bounds__(blockThreads) __global__
 
     if (threadIdx.x == 0) {
         s_current_num_patches = 0;
-        s_num_patches = 0;
+        s_num_patches         = 0;
     }
 
     uint32_t patch_id = blockIdx.x;
@@ -128,8 +125,8 @@ __launch_bounds__(blockThreads) __global__
     // processed are within the same patch (patch_id). If a vertex within the
     // k-ring is not in the patch, it will be added to s_block_patches so the
     // whole block would process this patch later.
-    auto compute_vv_1st_level = [&](uint32_t p_id, RXMeshIterator& iter) {
-        v_id = p_id;
+    auto compute_vv_1st_level = [&](uint32_t p_id, Iterator& iter) {
+        v_id      = p_id;
         vertex[0] = input_coords(v_id, 0);
         vertex[1] = input_coords(v_id, 1);
         vertex[2] = input_coords(v_id, 2);
@@ -140,7 +137,7 @@ __launch_bounds__(blockThreads) __global__
 
         normal.normalize();
 
-        vv[0] = v_id;
+        vv[0]       = v_id;
         vv_patch[0] = INVALID32;
         ++num_vv;
 
@@ -148,7 +145,8 @@ __launch_bounds__(blockThreads) __global__
 
         for (uint32_t v = 0; v < iter.size(); ++v) {
             const uint32_t     vv_id = iter[v];
-            const Vector<3, T> q(input_coords(vv_id, 0), input_coords(vv_id, 1),
+            const Vector<3, T> q(input_coords(vv_id, 0),
+                                 input_coords(vv_id, 1),
                                  input_coords(vv_id, 2));
 
             T len = dist2(vertex, q);
@@ -173,9 +171,9 @@ __launch_bounds__(blockThreads) __global__
             if (dist <= radius) {
                 uint8_t id = num_vv++;
                 assert(id < maxVVSize);
-                vv[id] = vv_id;
+                vv[id]       = vv_id;
                 vv_local[id] = iter.neighbour_local_id(v);
-                vv_patch[id] = SPECIAL;
+                vv_patch[id] = special;
             }
         }
 
@@ -183,7 +181,7 @@ __launch_bounds__(blockThreads) __global__
         // process the 1-ring vertices that are in this patch and within
         // the radius
         uint8_t num_vv_start = 1;
-        uint8_t num_vv_end = num_vv;
+        uint8_t num_vv_end   = num_vv;
 
         while (true) {
 
@@ -194,17 +192,17 @@ __launch_bounds__(blockThreads) __global__
                 // results
                 if (vv_local[v] < iter.m_num_src_in_patch) {
 
-                    assert(vv_patch[v] == SPECIAL);
+                    assert(vv_patch[v] == special);
                     assert(context.get_vertex_patch()[vv[v]] == patch_id);
 
                     // to indicate that it's processed
                     vv_patch[v] = INVALID32;
 
-                    RXMeshIterator vv_iter(iter);
+                    Iterator vv_iter(iter);
                     vv_iter.set(vv_local[v], 0);
 
                     for (uint32_t i = 0; i < vv_iter.size(); ++i) {
-                        uint32_t vvv_id = vv_iter[i];
+                        uint32_t vvv_id       = vv_iter[i];
                         uint16_t vvv_local_id = vv_iter.neighbour_local_id(i);
 
                         // make sure that it is not a duplicate
@@ -219,9 +217,9 @@ __launch_bounds__(blockThreads) __global__
                                 uint8_t id = num_vv++;
 
                                 assert(id < maxVVSize);
-                                vv[id] = vvv_id;
+                                vv[id]       = vvv_id;
                                 vv_local[id] = vvv_local_id;
-                                vv_patch[id] = SPECIAL;
+                                vv_patch[id] = special;
                             }
                         }
                     }
@@ -253,7 +251,7 @@ __launch_bounds__(blockThreads) __global__
             // otherwise, it means we have added new vertices that might
             // fall in this patch, so we better process them now.
             num_vv_start = num_vv_end;
-            num_vv_end = num_vv;
+            num_vv_end   = num_vv;
         }
     };
 
@@ -282,13 +280,14 @@ __launch_bounds__(blockThreads) __global__
         // uniquify
         uint32_t  num_current_patches = s_num_patches - s_current_num_patches;
         uint32_t* new_end =
-            thrust::unique(thrust::device, s_block_patches,
+            thrust::unique(thrust::device,
+                           s_block_patches,
                            s_block_patches + num_current_patches);
         __syncthreads();
 
         if (threadIdx.x == 0) {
             s_current_num_patches = new_end - s_block_patches;
-            s_num_patches = s_current_num_patches;
+            s_num_patches         = s_current_num_patches;
         }
         __syncthreads();
 
@@ -301,9 +300,16 @@ __launch_bounds__(blockThreads) __global__
             uint16_t *offset_all_patches, *output_all_patches;
 
             detail::template query_block_dispatcher<Op::VV, blockThreads>(
-                context, patch_id, [](uint32_t) { return true; }, false, true,
-                num_src_in_patch, input_mapping, output_mapping,
-                offset_all_patches, output_all_patches);
+                context,
+                patch_id,
+                [](uint32_t) { return true; },
+                false,
+                true,
+                num_src_in_patch,
+                input_mapping,
+                output_mapping,
+                offset_all_patches,
+                output_all_patches);
 
 
             // means that this thread has been assigned a vertex in
@@ -334,9 +340,12 @@ __launch_bounds__(blockThreads) __global__
                         // so that we don't process it again
                         vv_patch[v] = INVALID32;
 
-                        RXMeshIterator vv_iter(
-                            vv_local_id, output_all_patches, offset_all_patches,
-                            output_mapping, 0, num_src_in_patch);
+                        Iterator vv_iter(vv_local_id,
+                                         output_all_patches,
+                                         offset_all_patches,
+                                         output_mapping,
+                                         0,
+                                         num_src_in_patch);
 
                         for (uint32_t i = 0; i < vv_iter.size(); ++i) {
                             uint32_t vvv_id = vv_iter[i];
@@ -370,8 +379,8 @@ __launch_bounds__(blockThreads) __global__
                                     // patch before so we reduce the
                                     // duplicates
                                     if (pp != patch_id) {
-                                        if (!linear_search(vv_patch, pp,
-                                                           num_vv)) {
+                                        if (!linear_search(
+                                                vv_patch, pp, num_vv)) {
                                             uint32_t d =
                                                 atomicAdd(&s_num_patches, 1u);
                                             assert(d < blockThreads);
@@ -403,33 +412,35 @@ __launch_bounds__(blockThreads) __global__
 
     if (v_id != INVALID32) {
 
-        compute_new_coordinates(v_id, vv, num_vv, vertex, normal, sigma_c_sq,
-                                input_coords, filtered_coords);
+        compute_new_coordinates(v_id,
+                                vv,
+                                num_vv,
+                                vertex,
+                                normal,
+                                sigma_c_sq,
+                                input_coords,
+                                filtered_coords);
     }
-}
+}*/
 
-
-/**
- * bilateral_filtering2()
- */
 template <typename T, uint32_t blockThreads, uint32_t maxVVSize>
-__launch_bounds__(blockThreads) __global__
-    static void bilateral_filtering(const RXMESH::RXMeshContext context,
-                                    RXMESH::RXMeshAttribute<T>  input_coords,
-                                    RXMESH::RXMeshAttribute<T>  filtered_coords,
-                                    RXMESH::RXMeshAttribute<T>  vertex_normals)
+__global__ static void bilateral_filtering(
+    const rxmesh::Context      context,
+    rxmesh::VertexAttribute<T> input_coords,
+    rxmesh::VertexAttribute<T> filtered_coords,
+    rxmesh::VertexAttribute<T> vertex_normals)
 {
-    using namespace RXMESH;
-    uint32_t vv[maxVVSize];
+    using namespace rxmesh;
+    VertexHandle vv[maxVVSize];
 
-    uint8_t      num_vv = 0;
+    uint32_t     num_vv     = 0;
     T            sigma_c_sq = 0;
-    T            radius = 0;
+    T            radius     = 0;
     Vector<3, T> vertex, normal;
-    uint32_t     v_id = INVALID32;
+    VertexHandle v_id;
 
-    auto first_ring = [&](uint32_t p_id, RXMeshIterator& iter) {
-        v_id = p_id;
+    auto first_ring = [&](VertexHandle& p_id, VertexIterator& iter) {
+        v_id      = p_id;
         vertex[0] = input_coords(v_id, 0);
         vertex[1] = input_coords(v_id, 1);
         vertex[2] = input_coords(v_id, 2);
@@ -446,8 +457,9 @@ __launch_bounds__(blockThreads) __global__
         sigma_c_sq = 1e10;
 
         for (uint32_t v = 0; v < iter.size(); ++v) {
-            const uint32_t     vv_id = iter[v];
-            const Vector<3, T> q(input_coords(vv_id, 0), input_coords(vv_id, 1),
+            const VertexHandle vv_id = iter[v];
+            const Vector<3, T> q(input_coords(vv_id, 0),
+                                 input_coords(vv_id, 1),
                                  input_coords(vv_id, 2));
 
             T len = dist2(vertex, q);
@@ -460,7 +472,7 @@ __launch_bounds__(blockThreads) __global__
 
         // add 1-ring if it is within the radius
         for (uint32_t v = 0; v < iter.size(); ++v) {
-            uint32_t vv_id = iter[v];
+            const VertexHandle vv_id = iter[v];
 
             const Vector<3, T> vvc(input_coords(vv_id, 0),
                                    input_coords(vv_id, 1),
@@ -483,15 +495,15 @@ __launch_bounds__(blockThreads) __global__
     uint32_t next_id = 1;
     while (true) {
 
-        uint32_t next_vertex = INVALID32;
-        if (v_id != INVALID32 && next_id < num_vv) {
+        VertexHandle next_vertex;
+        if (v_id.is_valid() && next_id < num_vv) {
             next_vertex = vv[next_id];
         }
-        auto n_rings = [&](uint32_t id, RXMeshIterator& iter) {
+        auto n_rings = [&](const VertexHandle& id, const VertexIterator& iter) {
             assert(id == next_vertex);
 
             for (uint32_t i = 0; i < iter.size(); ++i) {
-                uint32_t vvv_id = iter[i];
+                VertexHandle vvv_id = iter[i];
 
                 if (vvv_id != v_id) {
                     // make sure that we don't store duplicate outputs
@@ -503,7 +515,7 @@ __launch_bounds__(blockThreads) __global__
 
                         T dist = dist2(vvv, vertex);
                         if (dist <= radius) {
-                            uint8_t id = num_vv++;
+                            uint32_t id = num_vv++;
                             assert(id < maxVVSize);
                             vv[id] = vvv_id;
                         }
@@ -513,18 +525,24 @@ __launch_bounds__(blockThreads) __global__
         };
 
 
-        query_block_dispatcher<Op::VV, blockThreads>(context, next_vertex,
-                                                     n_rings);
+        higher_query_block_dispatcher<Op::VV, blockThreads>(
+            context, next_vertex, n_rings);
 
-        bool is_done = (next_id > num_vv - 1) || (v_id == INVALID32);
+        bool is_done = (next_id >= num_vv) || !v_id.is_valid();
         if (__syncthreads_and(is_done)) {
             break;
         }
         next_id++;
     }
 
-    if (v_id != INVALID32) {
-        compute_new_coordinates(v_id, vv, num_vv, vertex, normal, sigma_c_sq,
-                                input_coords, filtered_coords);
+    if (v_id.is_valid()) {
+        compute_new_coordinates(v_id,
+                                vv,
+                                num_vv,
+                                vertex,
+                                normal,
+                                sigma_c_sq,
+                                input_coords,
+                                filtered_coords);
     }
 }
\ No newline at end of file
diff --git a/apps/Filtering/filtering_util.h b/apps/Filtering/filtering_util.h
index 08092f9f..58915b5a 100644
--- a/apps/Filtering/filtering_util.h
+++ b/apps/Filtering/filtering_util.h
@@ -1,19 +1,19 @@
-#include "rxmesh/rxmesh_attribute.h"
+#include "rxmesh/attribute.h"
 
 /**
  * compute_sigma_c()
  */
 template <typename T>
 __device__ __inline__ T compute_sigma_c_sq(
-    const uint32_t                    vv[],
+    const rxmesh::VertexHandle        vv[],
     const uint8_t                     num_vv,
-    const RXMESH::Vector<3, T>&       v,
-    const RXMESH::RXMeshAttribute<T>& input_coords)
+    const rxmesh::Vector<3, T>&       v,
+    const rxmesh::VertexAttribute<T>& input_coords)
 {
 
     T sigma_c = 1e10;
     for (uint8_t i = 1; i < num_vv; ++i) {
-        const RXMESH::Vector<3, T> q(input_coords(vv[i], 0),
+        const rxmesh::Vector<3, T> q(input_coords(vv[i], 0),
                                      input_coords(vv[i], 1),
                                      input_coords(vv[i], 2));
 
@@ -30,31 +30,31 @@ __device__ __inline__ T compute_sigma_c_sq(
  */
 template <typename T>
 __device__ __inline__ T compute_sigma_s_sq(
-    const uint32_t                    v_id,
-    const uint32_t                    vv[],
+    const rxmesh::VertexHandle&       v_id,
+    const rxmesh::VertexHandle        vv[],
     const uint8_t                     num_vv,
-    uint32_t                          thread_vertex,
-    const RXMESH::Vector<3, T>&       v,
-    const RXMESH::Vector<3, T>&       n,
-    const RXMESH::RXMeshAttribute<T>& input_coords)
+    const rxmesh::Vector<3, T>&       v,
+    const rxmesh::Vector<3, T>&       n,
+    const rxmesh::VertexAttribute<T>& input_coords)
 {
 
-    T sum = 0;
+    T sum     = 0;
     T sum_sqs = 0;
 
     for (uint32_t i = 0; i < num_vv; ++i) {
-        RXMESH::Vector<3, T> q(input_coords(vv[i], 0), input_coords(vv[i], 1),
+        rxmesh::Vector<3, T> q(input_coords(vv[i], 0),
+                               input_coords(vv[i], 1),
                                input_coords(vv[i], 2));
 
         q -= v;
         T t = dot(q, n);
-        t = sqrt(t * t);
+        t   = sqrt(t * t);
         sum += t;
         sum_sqs += t * t;
     }
-    T c = static_cast<T>(num_vv);
+    T c       = static_cast<T>(num_vv);
     T sigma_s = (sum_sqs / c) - ((sum * sum) / (c * c));
-    sigma_s = (sigma_s < 1.0e-20) ? (sigma_s + 1.0e-20) : sigma_s;
+    sigma_s   = (sigma_s < 1.0e-20) ? (sigma_s + 1.0e-20) : sigma_s;
     return sigma_s;
 }
 
diff --git a/apps/Geodesic/CMakeLists.txt b/apps/Geodesic/CMakeLists.txt
index 98b3a353..d63fb18c 100644
--- a/apps/Geodesic/CMakeLists.txt
+++ b/apps/Geodesic/CMakeLists.txt
@@ -26,6 +26,8 @@ endif()
 
 set_target_properties( Geodesic PROPERTIES FOLDER "apps")
 
+set_property(TARGET Geodesic PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+
 source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "Geodesic" FILES ${SOURCE_LIST})
 
 target_link_libraries( Geodesic
diff --git a/apps/Geodesic/benchmark.sh b/apps/Geodesic/benchmark.sh
index b64cb46b..6f24ff96 100644
--- a/apps/Geodesic/benchmark.sh
+++ b/apps/Geodesic/benchmark.sh
@@ -1,5 +1,4 @@
 #!/bin/bash
-echo "This script re-generates RXMesh data in Figure 8(b) in the paper."
 echo "Please make sure to first compile the source code and then enter the input OBJ files directory."
 read -p "OBJ files directory (no trailing slash): " input_dir
 
@@ -16,7 +15,7 @@ device_id=0
 
 for file in $input_dir/*.obj; do 	 
     if [ -f "$file" ]; then
-		echo $exe -p -input "$file" -device_id $device_id
-             $exe -p -input "$file" -device_id $device_id
+		echo $exe -input "$file" -device_id $device_id
+         $exe -input "$file" -device_id $device_id
     fi 
 done
\ No newline at end of file
diff --git a/apps/Geodesic/geodesic.cu b/apps/Geodesic/geodesic.cu
index 5fd078b2..c11aa7af 100644
--- a/apps/Geodesic/geodesic.cu
+++ b/apps/Geodesic/geodesic.cu
@@ -6,23 +6,21 @@
 #include <cuda_profiler_api.h>
 #include <random>
 
-#include "../common/openmesh_trimesh.h"
 #include "gtest/gtest.h"
-#include "rxmesh/rxmesh_attribute.h"
+
+#include "../common/openmesh_trimesh.h"
+
 #include "rxmesh/rxmesh_static.h"
 #include "rxmesh/util/cuda_query.h"
-#include "rxmesh/util/export_tools.h"
 #include "rxmesh/util/import_obj.h"
 
 struct arg
 {
     std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj";
     std::string output_folder = STRINGIFY(OUTPUT_DIR);
-    uint32_t    device_id = 0;
+    uint32_t    device_id     = 0;
     char**      argv;
     int         argc;
-    bool        shuffle = false;
-    bool        sort = false;
     uint32_t    num_seeds = 1;
 
 } Arg;
@@ -30,19 +28,11 @@ struct arg
 #include "geodesic_ptp_openmesh.h"
 #include "geodesic_ptp_rxmesh.h"
 
-TEST(App, GEODESIC)
+TEST(App, Geodesic)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     using dataT = float;
 
-    if (Arg.shuffle) {
-        ASSERT_FALSE(Arg.sort) << " cannot shuffle and sort at the same time!";
-    }
-    if (Arg.sort) {
-        ASSERT_FALSE(Arg.shuffle)
-            << " cannot shuffle and sort at the same time!";
-    }
-
     // Select device
     cuda_query(Arg.device_id);
 
@@ -50,83 +40,48 @@ TEST(App, GEODESIC)
     // Load mesh
     std::vector<std::vector<dataT>>    Verts;
     std::vector<std::vector<uint32_t>> Faces;
-
     ASSERT_TRUE(import_obj(Arg.obj_file_name, Verts, Faces));
 
-    if (Arg.shuffle) {
-        shuffle_obj(Faces, Verts);
-    }
+    RXMeshStatic rxmesh(Faces, false);
+    ASSERT_TRUE(rxmesh.is_closed())
+        << "Geodesic only works on watertight/closed manifold mesh without "
+           "boundaries";
+    ASSERT_TRUE(rxmesh.is_edge_manifold())
+        << "Geodesic only works on watertight/closed manifold mesh without "
+           "boundaries";
 
-    // Create RXMeshStatic instance. If Arg.sort is true, Faces and Verts will
-    // be sorted based on the patching happening inside RXMesh
-    RXMeshStatic<PATCH_SIZE> rxmesh_static(Faces, Verts, Arg.sort, false);
-    ASSERT_TRUE(rxmesh_static.is_closed()) << "Geodesic only works on watertight/closed manifold mesh without boundaries";
-    ASSERT_TRUE(rxmesh_static.is_edge_manifold())<< "Geodesic only works on watertight/closed manifold mesh without boundaries";
-    
-    // Since OpenMesh only accepts input as obj files, if the input mesh is
-    // shuffled or sorted, we have to write it to a temp file so that OpenMesh
-    // can pick it up
-    TriMesh input_mesh;
-    if (Arg.sort || Arg.shuffle) {
-        export_obj(Faces, Verts, "temp.obj", false);
-        ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, "temp.obj"));
-    } else {
-        ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name));
-    }
 
     // Generate Seeds
     std::vector<uint32_t> h_seeds(Arg.num_seeds);
     std::random_device    dev;
     std::mt19937          rng(dev());
-    std::uniform_int_distribution<std::mt19937::result_type> dist(
-        0, rxmesh_static.get_num_vertices());
+    std::uniform_int_distribution<std::mt19937::result_type> dist(
+        0, Verts.size() - 1);
     for (auto& s : h_seeds) {
         s = dist(rng);
         // s = 0;
     }
 
 
-    //*** OpenMesh Impl
-    RXMeshAttribute<dataT> ground_truth;
-
     // Save a map from vertex id to topleset (number of hops from
     // (closest?) source). It's used by OpenMesh to help construct
     // sorted_index and limit. We keep it for RXMesh because it is
     // used to quickly determine whether or not a vertex is within
     // the "update band".
-    RXMeshAttribute<uint32_t> toplesets("toplesets");
-    toplesets.init(Verts.size(), 1u,
-                   RXMESH::HOST);  // will move() to DEVICE later
-
-
+    std::vector<uint32_t> toplesets(Verts.size(), 1u);
     std::vector<uint32_t> sorted_index;
     std::vector<uint32_t> limits;
-    geodesic_ptp_openmesh(input_mesh, h_seeds, ground_truth, sorted_index,
-                          limits, toplesets);
-
-    // export_attribute_VTK("geo_openmesh.vtk", Faces, Verts, false,
-    //                     ground_truth.operator->(),
-    //                     ground_truth.operator->());
-
-    // Now that OpenMesh has calculated the toplesets,
-    // move to DEVICE -- it's needed by RXMesh version
-    toplesets.move(RXMESH::HOST, RXMESH::DEVICE);
+    geodesic_ptp_openmesh(
+        Faces, Verts, h_seeds, sorted_index, limits, toplesets);
 
-
-    //*** RXMesh Impl
-    EXPECT_TRUE(geodesic_rxmesh(rxmesh_static, Faces, Verts, h_seeds,
-                                ground_truth, sorted_index, limits, toplesets))
-        << "RXMesh failed!!";
-
-
-    // Release allocation
-    ground_truth.release();
-    toplesets.release();
+    // RXMesh Impl
+    geodesic_rxmesh(
+        rxmesh, Faces, Verts, h_seeds, sorted_index, limits, toplesets);
 }
 
 int main(int argc, char** argv)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     Log::init();
 
     ::testing::InitGoogleTest(&argc, argv);
@@ -143,9 +98,7 @@ int main(int argc, char** argv)
                         "              Default is {} \n"
                         "              Hint: Only accepts OBJ files\n"
                         " -o:          JSON file output folder. Default is {} \n"
-                       // "-num_seeds:   Number of input seeds. Default is {}\n"                        
-                        " -s:          Shuffle input. Default is false.\n"
-                        " -p:          Sort input using patching output. Default is false.\n"
+                       // "-num_seeds:   Number of input seeds. Default is {}\n"
                         " -device_id:  GPU device ID. Default is {}",
             Arg.obj_file_name, Arg.output_folder ,Arg.num_seeds, Arg.device_id);
             // clang-format on
@@ -160,12 +113,6 @@ int main(int argc, char** argv)
             Arg.output_folder =
                 std::string(get_cmd_option(argv, argv + argc, "-o"));
         }
-        if (cmd_option_exists(argv, argc + argv, "-s")) {
-            Arg.shuffle = true;
-        }
-        if (cmd_option_exists(argv, argc + argv, "-p")) {
-            Arg.sort = true;
-        }
         if (cmd_option_exists(argv, argc + argv, "-device_id")) {
             Arg.device_id =
                 atoi(get_cmd_option(argv, argv + argc, "-device_id"));
diff --git a/apps/Geodesic/geodesic_kernel.cuh b/apps/Geodesic/geodesic_kernel.cuh
index 5c8246ca..046d17ae 100644
--- a/apps/Geodesic/geodesic_kernel.cuh
+++ b/apps/Geodesic/geodesic_kernel.cuh
@@ -1,8 +1,8 @@
 #pragma once
 
-#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh"
-#include "rxmesh/rxmesh_attribute.h"
-#include "rxmesh/rxmesh_context.h"
+#include "rxmesh/attribute.h"
+#include "rxmesh/context.h"
+#include "rxmesh/kernels/query_dispatcher.cuh"
 #include "rxmesh/util/vector.h"
 
 /**
@@ -10,14 +10,14 @@
  */
 template <typename T>
 __device__ __inline__ T update_step(
-    const uint32_t                    v0_id,
-    const uint32_t                    v1_id,
-    const uint32_t                    v2_id,
-    const RXMESH::RXMeshAttribute<T>& geo_distance,
-    const RXMESH::RXMeshAttribute<T>& coords,
+    const rxmesh::VertexHandle&       v0_id,
+    const rxmesh::VertexHandle&       v1_id,
+    const rxmesh::VertexHandle&       v2_id,
+    const rxmesh::VertexAttribute<T>& geo_distance,
+    const rxmesh::VertexAttribute<T>& coords,
     const T                           infinity_val)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     const Vector<3, T> v0(coords(v0_id, 0), coords(v0_id, 1), coords(v0_id, 2));
     const Vector<3, T> v1(coords(v1_id, 0), coords(v1_id, 1), coords(v1_id, 2));
     const Vector<3, T> v2(coords(v2_id, 0), coords(v2_id, 1), coords(v2_id, 2));
@@ -44,14 +44,14 @@ __device__ __inline__ T update_step(
     Q[1][1] = q[0][0] / det;
 
     T delta = t[0] * (Q[0][0] + Q[1][0]) + t[1] * (Q[0][1] + Q[1][1]);
-    T dis = delta * delta -
+    T dis   = delta * delta -
             (Q[0][0] + Q[0][1] + Q[1][0] + Q[1][1]) *
                 (t[0] * t[0] * Q[0][0] + t[0] * t[1] * (Q[1][0] + Q[0][1]) +
                  t[1] * t[1] * Q[1][1] - 1);
     T p = (delta + std::sqrt(dis)) / (Q[0][0] + Q[0][1] + Q[1][0] + Q[1][1]);
     T tp[2];
-    tp[0] = t[0] - p;
-    tp[1] = t[1] - p;
+    tp[0]                = t[0] - p;
+    tp[1]                = t[1] - p;
     const Vector<3, T> n = (x0 * Q[0][0] + x1 * Q[1][0]) * tp[0] +
                            (x0 * Q[0][1] + x1 * Q[1][1]) * tp[1];
     T cond[2];
@@ -67,48 +67,48 @@ __device__ __inline__ T update_step(
         T dp[2];
         dp[0] = geo_distance(v1_id) + x0.norm();
         dp[1] = geo_distance(v2_id) + x1.norm();
-        p = dp[dp[1] < dp[0]];
+        p     = dp[dp[1] < dp[0]];
     }
     return p;
 }
 
 
 template <typename T, uint32_t blockThreads>
-__launch_bounds__(blockThreads) __global__ static void relax_ptp_rxmesh(
-    const RXMESH::RXMeshContext             context,
-    const RXMESH::RXMeshAttribute<T>        coords,
-    RXMESH::RXMeshAttribute<T>              new_geo_dist,
-    const RXMESH::RXMeshAttribute<T>        old_geo_dist,
-    const RXMESH::RXMeshAttribute<uint32_t> toplesets,
+__global__ static void relax_ptp_rxmesh(
+    const rxmesh::Context                   context,
+    const rxmesh::VertexAttribute<T>        coords,
+    rxmesh::VertexAttribute<T>              new_geo_dist,
+    const rxmesh::VertexAttribute<T>        old_geo_dist,
+    const rxmesh::VertexAttribute<uint32_t> toplesets,
     const uint32_t                          band_start,
     const uint32_t                          band_end,
     uint32_t*                               d_error,
     const T                                 infinity_val,
     const T                                 error_tol)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
 
-    auto in_active_set = [&](uint32_t p_id) {
+    auto in_active_set = [&](VertexHandle p_id) {
         uint32_t my_band = toplesets(p_id);
         return my_band >= band_start && my_band < band_end;
     };
 
-    auto geo_lambda = [&](uint32_t p_id, RXMeshIterator& iter) {
+    auto geo_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) {
         // this vertex (p_id) update_band
         uint32_t my_band = toplesets(p_id);
 
         // this is the last vertex in the one-ring (before r_id)
-        uint32_t q_id = iter.back();
+        auto q_id = iter.back();
 
         // one-ring enumeration
         T current_dist = old_geo_dist(p_id);
-        T new_dist = current_dist;
+        T new_dist     = current_dist;
         for (uint32_t v = 0; v < iter.size(); ++v) {
             // the current one ring vertex
-            uint32_t r_id = iter[v];
+            auto r_id = iter[v];
 
-            T dist = update_step(p_id, q_id, r_id, old_geo_dist, coords,
-                                 infinity_val);
+            T dist = update_step(
+                p_id, q_id, r_id, old_geo_dist, coords, infinity_val);
             if (dist < new_dist) {
                 new_dist = dist;
             }
@@ -126,6 +126,6 @@ __launch_bounds__(blockThreads) __global__ static void relax_ptp_rxmesh(
     };
 
 
-    query_block_dispatcher<Op::VV, blockThreads>(context, geo_lambda,
-                                                 in_active_set, true);
+    query_block_dispatcher<Op::VV, blockThreads>(
+        context, geo_lambda, in_active_set, true);
 }
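
The refactored kernel above illustrates the general recipe for a per-vertex one-ring computation after the handle-based rewrite: a predicate restricts work to an active set and a lambda receives the vertex handle plus its one-ring iterator. A hedged skeleton using only the types and the dispatcher call that appear in this diff (the kernel name and attribute are illustrative, and launching still goes through prepare_launch_box as shown elsewhere in this patch):

    #include "rxmesh/attribute.h"
    #include "rxmesh/context.h"
    #include "rxmesh/kernels/query_dispatcher.cuh"

    template <typename T, uint32_t blockThreads>
    __global__ static void vertex_valence(const rxmesh::Context      context,
                                          rxmesh::VertexAttribute<T> valence)
    {
        using namespace rxmesh;

        // process every vertex; a predicate could restrict this to a band,
        // as in_active_set does for relax_ptp_rxmesh above
        auto in_active_set = [&](VertexHandle p_id) { return true; };

        // store the one-ring size of each vertex
        auto vv_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) {
            valence(p_id) = static_cast<T>(iter.size());
        };

        query_block_dispatcher<Op::VV, blockThreads>(
            context, vv_lambda, in_active_set, true);
    }
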
diff --git a/apps/Geodesic/geodesic_ptp_openmesh.h b/apps/Geodesic/geodesic_ptp_openmesh.h
index db7f0e36..e8ad2569 100644
--- a/apps/Geodesic/geodesic_ptp_openmesh.h
+++ b/apps/Geodesic/geodesic_ptp_openmesh.h
@@ -9,15 +9,15 @@
 #include "../common/openmesh_report.h"
 #include "../common/openmesh_trimesh.h"
 #include "gtest/gtest.h"
-#include "rxmesh/rxmesh_attribute.h"
+#include "rxmesh/util/export_tools.h"
 #include "rxmesh/util/report.h"
 #include "rxmesh/util/timer.h"
 
-inline float compute_toplesets(TriMesh&                           mesh,
-                               std::vector<uint32_t>&             sorted_index,
-                               std::vector<uint32_t>&             limits,
-                               RXMESH::RXMeshAttribute<uint32_t>& toplesets,
-                               const std::vector<uint32_t>&       h_seeds)
+inline float compute_toplesets(TriMesh&                     mesh,
+                               std::vector<uint32_t>&       sorted_index,
+                               std::vector<uint32_t>&       limits,
+                               std::vector<uint32_t>&       toplesets,
+                               const std::vector<uint32_t>& h_seeds)
 {
     limits.clear();
     limits.reserve(mesh.n_vertices() / 2);
@@ -26,34 +26,36 @@ inline float compute_toplesets(TriMesh&                           mesh,
         return 0;
     }
 
-    RXMESH::CPUTimer timer;
+    rxmesh::CPUTimer timer;
     timer.start();
 
-    toplesets.reset(INVALID32, RXMESH::HOST);
+    toplesets.clear();
+    toplesets.resize(mesh.n_vertices(), INVALID32);
     uint32_t level = 0;
-    uint32_t p = 0;
+    uint32_t p     = 0;
     for (const uint32_t& s : h_seeds) {
         sorted_index[p] = s;
         p++;
-        if (toplesets(s) == INVALID32) {
-            toplesets(s) = level;
+        if (toplesets[s] == INVALID32) {
+            toplesets[s] = level;
         }
     }
 
     limits.push_back(0);
     for (uint32_t i = 0; i < p; i++) {
         const uint32_t v = sorted_index[i];
-        if (toplesets(v) > level) {
+        if (toplesets[v] > level) {
             level++;
             limits.push_back(i);
         }
 
         TriMesh::VertexIter v_iter = mesh.vertices_begin() + v;
         for (TriMesh::VertexVertexIter vv_iter = mesh.vv_iter(*v_iter);
-             vv_iter.is_valid(); ++vv_iter) {
+             vv_iter.is_valid();
+             ++vv_iter) {
             int vv = (*vv_iter).idx();
-            if (toplesets(vv) == INVALID32) {
-                toplesets(vv) = toplesets(v) + 1;
+            if (toplesets[vv] == INVALID32) {
+                toplesets[vv]   = toplesets[v] + 1;
                 sorted_index[p] = vv;
                 p++;
             }
@@ -68,18 +70,18 @@ inline float compute_toplesets(TriMesh&                           mesh,
             "compute_toplesets() could not compute toplesets for all "
             "vertices maybe because the input is not manifold or contain "
             "duplicate vertices!");
-        exit(0);
+        exit(EXIT_FAILURE);
     }
     timer.stop();
     return timer.elapsed_millis();
 }
 
 template <typename T>
-inline T update_step(TriMesh&                    mesh,
-                     const uint32_t              v0,
-                     const uint32_t              v1,
-                     const uint32_t              v2,
-                     RXMESH::RXMeshAttribute<T>& geo_distance)
+inline T update_step(TriMesh&        mesh,
+                     const uint32_t  v0,
+                     const uint32_t  v1,
+                     const uint32_t  v2,
+                     std::vector<T>& geo_distance)
 {
     TriMesh::VertexIter v0_it = mesh.vertices_begin() + v0;
     TriMesh::VertexIter v1_it = mesh.vertices_begin() + v1;
@@ -90,8 +92,8 @@ inline T update_step(TriMesh&                    mesh,
 
 
     T t[2];
-    t[0] = geo_distance(v1);
-    t[1] = geo_distance(v2);
+    t[0] = geo_distance[v1];
+    t[1] = geo_distance[v2];
 
     T q[2][2];
     q[0][0] = (X0 | X0);  // X0 dot_product X0
@@ -108,7 +110,7 @@ inline T update_step(TriMesh&                    mesh,
     Q[1][1] = q[0][0] / det;
 
     T delta = t[0] * (Q[0][0] + Q[1][0]) + t[1] * (Q[0][1] + Q[1][1]);
-    T dis = delta * delta -
+    T dis   = delta * delta -
             (Q[0][0] + Q[0][1] + Q[1][0] + Q[1][1]) *
                 (t[0] * t[0] * Q[0][0] + t[0] * t[1] * (Q[1][0] + Q[0][1]) +
                  t[1] * t[1] * Q[1][1] - 1);
@@ -138,8 +140,8 @@ inline T update_step(TriMesh&                    mesh,
         t[1] == std::numeric_limits<T>::infinity() || dis < 0 || c[0] >= 0 ||
         c[1] >= 0) {
         T dp[2];
-        dp[0] = geo_distance(v1) + X0.norm();
-        dp[1] = geo_distance(v2) + X1.norm();
+        dp[0] = geo_distance[v1] + X0.norm();
+        dp[1] = geo_distance[v2] + X1.norm();
 
         p = dp[dp[1] < dp[0]];
     }
@@ -151,31 +153,27 @@ inline float toplesets_propagation(TriMesh&                     mesh,
                                    const std::vector<uint32_t>& h_seeds,
                                    const std::vector<uint32_t>& limits,
                                    const std::vector<uint32_t>& sorted_index,
-                                   RXMESH::RXMeshAttribute<T>&  geo_distance,
+                                   std::vector<T>&              geo_distance,
                                    uint32_t&                    iter)
 {
     // second buffer for geodesic distance
-    RXMESH::RXMeshAttribute<T> geo_distance_2;
-    geo_distance_2.init(mesh.n_vertices(), 1u, RXMESH::HOST);
-    geo_distance_2.reset(std::numeric_limits<T>::infinity(), RXMESH::HOST);
-    geo_distance.reset(std::numeric_limits<T>::infinity(), RXMESH::HOST);
-    RXMESH::RXMeshAttribute<T>* double_buffer[2] = {&geo_distance,
-                                                    &geo_distance_2};
+    std::vector<T>  geo_distance_2(geo_distance);
+    std::vector<T>* double_buffer[2] = {&geo_distance, &geo_distance_2};
     // error buffer
     std::vector<T> error(mesh.n_vertices(), 0);
 
-    RXMESH::CPUTimer timer;
+    rxmesh::CPUTimer timer;
     timer.start();
 
     // source distance
     for (auto v : h_seeds) {
-        geo_distance(v) = 0;
-        geo_distance_2(v) = 0;
+        geo_distance[v]   = 0;
+        geo_distance_2[v] = 0;
     }
 
     uint32_t d = 0;
     uint32_t i(1), j(2);
-    iter = 0;
+    iter              = 0;
     uint32_t max_iter = 2 * limits.size();
 
     while (i < j && iter < max_iter) {
@@ -184,15 +182,15 @@ inline float toplesets_propagation(TriMesh&                     mesh,
             i = j / 2;
         }
 
-        const uint32_t start = limits[i];
-        const uint32_t end = limits[j];
+        const uint32_t start  = limits[i];
+        const uint32_t end    = limits[j];
         const uint32_t n_cond = limits[i + 1] - start;
 
         for (uint32_t vi = start; vi < end; vi++) {
-            const uint32_t      v = sorted_index[vi];
+            const uint32_t      v      = sorted_index[vi];
             TriMesh::VertexIter v_iter = mesh.vertices_begin() + v;
 
-            double_buffer[!d]->operator()(v) = double_buffer[d]->operator()(v);
+            (*double_buffer[!d])[v] = (*double_buffer[d])[v];
 
 
             // The last vertex in v one ring
@@ -202,7 +200,8 @@ inline float toplesets_propagation(TriMesh&                     mesh,
 
             // iterate over one-ring
             for (TriMesh::VertexVertexIter vv_iter = mesh.vv_iter(*v_iter);
-                 vv_iter.is_valid(); ++vv_iter) {
+                 vv_iter.is_valid();
+                 ++vv_iter) {
 
                 // current vv
                 uint32_t vv_id = (*vv_iter).idx();
@@ -212,8 +211,8 @@ inline float toplesets_propagation(TriMesh&                     mesh,
 
                 // working on triangle v,vv_id, p_id
                 T dist = update_step(mesh, v, p_id, vv_id, *double_buffer[d]);
-                if (dist < double_buffer[!d]->operator()(v)) {
-                    double_buffer[!d]->operator()(v) = dist;
+                if (dist < (*double_buffer[!d])[v]) {
+                    (*double_buffer[!d])[v] = dist;
                 }
 
 
@@ -225,9 +224,9 @@ inline float toplesets_propagation(TriMesh&                     mesh,
         // calc error
         for (uint32_t vi = start; vi < start + n_cond; vi++) {
             const uint32_t v = sorted_index[vi];
-            error[vi] = std::abs(double_buffer[!d]->operator()(v) -
-                                 double_buffer[d]-> operator()(v)) /
-                        double_buffer[d]->operator()(v);
+            error[vi] =
+                std::abs((*double_buffer[!d])[v] - (*double_buffer[d])[v]) /
+                (*double_buffer[d])[v];
         }
 
         uint32_t count = 0;
@@ -247,23 +246,26 @@ inline float toplesets_propagation(TriMesh&                     mesh,
     timer.stop();
 
     // copy most updated results (if needed)
-    if (geo_distance.operator->() != double_buffer[!d]->operator->()) {
-        geo_distance.copy(*(double_buffer[!d]), RXMESH::HOST, RXMESH::HOST);
+    if (&geo_distance != double_buffer[!d]) {
+        for (size_t i = 0; i < geo_distance.size(); ++i) {
+            geo_distance[i] = geo_distance_2[i];
+        }
     }
 
-    geo_distance_2.release();
-
     return timer.elapsed_millis();
 }
 
 template <typename T>
-void geodesic_ptp_openmesh(TriMesh&                           input_mesh,
-                           const std::vector<uint32_t>&       h_seeds,
-                           RXMESH::RXMeshAttribute<T>&        geo_distance,
-                           std::vector<uint32_t>&             sorted_index,
-                           std::vector<uint32_t>&             limits,
-                           RXMESH::RXMeshAttribute<uint32_t>& toplesets)
+void geodesic_ptp_openmesh(const std::vector<std::vector<uint32_t>>& Faces,
+                           const std::vector<std::vector<T>>&        Verts,
+                           const std::vector<uint32_t>&              h_seeds,
+                           std::vector<uint32_t>& sorted_index,
+                           std::vector<uint32_t>& limits,
+                           std::vector<uint32_t>& toplesets)
 {
+    TriMesh input_mesh;
+    ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name));
+
     // Report
     OpenMeshReport report("Geodesic_OpenMesh");
     report.command_line(Arg.argc, Arg.argv);
@@ -272,19 +274,13 @@ void geodesic_ptp_openmesh(TriMesh&                           input_mesh,
     report.add_member("seeds", h_seeds);
     std::string method = "OpenMeshSingleCore";
     report.add_member("method", method);
-    std::string order = "default";
-    if (Arg.shuffle) {
-        order = "shuffle";
-    } else if (Arg.sort) {
-        order = "sorted";
-    }
-    report.add_member("input_order", order);
 
+    ASSERT_TRUE(Faces.size() == input_mesh.n_faces());
+    ASSERT_TRUE(Verts.size() == input_mesh.n_vertices());
 
-    // Geodesic distance attribute for all vertices
-    geo_distance.set_name("GeodesicDistance");
-    geo_distance.init(input_mesh.n_vertices(), 1u, RXMESH::HOST);
-    geo_distance.reset(std::numeric_limits<T>::infinity(), RXMESH::HOST);
+
+    std::vector<T> geo_distance(input_mesh.n_vertices(),
+                                std::numeric_limits<T>::infinity());
 
     // sorted indices for toplesets
     sorted_index.clear();
@@ -295,26 +291,34 @@ void geodesic_ptp_openmesh(TriMesh&                           input_mesh,
     // compute toplesets
     float compute_toplesets_time =
         compute_toplesets(input_mesh, sorted_index, limits, toplesets, h_seeds);
+
     RXMESH_TRACE("OpenMesh: Computing toplesets took {} (ms)",
                  compute_toplesets_time);
+
     report.add_member("compute_toplesets_time", compute_toplesets_time);
 
     // compute geodesic distance
-    uint32_t iter = 0;
+    uint32_t iter            = 0;
     float    processing_time = toplesets_propagation(
         input_mesh, h_seeds, limits, sorted_index, geo_distance, iter);
     RXMESH_TRACE("geodesic_ptp_openmesh() took {} (ms)", processing_time);
 
+    // export_attribute_VTK("geo_openmesh.vtk",
+    //                     Faces,
+    //                     Verts,
+    //                     false,
+    //                     geo_distance.data(),
+    //                     geo_distance.data());
 
     // Finalize report
     report.add_member("num_iter_taken", iter);
-    RXMESH::TestData td;
-    td.test_name = "Geodesic";
+    rxmesh::TestData td;
+    td.test_name   = "Geodesic";
     td.num_threads = 1;
     td.time_ms.push_back(processing_time);
     td.passed.push_back(true);
     report.add_test(td);
     report.write(
         Arg.output_folder + "/openmesh",
-        "Geodesic_OpenMesh" + RXMESH::extract_file_name(Arg.obj_file_name));
+        "Geodesic_OpenMesh" + rxmesh::extract_file_name(Arg.obj_file_name));
 }
\ No newline at end of file
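
compute_toplesets() above is essentially a breadth-first traversal from the seeds: it records which ring (topleset) each vertex belongs to and the prefix offsets of each ring in visitation order. A container-only sketch of the same idea over a plain adjacency list, for illustration only (it assumes unique seeds and omits the manifold/duplicate-vertex check in the code above):

    #include <cstdint>
    #include <vector>

    inline void toplesets_bfs(const std::vector<std::vector<uint32_t>>& adj,
                              const std::vector<uint32_t>&              seeds,
                              std::vector<uint32_t>&                    sorted_index,
                              std::vector<uint32_t>&                    limits,
                              std::vector<uint32_t>&                    level_of)
    {
        const uint32_t INVALID = uint32_t(-1);  // stands in for INVALID32
        level_of.assign(adj.size(), INVALID);
        sorted_index.assign(adj.size(), INVALID);
        limits.clear();

        uint32_t p = 0, level = 0;
        for (uint32_t s : seeds) {  // seeds form topleset 0
            sorted_index[p++] = s;
            level_of[s]       = level;
        }
        limits.push_back(0);
        for (uint32_t i = 0; i < p; ++i) {
            const uint32_t v = sorted_index[i];
            if (level_of[v] > level) {  // first vertex of the next ring
                ++level;
                limits.push_back(i);
            }
            for (uint32_t vv : adj[v]) {
                if (level_of[vv] == INVALID) {
                    level_of[vv]      = level_of[v] + 1;
                    sorted_index[p++] = vv;
                }
            }
        }
        limits.push_back(p);  // close the last ring
    }
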
diff --git a/apps/Geodesic/geodesic_ptp_rxmesh.h b/apps/Geodesic/geodesic_ptp_rxmesh.h
index 75205f92..11a2c819 100644
--- a/apps/Geodesic/geodesic_ptp_rxmesh.h
+++ b/apps/Geodesic/geodesic_ptp_rxmesh.h
@@ -6,17 +6,16 @@
 
 constexpr float EPS = 10e-6;
 
-template <typename T, uint32_t patchSize>
-inline bool geodesic_rxmesh(RXMESH::RXMeshStatic<patchSize>&    rxmesh_static,
-                            std::vector<std::vector<uint32_t>>& Faces,
-                            std::vector<std::vector<T>>&        Verts,
-                            const std::vector<uint32_t>&        h_seeds,
-                            const RXMESH::RXMeshAttribute<T>&   ground_truth,
-                            const std::vector<uint32_t>&        h_sorted_index,
-                            const std::vector<uint32_t>&        h_limits,
-                            const RXMESH::RXMeshAttribute<uint32_t>& toplesets)
+template <typename T>
+inline void geodesic_rxmesh(rxmesh::RXMeshStatic&                     rxmesh,
+                            const std::vector<std::vector<uint32_t>>& Faces,
+                            const std::vector<std::vector<T>>&        Verts,
+                            const std::vector<uint32_t>&              h_seeds,
+                            const std::vector<uint32_t>& h_sorted_index,
+                            const std::vector<uint32_t>& h_limits,
+                            const std::vector<uint32_t>& toplesets)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     constexpr uint32_t blockThreads = 256;
 
     // Report
@@ -24,49 +23,45 @@ inline bool geodesic_rxmesh(RXMESH::RXMeshStatic<patchSize>&    rxmesh_static,
     report.command_line(Arg.argc, Arg.argv);
     report.device();
     report.system();
-    report.model_data(Arg.obj_file_name, rxmesh_static);
+    report.model_data(Arg.obj_file_name, rxmesh);
     report.add_member("seeds", h_seeds);
     report.add_member("method", std::string("RXMesh"));
-    std::string order = "default";
-    if (Arg.shuffle) {
-        order = "shuffle";
-    } else if (Arg.sort) {
-        order = "sorted";
-    }
-    report.add_member("input_order", order);
-
 
     // input coords
-    RXMESH::RXMeshAttribute<T> input_coord;
-    input_coord.set_name("coord");
-    input_coord.init(Verts.size(), 3u, RXMESH::LOCATION_ALL);
-    for (uint32_t i = 0; i < Verts.size(); ++i) {
-        for (uint32_t j = 0; j < Verts[i].size(); ++j) {
-            input_coord(i, j) = Verts[i][j];
-        }
-    }
-    input_coord.change_layout(RXMESH::HOST);
-    input_coord.move(RXMESH::HOST, RXMESH::DEVICE);
+    auto input_coord = rxmesh.add_vertex_attribute(Verts, "coord");
+
+    // toplesets
+    auto d_toplesets = rxmesh.add_vertex_attribute(toplesets, "topleset");
+
 
     // RXMesh launch box
     LaunchBox<blockThreads> launch_box;
-    rxmesh_static.prepare_launch_box(RXMESH::Op::VV, launch_box, false, true);
+    rxmesh.prepare_launch_box(rxmesh::Op::VV,
+                              launch_box,
+                              (void*)relax_ptp_rxmesh<T, blockThreads>,
+                              true);
 
 
     // Geodesic distance attribute for all vertices (seeds set to zero
     // and infinity otherwise)
-    RXMeshAttribute<T> rxmesh_geo;
-    rxmesh_geo.init(rxmesh_static.get_num_vertices(), 1u, RXMESH::LOCATION_ALL);
-    rxmesh_geo.reset(std::numeric_limits<T>::infinity(), RXMESH::HOST);
-    for (uint32_t v : h_seeds) {
-        rxmesh_geo(v) = 0;
-    }
-    rxmesh_geo.move(RXMESH::HOST, RXMESH::DEVICE);
+    auto rxmesh_geo = rxmesh.add_vertex_attribute<T>("geo", 1u);
+    rxmesh_geo->reset(std::numeric_limits<T>::infinity(), rxmesh::HOST);
+    rxmesh.for_each_vertex(rxmesh::HOST, [&](const VertexHandle vh) {
+        uint32_t v_id = rxmesh.map_to_global(vh);
+        for (uint32_t s : h_seeds) {
+            if (s == v_id) {
+                (*rxmesh_geo)(vh) = 0;
+                break;
+            }
+        }
+    });
+    rxmesh_geo->move(rxmesh::HOST, rxmesh::DEVICE);
 
     // second buffer for geodesic distance for double buffering
-    RXMeshAttribute<T> rxmesh_geo_2;
-    rxmesh_geo_2.init(rxmesh_static.get_num_vertices(), 1u, RXMESH::DEVICE);
-    rxmesh_geo_2.copy(rxmesh_geo, RXMESH::DEVICE, RXMESH::DEVICE);
+    auto rxmesh_geo_2 =
+        rxmesh.add_vertex_attribute<T>("geo2", 1u, rxmesh::DEVICE);
+
+    rxmesh_geo_2->copy_from(*rxmesh_geo, rxmesh::DEVICE, rxmesh::DEVICE);
 
 
     // Error
@@ -74,7 +69,8 @@ inline bool geodesic_rxmesh(RXMESH::RXMeshStatic<patchSize>&    rxmesh_static,
     CUDA_ERROR(cudaMalloc((void**)&d_error, sizeof(uint32_t)));
 
     // double buffer
-    RXMeshAttribute<T>* double_buffer[2] = {&rxmesh_geo, &rxmesh_geo_2};
+    VertexAttribute<T>* double_buffer[2] = {rxmesh_geo.get(),
+                                            rxmesh_geo_2.get()};
 
     // start time
     GPUTimer timer;
@@ -83,7 +79,7 @@ inline bool geodesic_rxmesh(RXMESH::RXMeshStatic<patchSize>&    rxmesh_static,
     // actual computation
     uint32_t d = 0;
     uint32_t i(1), j(2);
-    uint32_t iter = 0;
+    uint32_t iter     = 0;
     uint32_t max_iter = 2 * h_limits.size();
     while (i < j && iter < max_iter) {
         iter++;
@@ -94,12 +90,19 @@ inline bool geodesic_rxmesh(RXMESH::RXMeshStatic<patchSize>&    rxmesh_static,
         // compute new geodesic
         relax_ptp_rxmesh<T, blockThreads>
             <<<launch_box.blocks, blockThreads, launch_box.smem_bytes_dyn>>>(
-                rxmesh_static.get_context(), input_coord, *double_buffer[!d],
-                *double_buffer[d], toplesets, i, j, d_error,
-                std::numeric_limits<T>::infinity(), T(1e-3));
-
-        CUDA_ERROR(cudaMemcpy(&h_error, d_error, sizeof(uint32_t),
-                              cudaMemcpyDeviceToHost));
+                rxmesh.get_context(),
+                *input_coord,
+                *double_buffer[!d],
+                *double_buffer[d],
+                *d_toplesets,
+                i,
+                j,
+                d_error,
+                std::numeric_limits<T>::infinity(),
+                T(1e-3));
+
+        CUDA_ERROR(cudaMemcpy(
+            &h_error, d_error, sizeof(uint32_t), cudaMemcpyDeviceToHost));
         CUDA_ERROR(cudaMemset(d_error, 0, sizeof(uint32_t)));
 
 
@@ -120,38 +123,33 @@ inline bool geodesic_rxmesh(RXMESH::RXMeshStatic<patchSize>&    rxmesh_static,
     CUDA_ERROR(cudaGetLastError());
     CUDA_ERROR(cudaProfilerStop());
 
-    // verify
-    rxmesh_geo.copy(*double_buffer[d], RXMESH::DEVICE, RXMESH::HOST);
-    T err = 0;
-    for (uint32_t i = 0; i < ground_truth.get_num_mesh_elements(); ++i) {
-        if (ground_truth(i) > EPS) {
-            err += std::abs(rxmesh_geo(i) - ground_truth(i)) / ground_truth(i);
-        }
-    }
-    err /= T(ground_truth.get_num_mesh_elements());
-    bool is_passed = (err < 10E-2);
+    rxmesh_geo->copy_from(*double_buffer[d], rxmesh::DEVICE, rxmesh::HOST);
 
-    RXMESH_TRACE("Geodesic_RXMesh took {} (ms) -- err= {} -- #iter= {}",
-                 timer.elapsed_millis(), err, iter);
+    RXMESH_TRACE("Geodesic_RXMesh took {} (ms) -- #iter= {}",
+                 timer.elapsed_millis(),
+                 iter);
 
-    // export_attribute_VTK("geo_rxmesh.vtk", Faces, Verts, false,
-    //                     rxmesh_geo.operator->(), rxmesh_geo.operator->());
+    // std::vector<T> geo(rxmesh.get_num_vertices());
+    // rxmesh.for_each_vertex(rxmesh::HOST, [&](const VertexHandle vh) {
+    //    uint32_t v_id = rxmesh.map_to_global(vh);
+    //    geo[v_id]     = (*rxmesh_geo)(vh);
+    //});
+    // export_attribute_VTK(
+    //    "geo_rxmesh.vtk", Faces, Verts, false, geo.data(), geo.data());
 
-    // Release allocation
-    rxmesh_geo.release();
-    rxmesh_geo_2.release();
-    input_coord.release();
     GPU_FREE(d_error);
 
     // Finalize report
     report.add_member("num_iter_taken", iter);
     TestData td;
-    td.test_name = "Geodesic";
+    td.test_name   = "Geodesic";
+    td.num_threads = launch_box.num_threads;
+    td.num_blocks  = launch_box.blocks;
+    td.dyn_smem    = launch_box.smem_bytes_dyn;
+    td.static_smem = launch_box.smem_bytes_static;
+    td.num_reg     = launch_box.num_registers_per_thread;
     td.time_ms.push_back(timer.elapsed_millis());
-    td.passed.push_back(is_passed);
     report.add_test(td);
     report.write(Arg.output_folder + "/rxmesh",
                  "Geodesic_RXMesh_" + extract_file_name(Arg.obj_file_name));
-
-    return is_passed;
 }
\ No newline at end of file
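
The commented-out export above hints at the read-back pattern for handle-indexed attributes. Spelled out as a fragment that reuses rxmesh and rxmesh_geo from geodesic_rxmesh(), and assumes the attribute already lives on the HOST (as it does after the copy_from call above):

    std::vector<T> geo_host(rxmesh.get_num_vertices(),
                            std::numeric_limits<T>::infinity());
    rxmesh.for_each_vertex(rxmesh::HOST, [&](const rxmesh::VertexHandle vh) {
        // map_to_global() recovers the input OBJ vertex id for this handle
        geo_host[rxmesh.map_to_global(vh)] = (*rxmesh_geo)(vh);
    });
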
diff --git a/apps/MCF/CMakeLists.txt b/apps/MCF/CMakeLists.txt
index 1eff0cad..ec1ac6c9 100644
--- a/apps/MCF/CMakeLists.txt
+++ b/apps/MCF/CMakeLists.txt
@@ -27,6 +27,8 @@ endif()
 
 set_target_properties( MCF PROPERTIES FOLDER "apps")
 
+set_property(TARGET MCF PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+
 source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "MCF" FILES ${SOURCE_LIST})
 
 
diff --git a/apps/MCF/benchmark.sh b/apps/MCF/benchmark.sh
index e531d908..f1dfb38b 100644
--- a/apps/MCF/benchmark.sh
+++ b/apps/MCF/benchmark.sh
@@ -1,5 +1,4 @@
 #!/bin/bash
-echo "This script re-generates RXMesh data in Figure 8(a) in the paper."
 echo "Please make sure to first compile the source code and then enter the input OBJ files directory."
 read -p "OBJ files directory (no trailing slash): " input_dir
 
@@ -16,7 +15,7 @@ device_id=0
 
 for file in $input_dir/*.obj; do 	 
     if [ -f "$file" ]; then
-		echo $exe -p -input "$file" -device_id $device_id
-             $exe -p -input "$file" -device_id $device_id
+		echo $exe -input "$file" -device_id $device_id
+         $exe -input "$file" -device_id $device_id
     fi 
 done
\ No newline at end of file
diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu
index 38c40128..abfec068 100644
--- a/apps/MCF/mcf.cu
+++ b/apps/MCF/mcf.cu
@@ -6,7 +6,7 @@
 
 #include "../common/openmesh_trimesh.h"
 #include "gtest/gtest.h"
-#include "rxmesh/rxmesh_attribute.h"
+#include "rxmesh/attribute.h"
 #include "rxmesh/rxmesh_static.h"
 #include "rxmesh/util/cuda_query.h"
 #include "rxmesh/util/export_tools.h"
@@ -15,18 +15,15 @@
 
 struct arg
 {
-    std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj";
-    std::string output_folder = STRINGIFY(OUTPUT_DIR);
-    uint32_t    device_id = 0;
-    float       time_step = 0.001;
-    float       cg_tolerance = 1e-6;
-    uint32_t    max_num_cg_iter = 1000;
+    std::string obj_file_name       = STRINGIFY(INPUT_DIR) "sphere3.obj";
+    std::string output_folder       = STRINGIFY(OUTPUT_DIR);
+    uint32_t    device_id           = 0;
+    float       time_step           = 0.001;
+    float       cg_tolerance        = 1e-6;
+    uint32_t    max_num_cg_iter     = 1000;
     bool        use_uniform_laplace = false;
     char**      argv;
     int         argc;
-    bool        shuffle = false;
-    bool        sort = false;
-
 } Arg;
 
 #include "mcf_openmesh.h"
@@ -35,17 +32,9 @@ struct arg
 
 TEST(App, MCF)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     using dataT = float;
 
-    if (Arg.shuffle) {
-        ASSERT_FALSE(Arg.sort) << " cannot shuffle and sort at the same time!";
-    }
-    if (Arg.sort) {
-        ASSERT_FALSE(Arg.shuffle)
-            << " cannot shuffle and sort at the same time!";
-    }
-
     // Select device
     cuda_query(Arg.device_id);
 
@@ -56,41 +45,24 @@ TEST(App, MCF)
 
     ASSERT_TRUE(import_obj(Arg.obj_file_name, Verts, Faces));
 
-    if (Arg.shuffle) {
-        shuffle_obj(Faces, Verts);
-    }
-
-    // Create RXMeshStatic instance. If Arg.sort is true, Faces and Verts will
-    // be sorted based on the patching happening inside RXMesh
-    RXMeshStatic<PATCH_SIZE> rxmesh_static(Faces, Verts, Arg.sort, false);
 
+    RXMeshStatic rxmesh(Faces, false);
 
-    // Since OpenMesh only accepts input as obj files, if the input mesh is
-    // shuffled or sorted, we have to write it to a temp file so that OpenMesh
-    // can pick it up
     TriMesh input_mesh;
-    if (Arg.sort || Arg.shuffle) {
-        export_obj(Faces, Verts, "temp.obj", false);
-        ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, "temp.obj"));
-    } else {
-        ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name));
-    }
-
-    //*** OpenMesh Impl
-    RXMESH::RXMeshAttribute<dataT> ground_truth;
-    mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth);
+    ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name));
 
-    //*** RXMesh Impl
-    mcf_rxmesh(rxmesh_static, Verts, ground_truth);
 
+    // OpenMesh Impl
+    std::vector<std::vector<dataT>> ground_truth(Verts);
+    mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth);
 
-    // Release allocation
-    ground_truth.release();
+    // RXMesh Impl
+    mcf_rxmesh(rxmesh, Verts, ground_truth);
 }
 
 int main(int argc, char** argv)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     Log::init();
 
     ::testing::InitGoogleTest(&argc, argv);
@@ -109,9 +81,7 @@ int main(int argc, char** argv)
                         " -dt:                Time step (delta t). Default is {} \n"
                         "                     Hint: should be between (0.001, 1) for cotan Laplace or between (1, 100) for uniform Laplace\n"
                         " -eps:               Conjugate gradient tolerance. Default is {}\n"
-                        " -max_cg_iter:       Conjugate gradient maximum number of iterations. Default is {}\n"
-                        " -s:                 Shuffle input. Default is false.\n"
-                        " -p:                 Sort input using patching output. Default is false\n"
+                        " -max_cg_iter:       Conjugate gradient maximum number of iterations. Default is {}\n"                        
                         " -device_id:         GPU device ID. Default is {}",
             Arg.obj_file_name, Arg.output_folder,  (Arg.use_uniform_laplace? "true" : "false"), Arg.time_step, Arg.cg_tolerance, Arg.max_num_cg_iter, Arg.device_id);
             // clang-format on
@@ -141,12 +111,6 @@ int main(int argc, char** argv)
         if (cmd_option_exists(argv, argc + argv, "-uniform_laplace")) {
             Arg.use_uniform_laplace = true;
         }
-        if (cmd_option_exists(argv, argc + argv, "-s")) {
-            Arg.shuffle = true;
-        }
-        if (cmd_option_exists(argv, argc + argv, "-p")) {
-            Arg.sort = true;
-        }
         if (cmd_option_exists(argv, argc + argv, "-device_id")) {
             Arg.device_id =
                 atoi(get_cmd_option(argv, argv + argc, "-device_id"));
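
The test above builds everything from two plain containers, Faces (triangle index triplets) and Verts (xyz per vertex). A minimal hedged sketch of that setup outside the test harness, using only calls that appear in this diff; the single triangle stands in for the imported OBJ and is purely illustrative:

    #include <vector>

    #include "rxmesh/rxmesh_static.h"
    #include "rxmesh/util/cuda_query.h"

    int main()
    {
        rxmesh::cuda_query(0);  // select the GPU, as the tests do via Arg.device_id

        // one triangle stands in for import_obj()'s Faces/Verts output
        std::vector<std::vector<uint32_t>> Faces = {{0, 1, 2}};
        std::vector<std::vector<float>>    Verts = {
            {0.f, 0.f, 0.f}, {1.f, 0.f, 0.f}, {0.f, 1.f, 0.f}};

        // second argument mirrors the `false` flag used in TEST(App, MCF) above
        rxmesh::RXMeshStatic rxmesh(Faces, false);

        // attach coordinates the way mcf_rxmesh()/geodesic_rxmesh() expect them
        auto coords = rxmesh.add_vertex_attribute(Verts, "coord");

        return 0;
    }
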
diff --git a/apps/MCF/mcf_openmesh.h b/apps/MCF/mcf_openmesh.h
index e7280b6b..ed9ef468 100644
--- a/apps/MCF/mcf_openmesh.h
+++ b/apps/MCF/mcf_openmesh.h
@@ -2,7 +2,6 @@
 #include "../common/openmesh_report.h"
 #include "../common/openmesh_trimesh.h"
 #include "mcf_util.h"
-#include "rxmesh/rxmesh_attribute.h"
 #include "rxmesh/util/timer.h"
 #include "rxmesh/util/vector.h"
 
@@ -10,24 +9,24 @@
  * axpy3()
  */
 template <typename T>
-void axpy3(const RXMESH::RXMeshAttribute<T>& X,
-           RXMESH::Vector<3, T>              alpha,
-           RXMESH::Vector<3, T>              beta,
-           RXMESH::RXMeshAttribute<T>&       Y,
-           const int                         num_omp_threads)
+void axpy3(const std::vector<std::vector<T>>& X,
+           const T                            alpha,
+           const T                            beta,
+           std::vector<std::vector<T>>&       Y,
+           const int                          num_omp_threads)
 {
     // Y = beta*Y + alpha*X
 
-    int size = static_cast<int>(X.get_num_mesh_elements());
+    int size = static_cast<int>(X.size());
 #pragma omp parallel for schedule(static) num_threads(num_omp_threads)
     for (int i = 0; i < size; ++i) {
-        Y(i, 0) *= beta[0];
-        Y(i, 1) *= beta[1];
-        Y(i, 2) *= beta[2];
+        Y[i][0] *= beta;
+        Y[i][1] *= beta;
+        Y[i][2] *= beta;
 
-        Y(i, 0) += alpha[0] * X(i, 0);
-        Y(i, 1) += alpha[1] * X(i, 1);
-        Y(i, 2) += alpha[2] * X(i, 2);
+        Y[i][0] += alpha * X[i][0];
+        Y[i][1] += alpha * X[i][1];
+        Y[i][2] += alpha * X[i][2];
     }
 }
 
@@ -35,26 +34,23 @@ void axpy3(const RXMESH::RXMeshAttribute<T>& X,
  * dot3()
  */
 template <typename T>
-void dot3(const RXMESH::RXMeshAttribute<T>& A,
-          const RXMESH::RXMeshAttribute<T>& B,
-          RXMESH::Vector<3, T>&             res,
-          const int                         num_omp_threads)
+T dot3(const std::vector<std::vector<T>>& A,
+       const std::vector<std::vector<T>>& B,
+       const int                          num_omp_threads)
 {
-    // creating temp variables because variable in 'reduction' clause/directive
-    // cannot have reference type
 
-    T   x_sum(0), y_sum(0), z_sum(0);
-    int size = static_cast<int>(A.get_num_mesh_elements());
-#pragma omp parallel for schedule(static) num_threads(num_omp_threads) reduction(+ : x_sum,y_sum,z_sum)
+    T   ret  = 0;
+    int size = static_cast<int>(A.size());
+#pragma omp parallel for schedule(static) num_threads(num_omp_threads) reduction(+ : ret)
     for (int i = 0; i < size; ++i) {
-        x_sum += A(i, 0) * B(i, 0);
-        y_sum += A(i, 1) * B(i, 1);
-        z_sum += A(i, 2) * B(i, 2);
+        T partial = 0;
+        for (size_t j = 0; j < A[i].size(); ++j) {
+            partial += A[i][j] * B[i][j];
+        }
+        ret += partial;
     }
 
-    res[0] = x_sum;
-    res[1] = y_sum;
-    res[2] = z_sum;
+    return ret;
 }
 
 /**
@@ -77,12 +73,12 @@ T partial_voronoi_area(const int      p_id,  // center
     assert((*q_it).idx() == q_id);
     assert((*r_it).idx() == r_id);
 
-    const RXMESH::Vector<3, T> p(mesh.point(*p_it)[0], mesh.point(*p_it)[1],
-                                 mesh.point(*p_it)[2]);
-    const RXMESH::Vector<3, T> q(mesh.point(*q_it)[0], mesh.point(*q_it)[1],
-                                 mesh.point(*q_it)[2]);
-    const RXMESH::Vector<3, T> r(mesh.point(*r_it)[0], mesh.point(*r_it)[1],
-                                 mesh.point(*r_it)[2]);
+    const rxmesh::Vector<3, T> p(
+        mesh.point(*p_it)[0], mesh.point(*p_it)[1], mesh.point(*p_it)[2]);
+    const rxmesh::Vector<3, T> q(
+        mesh.point(*q_it)[0], mesh.point(*q_it)[1], mesh.point(*q_it)[2]);
+    const rxmesh::Vector<3, T> r(
+        mesh.point(*r_it)[0], mesh.point(*r_it)[1], mesh.point(*r_it)[2]);
 
     return partial_voronoi_area(p, q, r);
 }
@@ -106,24 +102,24 @@ T edge_cotan_weight(const int      p_id,
     TriMesh::VertexIter q_it = mesh.vertices_begin() + q_id;
     TriMesh::VertexIter s_it = mesh.vertices_begin() + s_id;
 
-    const RXMESH::Vector<3, T> p(mesh.point(*p_it)[0], mesh.point(*p_it)[1],
-                                 mesh.point(*p_it)[2]);
-    const RXMESH::Vector<3, T> r(mesh.point(*r_it)[0], mesh.point(*r_it)[1],
-                                 mesh.point(*r_it)[2]);
-    const RXMESH::Vector<3, T> q(mesh.point(*q_it)[0], mesh.point(*q_it)[1],
-                                 mesh.point(*q_it)[2]);
-    const RXMESH::Vector<3, T> s(mesh.point(*s_it)[0], mesh.point(*s_it)[1],
-                                 mesh.point(*s_it)[2]);
+    const rxmesh::Vector<3, T> p(
+        mesh.point(*p_it)[0], mesh.point(*p_it)[1], mesh.point(*p_it)[2]);
+    const rxmesh::Vector<3, T> r(
+        mesh.point(*r_it)[0], mesh.point(*r_it)[1], mesh.point(*r_it)[2]);
+    const rxmesh::Vector<3, T> q(
+        mesh.point(*q_it)[0], mesh.point(*q_it)[1], mesh.point(*q_it)[2]);
+    const rxmesh::Vector<3, T> s(
+        mesh.point(*s_it)[0], mesh.point(*s_it)[1], mesh.point(*s_it)[2]);
 
     return edge_cotan_weight(p, r, q, s);
 }
 
 
 template <typename T>
-void mcf_matvec(TriMesh&                          mesh,
-                const RXMESH::RXMeshAttribute<T>& in,
-                RXMESH::RXMeshAttribute<T>&       out,
-                const int                         num_omp_threads)
+void mcf_matvec(TriMesh&                           mesh,
+                const std::vector<std::vector<T>>& in,
+                std::vector<std::vector<T>>&       out,
+                const int                          num_omp_threads)
 {
     // Matrix vector multiplication operation based on uniform Laplacian weight
     // defined in Equation 7 in Implicit Fairing of Irregular Meshes using
@@ -153,7 +149,7 @@ void mcf_matvec(TriMesh&                          mesh,
         TriMesh::VertexIter p_iter = mesh.vertices_begin() + p_id;
 
         // Off-diagonal entries
-        RXMESH::Vector<3, T> x(T(0));
+        rxmesh::Vector<3, T> x(T(0));
         T                    sum_e_weight(0);
 
         // vertex weight
@@ -170,7 +166,8 @@ void mcf_matvec(TriMesh&                          mesh,
         assert(s_iter.is_valid());
 
         for (TriMesh::VertexVertexIter r_iter = mesh.vv_iter(*p_iter);
-             r_iter.is_valid(); ++r_iter) {
+             r_iter.is_valid();
+             ++r_iter) {
 
             int r_id = (*r_iter).idx();
 
@@ -180,17 +177,18 @@ void mcf_matvec(TriMesh&                          mesh,
                 e_weight = 1;
             } else {
                 e_weight = std::max(
-                    T(0.0), edge_cotan_weight<T>(p_id, r_id, (*q_iter).idx(),
-                                                 (*s_iter).idx(), mesh));
+                    T(0.0),
+                    edge_cotan_weight<T>(
+                        p_id, r_id, (*q_iter).idx(), (*s_iter).idx(), mesh));
                 ++s_iter;
             }
 
             e_weight *= static_cast<T>(Arg.time_step);
             sum_e_weight += e_weight;
 
-            x[0] -= e_weight * in(r_id, 0);
-            x[1] -= e_weight * in(r_id, 1);
-            x[2] -= e_weight * in(r_id, 2);
+            x[0] -= e_weight * in[r_id][0];
+            x[1] -= e_weight * in[r_id][1];
+            x[2] -= e_weight * in[r_id][2];
 
             if (Arg.use_uniform_laplace) {
                 ++v_weight;
@@ -215,10 +213,10 @@ void mcf_matvec(TriMesh&                          mesh,
         assert(!std::isnan(v_weight));
         assert(!std::isinf(v_weight));
 
-        T diag = ((1.0 / v_weight) + sum_e_weight);
-        out(p_id, 0) = x[0] + diag * in(p_id, 0);
-        out(p_id, 1) = x[1] + diag * in(p_id, 1);
-        out(p_id, 2) = x[2] + diag * in(p_id, 2);
+        T diag       = ((1.0 / v_weight) + sum_e_weight);
+        out[p_id][0] = x[0] + diag * in[p_id][0];
+        out[p_id][1] = x[1] + diag * in[p_id][1];
+        out[p_id][2] = x[2] + diag * in[p_id][2];
     }
 }
 
@@ -227,16 +225,16 @@ void mcf_matvec(TriMesh&                          mesh,
  * cg()
  */
 template <typename T>
-void cg(TriMesh&                    mesh,
-        RXMESH::RXMeshAttribute<T>& X,
-        RXMESH::RXMeshAttribute<T>& B,
-        RXMESH::RXMeshAttribute<T>& R,
-        RXMESH::RXMeshAttribute<T>& P,
-        RXMESH::RXMeshAttribute<T>& S,
-        uint32_t&                   num_cg_iter_taken,
-        RXMESH::Vector<3, T>&       start_residual,
-        RXMESH::Vector<3, T>&       stop_residual,
-        const int                   num_omp_threads)
+void cg(TriMesh&                     mesh,
+        std::vector<std::vector<T>>& X,
+        std::vector<std::vector<T>>& B,
+        std::vector<std::vector<T>>& R,
+        std::vector<std::vector<T>>& P,
+        std::vector<std::vector<T>>& S,
+        uint32_t&                    num_cg_iter_taken,
+        T&                           start_residual,
+        T&                           stop_residual,
+        const int                    num_omp_threads)
 {
     // CG solver. Solve for the three coordinates simultaneously
 
@@ -248,112 +246,93 @@ void cg(TriMesh&                    mesh,
     // p = r
 #pragma omp parallel for schedule(static) num_threads(num_omp_threads)
     for (int i = 0; i < int(mesh.n_vertices()); ++i) {
-        R(i, 0) = B(i, 0) - S(i, 0);
-        R(i, 1) = B(i, 1) - S(i, 1);
-        R(i, 2) = B(i, 2) - S(i, 2);
+        R[i][0] = B[i][0] - S[i][0];
+        R[i][1] = B[i][1] - S[i][1];
+        R[i][2] = B[i][2] - S[i][2];
 
-        P(i, 0) = R(i, 0);
-        P(i, 1) = R(i, 1);
-        P(i, 2) = R(i, 2);
+        P[i][0] = R[i][0];
+        P[i][1] = R[i][1];
+        P[i][2] = R[i][2];
     }
 
     // delta_new = <r,r>
-    RXMESH::Vector<3, T> delta_new;
-    dot3(R, R, delta_new, num_omp_threads);
+    T delta_new = dot3(R, R, num_omp_threads);
 
     // delta_0 = delta_new
-    const RXMESH::Vector<3, T> delta_0(delta_new);
+    const T delta_0(delta_new);
 
     start_residual = delta_0;
-    const RXMESH::Vector<3, T> ones(1);
-    uint32_t                   iter = 0;
+    uint32_t iter  = 0;
     while (iter < Arg.max_num_cg_iter) {
         // s = Ap
         mcf_matvec(mesh, P, S, num_omp_threads);
 
         // alpha = delta_new / <s,p>
-        RXMESH::Vector<3, T> alpha;
-        dot3(S, P, alpha, num_omp_threads);
-        alpha = delta_new / alpha;
+        T alpha = dot3(S, P, num_omp_threads);
+        alpha   = delta_new / alpha;
 
 
         // x =  x + alpha*p
-        axpy3(P, alpha, ones, X, num_omp_threads);
+        axpy3(P, alpha, T(1), X, num_omp_threads);
 
         // r = r - alpha*s
-        axpy3(S, -alpha, ones, R, num_omp_threads);
+        axpy3(S, -alpha, T(1), R, num_omp_threads);
 
         // delta_old = delta_new
-        RXMESH::Vector<3, T> delta_old(delta_new);
+        T delta_old(delta_new);
 
         // delta_new = <r,r>
-        dot3(R, R, delta_new, num_omp_threads);
+        delta_new = dot3(R, R, num_omp_threads);
 
         // beta = delta_new/delta_old
-        RXMESH::Vector<3, T> beta(delta_new / delta_old);
+        T beta(delta_new / delta_old);
 
         // exit if error is getting too low across three coordinates
-        if (delta_new[0] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[0] &&
-            delta_new[1] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[1] &&
-            delta_new[2] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[2]) {
+        if (delta_new < Arg.cg_tolerance * Arg.cg_tolerance * delta_0) {
             break;
         }
 
         // p = beta*p + r
-        axpy3(R, ones, beta, P, num_omp_threads);
+        axpy3(R, T(1), beta, P, num_omp_threads);
 
         ++iter;
     }
     num_cg_iter_taken = iter;
-    stop_residual = delta_new;
+    stop_residual     = delta_new;
 }
 
 /**
  * implicit_smoothing()
  */
 template <typename T>
-void implicit_smoothing(TriMesh&                    mesh,
-                        RXMESH::RXMeshAttribute<T>& X,
-                        uint32_t&                   num_cg_iter_taken,
-                        float&                      time,
-                        RXMESH::Vector<3, T>&       start_residual,
-                        RXMESH::Vector<3, T>&       stop_residual,
-                        const int                   num_omp_threads)
+void implicit_smoothing(TriMesh&                     mesh,
+                        std::vector<std::vector<T>>& X,
+                        uint32_t&                    num_cg_iter_taken,
+                        float&                       time,
+                        T&                           start_residual,
+                        T&                           stop_residual,
+                        const int                    num_omp_threads)
 {
 
     for (TriMesh::VertexIter v_it = mesh.vertices_begin();
-         v_it != mesh.vertices_end(); ++v_it) {
+         v_it != mesh.vertices_end();
+         ++v_it) {
         ASSERT_FALSE(mesh.is_boundary(*v_it))
             << "OpenMesh MCF only takes watertight/closed mesh without "
                "boundaries";
     }
 
     // CG containers
-    RXMESH::RXMeshAttribute<T> B, R, P, S;
-
-    X.init(mesh.n_vertices(), 3u, RXMESH::HOST);
-    X.reset(0.0, RXMESH::HOST);
-
-    S.init(mesh.n_vertices(), 3u, RXMESH::HOST);
-    S.reset(0.0, RXMESH::HOST);
-
-    P.init(mesh.n_vertices(), 3u, RXMESH::HOST);
-    P.reset(0.0, RXMESH::HOST);
-
-    R.init(mesh.n_vertices(), 3u, RXMESH::HOST);
-    R.reset(0.0, RXMESH::HOST);
-
-    B.init(mesh.n_vertices(), 3u, RXMESH::HOST);
-    B.reset(0.0, RXMESH::HOST);
+    std::vector<std::vector<T>> B(X), R(X), P(X), S(X);
 
 #pragma omp parallel for
     for (uint32_t v_id = 0; v_id < mesh.n_vertices(); ++v_id) {
         TriMesh::VertexIter v_iter = mesh.vertices_begin() + v_id;
 
         // LHS
-        X(v_id, 0) = mesh.point(*v_iter)[0];
-        X(v_id, 1) = mesh.point(*v_iter)[1];
-        X(v_id, 2) = mesh.point(*v_iter)[2];
+        X[v_id][0] = mesh.point(*v_iter)[0];
+        X[v_id][1] = mesh.point(*v_iter)[1];
+        X[v_id][2] = mesh.point(*v_iter)[2];
 
         // RHS
         T v_weight = 1;
@@ -363,9 +342,9 @@ void implicit_smoothing(TriMesh&                    mesh,
         }
         // will fix it later for cotan weight
 
-        B(v_id, 0) = X(v_id, 0) * v_weight;
-        B(v_id, 1) = X(v_id, 1) * v_weight;
-        B(v_id, 2) = X(v_id, 2) * v_weight;
+        B[v_id][0] = X[v_id][0] * v_weight;
+        B[v_id][1] = X[v_id][1] * v_weight;
+        B[v_id][2] = X[v_id][2] * v_weight;
     }
 
     if (!Arg.use_uniform_laplace) {
@@ -381,30 +360,39 @@ void implicit_smoothing(TriMesh&                    mesh,
             assert(q_iter.is_valid());
 
             for (TriMesh::VertexVertexIter vv_iter = mesh.vv_iter(*v_iter);
-                 vv_iter.is_valid(); ++vv_iter) {
+                 vv_iter.is_valid();
+                 ++vv_iter) {
 
-                T tri_area = partial_voronoi_area<T>(v_id, (*q_iter).idx(),
-                                                     (*vv_iter).idx(), mesh);
+                T tri_area = partial_voronoi_area<T>(
+                    v_id, (*q_iter).idx(), (*vv_iter).idx(), mesh);
 
                 v_weight += (tri_area > 0) ? tri_area : 0;
 
                 q_iter++;
                 assert(q_iter == vv_iter);
             }
-            v_weight = 0.5 / v_weight;
-            B(v_id, 0) = X(v_id, 0) / v_weight;
-            B(v_id, 1) = X(v_id, 1) / v_weight;
-            B(v_id, 2) = X(v_id, 2) / v_weight;
+            v_weight   = 0.5 / v_weight;
+            B[v_id][0] = X[v_id][0] / v_weight;
+            B[v_id][1] = X[v_id][1] / v_weight;
+            B[v_id][2] = X[v_id][2] / v_weight;
         }
     }
 
     num_cg_iter_taken = 0;
 
     // solve
-    RXMESH::CPUTimer timer;
+    rxmesh::CPUTimer timer;
     timer.start();
 
-    cg(mesh, X, B, R, P, S, num_cg_iter_taken, start_residual, stop_residual,
+    cg(mesh,
+       X,
+       B,
+       R,
+       P,
+       S,
+       num_cg_iter_taken,
+       start_residual,
+       stop_residual,
        num_omp_threads);
 
     timer.stop();
@@ -412,9 +400,9 @@ void implicit_smoothing(TriMesh&                    mesh,
 }
 
 template <typename T>
-void mcf_openmesh(const int                   num_omp_threads,
-                  TriMesh&                    input_mesh,
-                  RXMESH::RXMeshAttribute<T>& smoothed_coord)
+void mcf_openmesh(const int                    num_omp_threads,
+                  TriMesh&                     input_mesh,
+                  std::vector<std::vector<T>>& smoothed_coord)
 {
     // Report
     OpenMeshReport report("MCF_OpenMesh");
@@ -424,13 +412,6 @@ void mcf_openmesh(const int                   num_omp_threads,
     std::string method =
         "OpenMesh " + std::to_string(num_omp_threads) + " Core";
     report.add_member("method", method);
-    std::string order = "default";
-    if (Arg.shuffle) {
-        order = "shuffle";
-    } else if (Arg.sort) {
-        order = "sorted";
-    }
-    report.add_member("input_order", order);
     report.add_member("time_step", Arg.time_step);
     report.add_member("cg_tolerance", Arg.cg_tolerance);
     report.add_member("use_uniform_laplace", Arg.use_uniform_laplace);
@@ -438,26 +419,33 @@ void mcf_openmesh(const int                   num_omp_threads,
 
 
     // implicit smoothing
-    uint32_t             num_cg_iter_taken = 0;
-    float                time = 0;
-    RXMESH::Vector<3, T> start_residual;
-    RXMESH::Vector<3, T> stop_residual;
-
-    implicit_smoothing(input_mesh, smoothed_coord, num_cg_iter_taken, time,
-                       start_residual, stop_residual, num_omp_threads);
+    uint32_t num_cg_iter_taken = 0;
+    float    time              = 0;
+    T        start_residual;
+    T        stop_residual;
+
+    implicit_smoothing(input_mesh,
+                       smoothed_coord,
+                       num_cg_iter_taken,
+                       time,
+                       start_residual,
+                       stop_residual,
+                       num_omp_threads);
 
     RXMESH_TRACE(
         "mcf_openmesh() took {} (ms) and {} iterations (i.e., {} ms/iter) ",
-        time, num_cg_iter_taken, time / float(num_cg_iter_taken));
+        time,
+        num_cg_iter_taken,
+        time / float(num_cg_iter_taken));
 
 
     // write output
     //#pragma omp parallel for
     //    for (int v_id = 0; v_id < int(input_mesh.n_vertices()); ++v_id) {
     //        TriMesh::VertexIter v_iter = input_mesh.vertices_begin() + v_id;
-    //        input_mesh.point(*v_iter)[0] = smoothed_coord(v_id, 0);
-    //        input_mesh.point(*v_iter)[1] = smoothed_coord(v_id, 1);
-    //        input_mesh.point(*v_iter)[2] = smoothed_coord(v_id, 2);
+    //        input_mesh.point(*v_iter)[0] = smoothed_coord[v_id][0];
+    //        input_mesh.point(*v_iter)[1] = smoothed_coord[v_id][1];
+    //        input_mesh.point(*v_iter)[2] = smoothed_coord[v_id][2];
     //    }
     //    std::string fn = STRINGIFY(OUTPUT_DIR) "mcf_openmesh.obj";
     //    if (!OpenMesh::IO::write_mesh(input_mesh, fn)) {
@@ -465,17 +453,17 @@ void mcf_openmesh(const int                   num_omp_threads,
     //    }
 
     // Finalize report
-    report.add_member("start_residual", to_string(start_residual));
-    report.add_member("end_residual", to_string(stop_residual));
+    report.add_member("start_residual", start_residual);
+    report.add_member("end_residual", stop_residual);
     report.add_member("num_cg_iter_taken", num_cg_iter_taken);
     report.add_member("total_time (ms)", time);
-    RXMESH::TestData td;
-    td.test_name = "MCF";
+    rxmesh::TestData td;
+    td.test_name   = "MCF";
     td.num_threads = num_omp_threads;
     td.time_ms.push_back(time / float(num_cg_iter_taken));
     td.passed.push_back(true);
     report.add_test(td);
     report.write(
         Arg.output_folder + "/openmesh",
-        "MCF_OpenMesh_" + RXMESH::extract_file_name(Arg.obj_file_name));
+        "MCF_OpenMesh_" + rxmesh::extract_file_name(Arg.obj_file_name));
 }
\ No newline at end of file
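
Read together with the dot3()/axpy3() changes above (scalar alpha and beta, one combined dot product), the CG loop now works with a single residual scalar. The operator it applies is the one spelled out in mcf_matvec(), roughly

    (A x)_p = (1/w_p + \sum_{r \in N(p)} w_{pr}) x_p - \sum_{r \in N(p)} w_{pr} x_r,
    w_{pr} = \Delta t \cdot \max(0, \text{cotan weight})  (or \Delta t for the uniform Laplacian),

where w_p is the vertex weight (Voronoi-area based for cotan, valence based for uniform), and the loop exits once \delta_{new} < \varepsilon^2 \delta_0 with \delta = \langle r, r \rangle accumulated over all three coordinates.
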
diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h
index 37c23547..e3681c8b 100644
--- a/apps/MCF/mcf_rxmesh.h
+++ b/apps/MCF/mcf_rxmesh.h
@@ -2,19 +2,56 @@
 
 #include <cuda_profiler_api.h>
 #include "mcf_rxmesh_kernel.cuh"
-#include "rxmesh/rxmesh_attribute.h"
+#include "rxmesh/attribute.h"
+#include "rxmesh/reduce_handle.h"
 #include "rxmesh/rxmesh_static.h"
 #include "rxmesh/util/report.h"
 #include "rxmesh/util/timer.h"
 #include "rxmesh/util/vector.h"
 
-
-template <typename T, uint32_t patchSize>
-void mcf_rxmesh(RXMESH::RXMeshStatic<patchSize>&   rxmesh_static,
+template <typename T>
+void axpy(rxmesh::RXMeshStatic&             rxmesh,
+          rxmesh::VertexAttribute<T>&       y,
+          const rxmesh::VertexAttribute<T>& x,
+          const T                           alpha,
+          const T                           beta,
+          cudaStream_t                      stream = NULL)
+{
+    // Y = alpha*X + beta*Y
+    rxmesh.for_each_vertex(
+        rxmesh::DEVICE,
+        [y, x, alpha, beta] __device__(const rxmesh::VertexHandle vh) {
+            for (uint32_t i = 0; i < 3; ++i) {
+                y(vh, i) = alpha * x(vh, i) + beta * y(vh, i);
+            }
+        });
+}
+
+template <typename T>
+void init_PR(rxmesh::RXMeshStatic&             rxmesh,
+             const rxmesh::VertexAttribute<T>& B,
+             const rxmesh::VertexAttribute<T>& S,
+             rxmesh::VertexAttribute<T>&       R,
+             rxmesh::VertexAttribute<T>&       P)
+{
+    rxmesh.for_each_vertex(
+        rxmesh::DEVICE, [B, S, R, P] __device__(const rxmesh::VertexHandle vh) {
+            R(vh, 0) = B(vh, 0) - S(vh, 0);
+            R(vh, 1) = B(vh, 1) - S(vh, 1);
+            R(vh, 2) = B(vh, 2) - S(vh, 2);
+
+            P(vh, 0) = R(vh, 0);
+            P(vh, 1) = R(vh, 1);
+            P(vh, 2) = R(vh, 2);
+        });
+}
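
The axpy()/init_PR() helpers above push simple per-vertex work through for_each_vertex() with a __device__ lambda; the attributes are captured by value (they are also passed by value to the kernels in this patch), so the lambda evidently writes through a lightweight handle to the same storage. A hypothetical helper in the same style, for illustration only:

    template <typename T>
    void scale(rxmesh::RXMeshStatic&       rxmesh,
               rxmesh::VertexAttribute<T>& y,
               const T                     s)
    {
        // y = s * y, component-wise over the three coordinates
        rxmesh.for_each_vertex(
            rxmesh::DEVICE, [y, s] __device__(const rxmesh::VertexHandle vh) {
                for (uint32_t i = 0; i < 3; ++i) {
                    y(vh, i) = s * y(vh, i);
                }
            });
    }
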
+
+template <typename T>
+void mcf_rxmesh(rxmesh::RXMeshStatic&              rxmesh,
                 const std::vector<std::vector<T>>& Verts,
-                const RXMESH::RXMeshAttribute<T>&  ground_truth)
+                const std::vector<std::vector<T>>& ground_truth)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     constexpr uint32_t blockThreads = 256;
 
     // Report
@@ -22,122 +59,123 @@ void mcf_rxmesh(RXMESH::RXMeshStatic<patchSize>&   rxmesh_static,
     report.command_line(Arg.argc, Arg.argv);
     report.device();
     report.system();
-    report.model_data(Arg.obj_file_name, rxmesh_static);
+    report.model_data(Arg.obj_file_name, rxmesh);
     report.add_member("method", std::string("RXMesh"));
-    std::string order = "default";
-    if (Arg.shuffle) {
-        order = "shuffle";
-    } else if (Arg.sort) {
-        order = "sorted";
-    }
-    report.add_member("input_order", order);
     report.add_member("time_step", Arg.time_step);
     report.add_member("cg_tolerance", Arg.cg_tolerance);
     report.add_member("use_uniform_laplace", Arg.use_uniform_laplace);
     report.add_member("max_num_cg_iter", Arg.max_num_cg_iter);
     report.add_member("blockThreads", blockThreads);
 
-    ASSERT_TRUE(rxmesh_static.is_closed())
+    ASSERT_TRUE(rxmesh.is_closed())
         << "mcf_rxmesh only takes watertight/closed mesh without boundaries";
 
     // Different attributes used throughout the application
-    RXMeshAttribute<T> input_coord;
-    input_coord.set_name("coord");
-    input_coord.init(Verts.size(), 3u, RXMESH::LOCATION_ALL);
-    for (uint32_t i = 0; i < Verts.size(); ++i) {
-        for (uint32_t j = 0; j < Verts[i].size(); ++j) {
-            input_coord(i, j) = Verts[i][j];
-        }
-    }
-    input_coord.change_layout(RXMESH::HOST);
-    input_coord.move(RXMESH::HOST, RXMESH::DEVICE);
+    auto input_coord =
+        rxmesh.add_vertex_attribute<T>(Verts, "coord", rxmesh::LOCATION_ALL);
 
     // S in CG
-    RXMeshAttribute<T> S;
-    S.set_name("S");
-    S.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::DEVICE, RXMESH::SoA);
-    S.reset(0.0, RXMESH::DEVICE);
+    auto S =
+        rxmesh.add_vertex_attribute<T>("S", 3, rxmesh::DEVICE, rxmesh::SoA);
+    S->reset(0.0, rxmesh::DEVICE);
 
     // P in CG
-    RXMeshAttribute<T> P;
-    P.set_name("P");
-    P.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::DEVICE, RXMESH::SoA);
-    P.reset(0.0, RXMESH::DEVICE);
+    auto P =
+        rxmesh.add_vertex_attribute<T>("P", 3, rxmesh::DEVICE, rxmesh::SoA);
+    P->reset(0.0, rxmesh::DEVICE);
 
     // R in CG
-    RXMeshAttribute<T> R;
-    R.set_name("P");
-    R.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::DEVICE, RXMESH::SoA);
-    R.reset(0.0, RXMESH::DEVICE);
+    auto R =
+        rxmesh.add_vertex_attribute<T>("R", 3, rxmesh::DEVICE, rxmesh::SoA);
+    R->reset(0.0, rxmesh::DEVICE);
 
     // B in CG
-    RXMeshAttribute<T> B;
-    B.set_name("B");
-    B.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::DEVICE, RXMESH::SoA);
-    B.reset(0.0, RXMESH::DEVICE);
-
-    // X in CG
-    RXMeshAttribute<T> X;
-    X.set_name("X");
-    X.init(rxmesh_static.get_num_vertices(), 3u, RXMESH::LOCATION_ALL,
-           RXMESH::SoA);
-    X.copy(input_coord, RXMESH::HOST, RXMESH::DEVICE);
+    auto B =
+        rxmesh.add_vertex_attribute<T>("B", 3, rxmesh::DEVICE, rxmesh::SoA);
+    B->reset(0.0, rxmesh::DEVICE);
+
+    // X in CG (the output)
+    auto X = rxmesh.add_vertex_attribute<T>(Verts, "X", rxmesh::LOCATION_ALL);
+
+    ReduceHandle<T> reduce_handle(*X);
 
     // RXMesh launch box
-    LaunchBox<blockThreads> launch_box;
-    rxmesh_static.prepare_launch_box(RXMESH::Op::VV, launch_box, false, true);
+    LaunchBox<blockThreads> launch_box_init_B;
+    LaunchBox<blockThreads> launch_box_matvec;
+    rxmesh.prepare_launch_box(rxmesh::Op::VV,
+                              launch_box_init_B,
+                              (void*)init_B<T, blockThreads>,
+                              true);
+    rxmesh.prepare_launch_box(rxmesh::Op::VV,
+                              launch_box_matvec,
+                              (void*)rxmesh_matvec<T, blockThreads>,
+                              true);
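+    // Note: a separate launch box per kernel, since the launch configuration
+    // and dynamic shared memory are derived from the specific kernel pointer
+    // passed to prepare_launch_box().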
 
 
     // init kernel to initialize RHS (B)
-    init_B<T, blockThreads>
-        <<<launch_box.blocks, blockThreads, launch_box.smem_bytes_dyn>>>(
-            rxmesh_static.get_context(), X, B, Arg.use_uniform_laplace);
+    init_B<T, blockThreads><<<launch_box_init_B.blocks,
+                              launch_box_init_B.num_threads,
+                              launch_box_init_B.smem_bytes_dyn>>>(
+        rxmesh.get_context(), *X, *B, Arg.use_uniform_laplace);
 
     // CG scalars
-    Vector<3, T> alpha(T(0)), beta(T(0)), delta_new(T(0)), delta_old(T(0)),
-        ones(T(1));
+    T alpha(0), beta(0), delta_new(0), delta_old(0);
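+
+    // What the CG loop below implements:
+    //   alpha = <r,r> / <Ap,p>;   x += alpha*p;   r -= alpha*(Ap)
+    //   beta  = <r_new,r_new> / <r_old,r_old>;    p = r + beta*p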
 
     GPUTimer timer;
     timer.start();
 
     // s = Ax
-    mcf_matvec<T, blockThreads>
-        <<<launch_box.blocks, blockThreads, launch_box.smem_bytes_dyn>>>(
-            rxmesh_static.get_context(), input_coord, X, S,
-            Arg.use_uniform_laplace, Arg.time_step);
+    rxmesh_matvec<T, blockThreads>
+        <<<launch_box_matvec.blocks,
+           launch_box_matvec.num_threads,
+           launch_box_matvec.smem_bytes_dyn>>>(rxmesh.get_context(),
+                                               *input_coord,
+                                               *X,
+                                               *S,
+                                               Arg.use_uniform_laplace,
+                                               Arg.time_step);
 
     // r = b - s = b - Ax
-    // p=r
-    const uint32_t num_blocks =
-        DIVIDE_UP(rxmesh_static.get_num_vertices(), blockThreads);
-    init_PR<T><<<num_blocks, blockThreads>>>(rxmesh_static.get_num_vertices(),
-                                             B, S, R, P);
+    // p = r
+    init_PR(rxmesh, *B, *S, *R, *P);
+
 
     // delta_new = <r,r>
-    R.reduce(delta_new, RXMESH::NORM2);
+    delta_new = reduce_handle.norm2(*R);
+    delta_new *= delta_new;
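+    // (norm2() returns ||r||, hence the squaring to recover <r,r>)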
 
-    const Vector<3, T> delta_0(delta_new);
+    const T delta_0(delta_new);
 
     uint32_t num_cg_iter_taken = 0;
 
+    GPUTimer matvec_timer;
+    float    matvec_time = 0;
+
+
     while (num_cg_iter_taken < Arg.max_num_cg_iter) {
         // s = Ap
-
-        mcf_matvec<T, blockThreads>
-            <<<launch_box.blocks, blockThreads, launch_box.smem_bytes_dyn>>>(
-                rxmesh_static.get_context(), input_coord, P, S,
-                Arg.use_uniform_laplace, Arg.time_step);
+        matvec_timer.start();
+        rxmesh_matvec<T, blockThreads>
+            <<<launch_box_matvec.blocks,
+               launch_box_matvec.num_threads,
+               launch_box_matvec.smem_bytes_dyn>>>(rxmesh.get_context(),
+                                                   *input_coord,
+                                                   *P,
+                                                   *S,
+                                                   Arg.use_uniform_laplace,
+                                                   Arg.time_step);
+        matvec_timer.stop();
+        matvec_time += matvec_timer.elapsed_millis();
 
         // alpha = delta_new / <s,p>
-        S.reduce(alpha, RXMESH::DOT, &P);
-
+        alpha = reduce_handle.dot(*S, *P);
         alpha = delta_new / alpha;
 
-        // x =  x + alpha*p
-        X.axpy(P, alpha, ones);
+        // x =  alpha*p + x
+        axpy(rxmesh, *X, *P, alpha, 1.f);
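+        // (here axpy(rxmesh, y, x, a, b) performs y = a*x + b*y for every
+        // vertex)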
 
-        // r = r - alpha*s
-        R.axpy(S, -alpha, ones);
+        // r = - alpha*s + r
+        axpy(rxmesh, *R, *S, -alpha, 1.f);
 
 
         // delta_old = delta_new
@@ -146,15 +184,14 @@ void mcf_rxmesh(RXMESH::RXMeshStatic<patchSize>&   rxmesh_static,
 
 
         // delta_new = <r,r>
-        R.reduce(delta_new, RXMESH::NORM2);
+        delta_new = reduce_handle.norm2(*R);
+        delta_new *= delta_new;
 
         CUDA_ERROR(cudaStreamSynchronize(0));
 
 
-        // exit if error is getting too low across three coordinates
+        // exit when ||r|| drops below cg_tolerance * ||r_0||
-        if (delta_new[0] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[0] &&
-            delta_new[1] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[1] &&
-            delta_new[2] < Arg.cg_tolerance * Arg.cg_tolerance * delta_0[2]) {
+        if (delta_new < Arg.cg_tolerance * Arg.cg_tolerance * delta_0) {
             break;
         }
 
@@ -162,7 +199,7 @@ void mcf_rxmesh(RXMESH::RXMeshStatic<patchSize>&   rxmesh_static,
         beta = delta_new / delta_old;
 
         // p = beta*p + r
-        P.axpy(R, ones, beta);
+        axpy(rxmesh, *P, *R, 1.f, beta);
 
         ++num_cg_iter_taken;
 
@@ -176,50 +213,52 @@ void mcf_rxmesh(RXMESH::RXMeshStatic<patchSize>&   rxmesh_static,
 
 
     RXMESH_TRACE(
-        "mcf_rxmesh() took {} (ms) and {} iterations (i.e., {} ms/iter) ",
-        timer.elapsed_millis(), num_cg_iter_taken,
-        timer.elapsed_millis() / float(num_cg_iter_taken));
+        "mcf_rxmesh() took {} (ms) and {} iterations (i.e., {} ms/iter), "
+        "mat_vec time {} (ms) (i.e., {} ms/iter)",
+        timer.elapsed_millis(),
+        num_cg_iter_taken,
+        timer.elapsed_millis() / float(num_cg_iter_taken),
+        matvec_time,
+        matvec_time / float(num_cg_iter_taken));
 
     // move output to host
-    X.move(RXMESH::DEVICE, RXMESH::HOST);
+    X->move(rxmesh::DEVICE, rxmesh::HOST);
 
     // output to obj
-    // rxmesh_static.exportOBJ("mcf_rxmesh.obj",
-    //                        [&X](uint32_t i, uint32_t j) { return X(i, j); });
+    // rxmesh.export_obj("mcf_rxmesh.obj", *X);
 
     // Verify
+    const T tol    = 0.001;
     bool    passed = true;
-    const T tol = 0.001;
-    for (uint32_t v = 0; v < X.get_num_mesh_elements(); ++v) {
-        if (std::fabs(X(v, 0) - ground_truth(v, 0)) >
-                tol * std::fabs(ground_truth(v, 0)) ||
-            std::fabs(X(v, 1) - ground_truth(v, 1)) >
-                tol * std::fabs(ground_truth(v, 1)) ||
-            std::fabs(X(v, 2) - ground_truth(v, 2)) >
-                tol * std::fabs(ground_truth(v, 2))) {
-            passed = false;
-            break;
+    rxmesh.for_each_vertex(HOST, [&](const VertexHandle& vh) {
+        uint32_t v_id = rxmesh.map_to_global(vh);
+
+        for (uint32_t i = 0; i < 3; ++i) {
+            if (std::abs(((*X)(vh, i) - ground_truth[v_id][i]) /
+                         ground_truth[v_id][i]) > tol) {
+                passed = false;
+                break;
+            }
         }
-    }
+    });
 
     EXPECT_TRUE(passed);
-    // Release allocation
-    X.release();
-    B.release();
-    S.release();
-    R.release();
-    P.release();
-    input_coord.release();
 
     // Finalize report
-    report.add_member("start_residual", to_string(delta_0));
-    report.add_member("end_residual", to_string(delta_new));
+    report.add_member("start_residual", delta_0);
+    report.add_member("end_residual", delta_new);
     report.add_member("num_cg_iter_taken", num_cg_iter_taken);
     report.add_member("total_time (ms)", timer.elapsed_millis());
+    report.add_member("matvec_time (ms)", matvec_time);
     TestData td;
-    td.test_name = "MCF";
-    td.time_ms.push_back(timer.elapsed_millis() / float(num_cg_iter_taken));
+    td.test_name   = "MCF";
+    td.num_threads = launch_box_matvec.num_threads;
+    td.num_blocks  = launch_box_matvec.blocks;
+    td.dyn_smem    = launch_box_matvec.smem_bytes_dyn;
+    td.static_smem = launch_box_matvec.smem_bytes_static;
+    td.num_reg     = launch_box_matvec.num_registers_per_thread;
     td.passed.push_back(passed);
+    td.time_ms.push_back(timer.elapsed_millis() / float(num_cg_iter_taken));
     report.add_test(td);
     report.write(Arg.output_folder + "/rxmesh",
                  "MCF_RXMesh_" + extract_file_name(Arg.obj_file_name));
diff --git a/apps/MCF/mcf_rxmesh_kernel.cuh b/apps/MCF/mcf_rxmesh_kernel.cuh
index 6e8cab7f..7439be1e 100644
--- a/apps/MCF/mcf_rxmesh_kernel.cuh
+++ b/apps/MCF/mcf_rxmesh_kernel.cuh
@@ -1,50 +1,25 @@
 #pragma once
 
 #include "mcf_util.h"
-#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh"
-#include "rxmesh/rxmesh_attribute.h"
-#include "rxmesh/rxmesh_context.h"
-#include "rxmesh/util/math.h"
+#include "rxmesh/attribute.h"
+#include "rxmesh/context.h"
+#include "rxmesh/kernels/query_dispatcher.cuh"
 #include "rxmesh/util/vector.h"
 
-/**
- * init_PR()
- */
-template <typename T>
-__global__ static void init_PR(const uint32_t                   num_vertices,
-                               const RXMESH::RXMeshAttribute<T> B,
-                               const RXMESH::RXMeshAttribute<T> S,
-                               RXMESH::RXMeshAttribute<T>       R,
-                               RXMESH::RXMeshAttribute<T>       P)
-{
-    // r = b-s = b - Ax
-    // p= r
-    uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx < num_vertices) {
-        R(idx, 0) = B(idx, 0) - S(idx, 0);
-        R(idx, 1) = B(idx, 1) - S(idx, 1);
-        R(idx, 2) = B(idx, 2) - S(idx, 2);
-
-        P(idx, 0) = R(idx, 0);
-        P(idx, 1) = R(idx, 1);
-        P(idx, 2) = R(idx, 2);
-    }
-}
-
 /**
  * edge_cotan_weight()
  */
 template <typename T>
 __device__ __forceinline__ T
-edge_cotan_weight(const uint32_t                    p_id,
-                  const uint32_t                    r_id,
-                  const uint32_t                    q_id,
-                  const uint32_t                    s_id,
-                  const RXMESH::RXMeshAttribute<T>& X)
+edge_cotan_weight(const rxmesh::VertexHandle&       p_id,
+                  const rxmesh::VertexHandle&       r_id,
+                  const rxmesh::VertexHandle&       q_id,
+                  const rxmesh::VertexHandle&       s_id,
+                  const rxmesh::VertexAttribute<T>& X)
 {
     // Get the edge weight between the two vertices p-r where
     // q and s composes the diamond around p-r
-    using namespace RXMESH;
+    using namespace rxmesh;
 
     const Vector<3, T> p(X(p_id, 0), X(p_id, 1), X(p_id, 2));
     const Vector<3, T> r(X(r_id, 0), X(r_id, 1), X(r_id, 2));
@@ -59,14 +34,14 @@ edge_cotan_weight(const uint32_t                    p_id,
  */
 template <typename T>
 __device__ __forceinline__ T
-partial_voronoi_area(const uint32_t                    p_id,  // center
-                     const uint32_t                    q_id,  // before center
-                     const uint32_t                    r_id,  // after center
-                     const RXMESH::RXMeshAttribute<T>& X)
+partial_voronoi_area(const rxmesh::VertexHandle&       p_id,  // center
+                     const rxmesh::VertexHandle&       q_id,  // before center
+                     const rxmesh::VertexHandle&       r_id,  // after center
+                     const rxmesh::VertexAttribute<T>& X)
 {
     // compute partial Voronoi area of the center vertex that is associated with
     // the triangle p->q->r (oriented ccw)
-    using namespace RXMESH;
+    using namespace rxmesh;
 
     const Vector<3, T> p(X(p_id, 0), X(p_id, 1), X(p_id, 2));
     const Vector<3, T> q(X(q_id, 0), X(q_id, 1), X(q_id, 2));
@@ -79,31 +54,30 @@ partial_voronoi_area(const uint32_t                    p_id,  // center
  * init_B()
  */
 template <typename T, uint32_t blockThreads>
-__launch_bounds__(blockThreads) __global__
-    static void init_B(const RXMESH::RXMeshContext      context,
-                       const RXMESH::RXMeshAttribute<T> X,
-                       RXMESH::RXMeshAttribute<T>       B,
-                       const bool                       use_uniform_laplace)
+__global__ static void init_B(const rxmesh::Context            context,
+                              const rxmesh::VertexAttribute<T> X,
+                              rxmesh::VertexAttribute<T>       B,
+                              const bool use_uniform_laplace)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
 
-    auto init_lambda = [&](uint32_t p_id, RXMeshIterator& iter) {
+    auto init_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) {
         if (use_uniform_laplace) {
             const T valence = static_cast<T>(iter.size());
-            B(p_id, 0) = X(p_id, 0) * valence;
-            B(p_id, 1) = X(p_id, 1) * valence;
-            B(p_id, 2) = X(p_id, 2) * valence;
+            B(p_id, 0)      = X(p_id, 0) * valence;
+            B(p_id, 1)      = X(p_id, 1) * valence;
+            B(p_id, 2)      = X(p_id, 2) * valence;
         } else {
 
             // using Laplace weights
             T v_weight = 0;
 
             // this is the last vertex in the one-ring (before r_id)
-            uint32_t q_id = iter.back();
+            VertexHandle q_id = iter.back();
 
             for (uint32_t v = 0; v < iter.size(); ++v) {
                 // the current one ring vertex
-                uint32_t r_id = iter[v];
+                VertexHandle r_id = iter[v];
 
                 T tri_area = partial_voronoi_area(p_id, q_id, r_id, X);
 
@@ -121,21 +95,20 @@ __launch_bounds__(blockThreads) __global__
 
     // With uniform Laplacian, we just need the valence, thus we
     // call query_block_dispatcher and set oriented to false
-    query_block_dispatcher<Op::VV, blockThreads>(context, init_lambda,
-                                                 !use_uniform_laplace);
+    query_block_dispatcher<Op::VV, blockThreads>(
+        context, init_lambda, !use_uniform_laplace);
 }
 
 /**
  * mcf_matvec()
  */
 template <typename T, uint32_t blockThreads>
-__launch_bounds__(blockThreads) __global__
-    static void mcf_matvec(const RXMESH::RXMeshContext      context,
-                           const RXMESH::RXMeshAttribute<T> coords,
-                           const RXMESH::RXMeshAttribute<T> in,
-                           RXMESH::RXMeshAttribute<T>       out,
-                           const bool                       use_uniform_laplace,
-                           const T                          time_step)
+__global__ static void rxmesh_matvec(const rxmesh::Context            context,
+                                     const rxmesh::VertexAttribute<T> coords,
+                                     const rxmesh::VertexAttribute<T> in,
+                                     rxmesh::VertexAttribute<T>       out,
+                                     const bool use_uniform_laplace,
+                                     const T    time_step)
 {
 
     // To compute the vertex cotan weight, we use the following configuration
@@ -150,9 +123,9 @@ __launch_bounds__(blockThreads) __global__
            \ |  /
              p
     */
-    using namespace RXMESH;
+    using namespace rxmesh;
 
-    auto matvec_lambda = [&](uint32_t p_id, RXMeshIterator& iter) {
+    auto matvec_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) {
         T sum_e_weight(0);
 
         Vector<3, T> x(T(0));
@@ -161,18 +134,19 @@ __launch_bounds__(blockThreads) __global__
         T v_weight(0);
 
         // this is the last vertex in the one-ring (before r_id)
-        uint32_t q_id = iter.back();
+        VertexHandle q_id = iter.back();
 
         for (uint32_t v = 0; v < iter.size(); ++v) {
             // the current one ring vertex
-            uint32_t r_id = iter[v];
+            VertexHandle r_id = iter[v];
 
             T e_weight = 0;
             if (use_uniform_laplace) {
                 e_weight = 1;
             } else {
                 // the second vertex in the one ring (after r_id)
-                uint32_t s_id = (v == iter.size() - 1) ? iter[0] : iter[v + 1];
+                VertexHandle s_id =
+                    (v == iter.size() - 1) ? iter[0] : iter[v + 1];
 
                 e_weight = edge_cotan_weight(p_id, r_id, q_id, s_id, coords);
 
@@ -208,7 +182,7 @@ __launch_bounds__(blockThreads) __global__
         assert(!isnan(v_weight));
         assert(!isinf(v_weight));
 
-        T diag = ((1.0 / v_weight) + sum_e_weight);
+        T diag       = ((1.0 / v_weight) + sum_e_weight);
         out(p_id, 0) = x[0] + diag * in(p_id, 0);
         out(p_id, 1) = x[1] + diag * in(p_id, 1);
         out(p_id, 2) = x[2] + diag * in(p_id, 2);
@@ -216,6 +190,6 @@ __launch_bounds__(blockThreads) __global__
 
     // With uniform Laplacian, we just need the valence, thus we
     // call query_block_dispatcher and set oriented to false
-    query_block_dispatcher<Op::VV, blockThreads>(context, matvec_lambda,
-                                                 !use_uniform_laplace);
+    query_block_dispatcher<Op::VV, blockThreads>(
+        context, matvec_lambda, !use_uniform_laplace);
 }
\ No newline at end of file
diff --git a/apps/MCF/mcf_util.h b/apps/MCF/mcf_util.h
index 6481beff..90d23a26 100644
--- a/apps/MCF/mcf_util.h
+++ b/apps/MCF/mcf_util.h
@@ -10,7 +10,7 @@ __host__ __device__ __forceinline__ void clamp_cot(T& v)
     // clamp cotangent values as if angles are in[1, 179]
 
     const T bound = 19.1;  // 3 degrees
-    v = (v < -bound) ? -bound : ((v > bound) ? bound : v);
+    v             = (v < -bound) ? -bound : ((v > bound) ? bound : v);
 }
 
 /**
@@ -18,14 +18,14 @@ __host__ __device__ __forceinline__ void clamp_cot(T& v)
  */
 template <typename T>
 __host__ __device__ __forceinline__ T
-partial_voronoi_area(const RXMESH::Vector<3, T>& p,  // center
-                     const RXMESH::Vector<3, T>& q,  // before center
-                     const RXMESH::Vector<3, T>& r)  // after center
+partial_voronoi_area(const rxmesh::Vector<3, T>& p,  // center
+                     const rxmesh::Vector<3, T>& q,  // before center
+                     const rxmesh::Vector<3, T>& r)  // after center
 
 {
     // compute partial Voronoi area of the center vertex that is associated with
     // the triangle p->q->r (oriented ccw)
-    using namespace RXMESH;
+    using namespace rxmesh;
 
     // Edge vector p->q
     const Vector<3, T> pq = q - p;
@@ -78,14 +78,14 @@ partial_voronoi_area(const RXMESH::Vector<3, T>& p,  // center
  */
 template <typename T>
 __host__ __device__ __forceinline__ T
-edge_cotan_weight(const RXMESH::Vector<3, T>& p,
-                  const RXMESH::Vector<3, T>& r,
-                  const RXMESH::Vector<3, T>& q,
-                  const RXMESH::Vector<3, T>& s)
+edge_cotan_weight(const rxmesh::Vector<3, T>& p,
+                  const rxmesh::Vector<3, T>& r,
+                  const rxmesh::Vector<3, T>& q,
+                  const rxmesh::Vector<3, T>& s)
 {
     // Get the edge weight between the two vertices p-r where
     // q and s composes the diamond around p-r
-    using namespace RXMESH;
+    using namespace rxmesh;
 
     auto partial_weight = [&](const Vector<3, T>& v) -> T {
         const Vector<3, T> d0 = p - v;
diff --git a/apps/VertexNormal/CMakeLists.txt b/apps/VertexNormal/CMakeLists.txt
index 597386fd..34f7eeb2 100644
--- a/apps/VertexNormal/CMakeLists.txt
+++ b/apps/VertexNormal/CMakeLists.txt
@@ -14,6 +14,8 @@ target_sources(VertexNormal
 
 set_target_properties(VertexNormal PROPERTIES FOLDER "apps")
 
+set_property(TARGET VertexNormal PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+
 source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "VertexNormal" FILES ${SOURCE_LIST})
 
 target_link_libraries( VertexNormal 
diff --git a/apps/VertexNormal/benchmark.sh b/apps/VertexNormal/benchmark.sh
index afedd573..bb9d9264 100644
--- a/apps/VertexNormal/benchmark.sh
+++ b/apps/VertexNormal/benchmark.sh
@@ -1,5 +1,4 @@
 #!/bin/bash
-echo "This script re-generates RXMesh data in Figure 8(d) in the paper."
 echo "Please make sure to first compile the source code and then enter the input OBJ files directory."
 read -p "OBJ files directory (no trailing slash): " input_dir
 
@@ -16,7 +15,7 @@ device_id=0
 
 for file in $input_dir/*.obj; do 	 
     if [ -f "$file" ]; then
-		echo $exe -p -input "$file" -num_run $num_run -device_id $device_id
-             $exe -p -input "$file" -num_run $num_run -device_id $device_id
+		echo $exe -input "$file" -num_run $num_run -device_id $device_id
+         $exe -input "$file" -num_run $num_run -device_id $device_id
     fi 
 done
\ No newline at end of file
diff --git a/apps/VertexNormal/vertex_normal.cu b/apps/VertexNormal/vertex_normal.cu
index bc27e538..3fe2da5b 100644
--- a/apps/VertexNormal/vertex_normal.cu
+++ b/apps/VertexNormal/vertex_normal.cu
@@ -4,7 +4,7 @@
 
 #include <cuda_profiler_api.h>
 #include "gtest/gtest.h"
-#include "rxmesh/rxmesh_attribute.h"
+#include "rxmesh/attribute.h"
 #include "rxmesh/rxmesh_static.h"
 #include "rxmesh/util/import_obj.h"
 #include "rxmesh/util/report.h"
@@ -16,22 +16,20 @@ struct arg
 {
     std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj";
     std::string output_folder = STRINGIFY(OUTPUT_DIR);
-    uint32_t    num_run = 1;
-    uint32_t    device_id = 0;
+    uint32_t    num_run       = 1;
+    uint32_t    device_id     = 0;
     char**      argv;
     int         argc;
-    bool        shuffle = false;
-    bool        sort = false;
 } Arg;
 
 #include "vertex_normal_hardwired.cuh"
 
-template <typename T, uint32_t patchSize>
-void vertex_normal_rxmesh(RXMESH::RXMeshStatic<patchSize>&   rxmesh_static,
+template <typename T>
+void vertex_normal_rxmesh(rxmesh::RXMeshStatic&              rxmesh,
                           const std::vector<std::vector<T>>& Verts,
                           const std::vector<T>&              vertex_normal_gold)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     constexpr uint32_t blockThreads = 256;
 
     // Report
@@ -39,53 +37,41 @@ void vertex_normal_rxmesh(RXMESH::RXMeshStatic<patchSize>&   rxmesh_static,
     report.command_line(Arg.argc, Arg.argv);
     report.device();
     report.system();
-    report.model_data(Arg.obj_file_name, rxmesh_static);
+    report.model_data(Arg.obj_file_name, rxmesh);
     report.add_member("method", std::string("RXMesh"));
-    std::string order = "default";
-    if (Arg.shuffle) {
-        order = "shuffle";
-    } else if (Arg.sort) {
-        order = "sorted";
-    }
-    report.add_member("input_order", order);
     report.add_member("blockThreads", blockThreads);
 
-    RXMeshAttribute<T> coords;
-    coords.set_name("coord");
-    coords.init(Verts.size(), 3u, RXMESH::LOCATION_ALL);
-    // fill in the coordinates
-    for (uint32_t i = 0; i < Verts.size(); ++i) {
-        for (uint32_t j = 0; j < Verts[i].size(); ++j) {
-            coords(i, j) = Verts[i][j];
-        }
-    }
-    // move the coordinates to device
-    coords.move(RXMESH::HOST, RXMESH::DEVICE);
+    auto coords = rxmesh.add_vertex_attribute<T>(Verts, "coordinates");
 
 
     // normals
-    RXMeshAttribute<T> rxmesh_normal;
-    rxmesh_normal.set_name("normal");
-    rxmesh_normal.init(coords.get_num_mesh_elements(), 3u,
-                       RXMESH::LOCATION_ALL);
+    auto v_normals =
+        rxmesh.add_vertex_attribute<T>("v_normals", 3, rxmesh::LOCATION_ALL);
 
     // launch box
     LaunchBox<blockThreads> launch_box;
-    rxmesh_static.prepare_launch_box(RXMESH::Op::FV, launch_box);
+    rxmesh.prepare_launch_box(
+        rxmesh::Op::FV, launch_box, (void*)compute_vertex_normal<T, blockThreads>);
 
 
     TestData td;
-    td.test_name = "VertexNormal";
+    td.test_name   = "VertexNormal";
+    td.num_threads = launch_box.num_threads;
+    td.num_blocks  = launch_box.blocks;
+    td.dyn_smem    = launch_box.smem_bytes_dyn;
+    td.static_smem = launch_box.smem_bytes_static;
+    td.num_reg     = launch_box.num_registers_per_thread;
 
     float vn_time = 0;
     for (uint32_t itr = 0; itr < Arg.num_run; ++itr) {
-        rxmesh_normal.reset(0, RXMESH::DEVICE);
+        v_normals->reset(0, rxmesh::DEVICE);
         GPUTimer timer;
         timer.start();
 
-        compute_vertex_normal<T, blockThreads>
-            <<<launch_box.blocks, blockThreads, launch_box.smem_bytes_dyn>>>(
-                rxmesh_static.get_context(), coords, rxmesh_normal);
+        compute_vertex_normal<T, blockThreads><<<launch_box.blocks,
+                                                 launch_box.num_threads,
+                                                 launch_box.smem_bytes_dyn>>>(
+            rxmesh.get_context(), *coords, *v_normals);
 
         timer.stop();
         CUDA_ERROR(cudaDeviceSynchronize());
@@ -99,17 +85,17 @@ void vertex_normal_rxmesh(RXMESH::RXMeshStatic<patchSize>&   rxmesh_static,
                  vn_time / Arg.num_run);
 
     // Verify
-    rxmesh_normal.move(RXMESH::DEVICE, RXMESH::HOST);
+    v_normals->move(rxmesh::DEVICE, rxmesh::HOST);
 
-    bool passed = compare(vertex_normal_gold.data(),
-                          rxmesh_normal.get_pointer(RXMESH::HOST),
-                          coords.get_num_mesh_elements() * 3, false);
-    td.passed.push_back(passed);
-    EXPECT_TRUE(passed) << " RXMesh Validation failed \n";
+    rxmesh.for_each_vertex(HOST, [&](const VertexHandle& vh) {
+        uint32_t v_id = rxmesh.map_to_global(vh);
 
-    // Release allocation
-    rxmesh_normal.release();
-    coords.release();
+        for (uint32_t i = 0; i < 3; ++i) {
+            EXPECT_NEAR(std::abs(vertex_normal_gold[v_id * 3 + i]),
+                        std::abs((*v_normals)(vh, i)),
+                        0.0001);
+        }
+    });
 
     // Finalize report
     report.add_test(td);
@@ -119,17 +105,9 @@ void vertex_normal_rxmesh(RXMESH::RXMeshStatic<patchSize>&   rxmesh_static,
 
 TEST(Apps, VertexNormal)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     using dataT = float;
 
-    if (Arg.shuffle) {
-        ASSERT_FALSE(Arg.sort) << " cannot shuffle and sort at the same time!";
-    }
-    if (Arg.sort) {
-        ASSERT_FALSE(Arg.shuffle)
-            << " cannot shuffle and sort at the same time!";
-    }
-
     // Select device
     cuda_query(Arg.device_id);
 
@@ -139,28 +117,23 @@ TEST(Apps, VertexNormal)
 
     ASSERT_TRUE(import_obj(Arg.obj_file_name, Verts, Faces));
 
-    if (Arg.shuffle) {
-        shuffle_obj(Faces, Verts);
-    }
 
-    // Create RXMeshStatic instance. If Arg.sort is true, Faces and Verts will
-    // be sorted based on the patching happening inside RXMesh
-    RXMeshStatic<PATCH_SIZE> rxmesh_static(Faces, Verts, Arg.sort, false);
+    RXMeshStatic rxmesh(Faces, false);
 
-    //*** Serial reference
+    // Serial reference
     std::vector<dataT> vertex_normal_gold(3 * Verts.size());
     vertex_normal_ref(Faces, Verts, vertex_normal_gold);
 
-    //*** RXMesh Impl
-    vertex_normal_rxmesh(rxmesh_static, Verts, vertex_normal_gold);
+    // RXMesh Impl
+    vertex_normal_rxmesh(rxmesh, Verts, vertex_normal_gold);
 
-    //*** Hardwired Impl
+    // Hardwired Impl
     vertex_normal_hardwired(Faces, Verts, vertex_normal_gold);
 }
 
 int main(int argc, char** argv)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     Log::init();
 
     ::testing::InitGoogleTest(&argc, argv);
@@ -177,8 +150,6 @@ int main(int argc, char** argv)
                         "              Hint: Only accepts OBJ files\n"
                         " -o:          JSON file output folder. Default is {} \n"
                         " -num_run:    Number of iterations for performance testing. Default is {} \n"                        
-                        " -s:          Shuffle input. Default is false.\n"
-                        " -p:          Sort input using patching output. Default is false.\n"
                         " -device_id:  GPU device ID. Default is {}",
             Arg.obj_file_name, Arg.output_folder, Arg.num_run, Arg.device_id);
             // clang-format on
@@ -201,12 +172,6 @@ int main(int argc, char** argv)
             Arg.device_id =
                 atoi(get_cmd_option(argv, argv + argc, "-device_id"));
         }
-        if (cmd_option_exists(argv, argc + argv, "-s")) {
-            Arg.shuffle = true;
-        }
-        if (cmd_option_exists(argv, argc + argv, "-p")) {
-            Arg.sort = true;
-        }
     }
 
     RXMESH_TRACE("input= {}", Arg.obj_file_name);
diff --git a/apps/VertexNormal/vertex_normal_hardwired.cuh b/apps/VertexNormal/vertex_normal_hardwired.cuh
index c11bf837..7ab1a700 100644
--- a/apps/VertexNormal/vertex_normal_hardwired.cuh
+++ b/apps/VertexNormal/vertex_normal_hardwired.cuh
@@ -1,7 +1,6 @@
 #pragma once
 #include <vector>
 #include "rxmesh/util/log.h"
-#include "rxmesh/util/math.h"
 #include "rxmesh/util/report.h"
 
 template <typename T>
@@ -13,6 +12,28 @@ vertex_normal_hardwired_kernel(const uint32_t  num_faces,
                                T*              d_vertex_normal)
 {
     uint32_t f_id = threadIdx.x + blockIdx.x * blockDim.x;
+
+    auto l2_norm_sq = [](const T ax0,
+                         const T ax1,
+                         const T ax2,
+                         const T bx0,
+                         const T bx1,
+                         const T bx2) {
+        // compute (ax0-bx0)*(ax0-bx0) + (ax1-bx1)*(ax1-bx1) +
+        // (ax2-bx2)*(ax2-bx2)
+        T x0 = ax0 - bx0;
+        T x1 = ax1 - bx1;
+        T x2 = ax2 - bx2;
+        return x0 * x0 + x1 * x1 + x2 * x2;
+    };
+
+    auto cross_product =
+        [](T xv1, T yv1, T zv1, T xv2, T yv2, T zv2, T& xx, T& yy, T& zz) {
+            xx = yv1 * zv2 - zv1 * yv2;
+            yy = zv1 * xv2 - xv1 * zv2;
+            zz = xv1 * yv2 - yv1 * xv2;
+        };
+
     if (f_id < num_faces) {
         uint32_t v0 = d_faces[f_id * 3];
         uint32_t v1 = d_faces[f_id * 3 + 1];
@@ -29,11 +50,18 @@ vertex_normal_hardwired_kernel(const uint32_t  num_faces,
 
         T nx, ny, nz;
 
-        RXMESH::cross_product(v1x - v0x, v1y - v0y, v1z - v0z, v2x - v0x,
-                              v2y - v0y, v2z - v0z, nx, ny, nz);
-        T l0 = RXMESH::l2_norm_sq(v0x, v0y, v0z, v1x, v1y, v1z);  // v0-v1
-        T l1 = RXMESH::l2_norm_sq(v1x, v1y, v1z, v2x, v2y, v2z);  // v1-v2
-        T l2 = RXMESH::l2_norm_sq(v2x, v2y, v2z, v0x, v0y, v0z);  // v2-v0
+        cross_product(v1x - v0x,
+                      v1y - v0y,
+                      v1z - v0z,
+                      v2x - v0x,
+                      v2y - v0y,
+                      v2z - v0z,
+                      nx,
+                      ny,
+                      nz);
+        T l0 = l2_norm_sq(v0x, v0y, v0z, v1x, v1y, v1z);  // v0-v1
+        T l1 = l2_norm_sq(v1x, v1y, v1z, v2x, v2y, v2z);  // v1-v2
+        T l2 = l2_norm_sq(v2x, v2y, v2z, v0x, v0y, v0z);  // v2-v0
 
         atomicAdd(&d_vertex_normal[v0 * 3 + 0], nx / (l0 + l2));
         atomicAdd(&d_vertex_normal[v0 * 3 + 1], ny / (l0 + l2));
@@ -55,9 +83,9 @@ inline void vertex_normal_hardwired(
     const std::vector<std::vector<T>>&        Verts,
     const std::vector<T>&                     vertex_normal_gold)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     uint32_t num_vertices = Verts.size();
-    uint32_t num_faces = Faces.size();
+    uint32_t num_faces    = Faces.size();
 
     CustomReport report("VertexNormal_Hardwired");
     report.command_line(Arg.argc, Arg.argv);
@@ -65,13 +93,6 @@ inline void vertex_normal_hardwired(
     report.system();
     report.model_data(Arg.obj_file_name, num_vertices, num_faces);
     report.add_member("method", std::string("Hardwired"));
-    std::string order = "default";
-    if (Arg.shuffle) {
-        order = "shuffle";
-    } else if (Arg.sort) {
-        order = "sorted";
-    }
-    report.add_member("input_order", order);
 
     std::vector<uint32_t> h_face(num_faces * 3);
     std::vector<T>        h_verts(num_vertices * 3);
@@ -94,17 +115,20 @@ inline void vertex_normal_hardwired(
     CUDA_ERROR(cudaMalloc((void**)&d_face, 3 * num_faces * sizeof(uint32_t)));
     CUDA_ERROR(cudaMalloc((void**)&d_verts, 3 * num_vertices * sizeof(T)));
     CUDA_ERROR(cudaMalloc((void**)&d_normals, 3 * num_vertices * sizeof(T)));
-    CUDA_ERROR(cudaMemcpy(d_face, h_face.data(),
+    CUDA_ERROR(cudaMemcpy(d_face,
+                          h_face.data(),
                           h_face.size() * sizeof(uint32_t),
                           cudaMemcpyHostToDevice));
-    CUDA_ERROR(cudaMemcpy(d_verts, h_verts.data(), h_verts.size() * sizeof(T),
+    CUDA_ERROR(cudaMemcpy(d_verts,
+                          h_verts.data(),
+                          h_verts.size() * sizeof(T),
                           cudaMemcpyHostToDevice));
 
     const uint32_t threads = 256;
-    const uint32_t blocks = DIVIDE_UP(num_faces, threads);
+    const uint32_t blocks  = DIVIDE_UP(num_faces, threads);
 
     TestData td;
-    td.test_name = "VertexNormal";
+    td.test_name  = "VertexNormal";
     float vn_time = 0;
     for (uint32_t itr = 0; itr < Arg.num_run; ++itr) {
         CUDA_ERROR(cudaMemset(d_normals, 0, 3 * num_vertices * sizeof(T)));
@@ -126,7 +150,8 @@ inline void vertex_normal_hardwired(
 
     T* verts_normal_hardwired;
     verts_normal_hardwired = (T*)malloc(num_vertices * 3 * sizeof(T));
-    CUDA_ERROR(cudaMemcpy(verts_normal_hardwired, d_normals,
+    CUDA_ERROR(cudaMemcpy(verts_normal_hardwired,
+                          d_normals,
                           3 * num_vertices * sizeof(T),
                           cudaMemcpyDeviceToHost));
 
@@ -138,8 +163,10 @@ inline void vertex_normal_hardwired(
     RXMESH_TRACE("vertex_normal_hardwired() vertex normal kernel took {} (ms)",
                  vn_time);
 
-    bool passed = compare(vertex_normal_gold.data(), verts_normal_hardwired,
-                          Verts.size() * 3, false);
+    bool passed = compare(vertex_normal_gold.data(),
+                          verts_normal_hardwired,
+                          Verts.size() * 3,
+                          false);
     td.passed.push_back(passed);
     EXPECT_TRUE(passed) << " Hardwired Validation failed \n";
 
@@ -148,6 +175,6 @@ inline void vertex_normal_hardwired(
     report.add_test(td);
 
     report.write(
-        Arg.output_folder + "/hardwired/" + order,
+        Arg.output_folder + "/hardwired",
         "VertexNormal_Hardwired_" + extract_file_name(Arg.obj_file_name));
 }
\ No newline at end of file
diff --git a/apps/VertexNormal/vertex_normal_kernel.cuh b/apps/VertexNormal/vertex_normal_kernel.cuh
index fe775d26..67ccb727 100644
--- a/apps/VertexNormal/vertex_normal_kernel.cuh
+++ b/apps/VertexNormal/vertex_normal_kernel.cuh
@@ -1,21 +1,19 @@
 #pragma once
 
-#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh"
-#include "rxmesh/rxmesh_attribute.h"
-#include "rxmesh/rxmesh_context.h"
-#include "rxmesh/util/math.h"
+#include "rxmesh/attribute.h"
+#include "rxmesh/context.h"
+#include "rxmesh/kernels/query_dispatcher.cuh"
 #include "rxmesh/util/vector.h"
 /**
  * vertex_normal()
  */
 template <typename T, uint32_t blockThreads>
-__launch_bounds__(blockThreads, 6) __global__
-    static void compute_vertex_normal(const RXMESH::RXMeshContext context,
-                                      RXMESH::RXMeshAttribute<T>  coords,
-                                      RXMESH::RXMeshAttribute<T>  normals)
+__global__ static void compute_vertex_normal(const rxmesh::Context      context,
+                                             rxmesh::VertexAttribute<T> coords,
+                                             rxmesh::VertexAttribute<T> normals)
 {
-    using namespace RXMESH;
-    auto vn_lambda = [&](uint32_t face_id, RXMeshIterator& fv) {
+    using namespace rxmesh;
+    auto vn_lambda = [&](FaceHandle face_id, VertexIterator& fv) {
         // get the face's three vertices coordinates
         Vector<3, T> c0(coords(fv[0], 0), coords(fv[0], 1), coords(fv[0], 2));
         Vector<3, T> c1(coords(fv[1], 0), coords(fv[1], 1), coords(fv[1], 2));
diff --git a/apps/VertexNormal/vertex_normal_ref.h b/apps/VertexNormal/vertex_normal_ref.h
index 486b7b2e..4ba55f8d 100644
--- a/apps/VertexNormal/vertex_normal_ref.h
+++ b/apps/VertexNormal/vertex_normal_ref.h
@@ -1,6 +1,5 @@
 #pragma once
 #include <vector>
-#include "rxmesh/util/math.h"
 #include "rxmesh/util/report.h"
 
 template <typename T>
@@ -9,7 +8,7 @@ inline void vertex_normal_ref(const std::vector<std::vector<uint32_t>>& Faces,
                               std::vector<T>& vertex_normal)
 {
     uint32_t num_vertices = Verts.size();
-    uint32_t num_faces = Faces.size();
+    uint32_t num_faces    = Faces.size();
 
     memset((void*)vertex_normal.data(), 0, vertex_normal.size() * sizeof(T));
 
@@ -17,46 +16,71 @@ inline void vertex_normal_ref(const std::vector<std::vector<uint32_t>>& Faces,
     uint32_t v[3];
     T        fn[3];
 
+    auto l2_norm_sq = [](const T ax0,
+                         const T ax1,
+                         const T ax2,
+                         const T bx0,
+                         const T bx1,
+                         const T bx2) {
+        // compute (ax0-bx0)*(ax0-bx0) + (ax1-bx1)*(ax1-bx1) +
+        // (ax2-bx2)*(ax2-bx2)
+        T x0 = ax0 - bx0;
+        T x1 = ax1 - bx1;
+        T x2 = ax2 - bx2;
+        return x0 * x0 + x1 * x1 + x2 * x2;
+    };
+
+    auto cross_product =
+        [](T xv1, T yv1, T zv1, T xv2, T yv2, T zv2, T& xx, T& yy, T& zz) {
+            xx = yv1 * zv2 - zv1 * yv2;
+            yy = zv1 * xv2 - xv1 * zv2;
+            zz = xv1 * yv2 - yv1 * xv2;
+        };
+
     for (uint32_t f = 0; f < num_faces; ++f) {
         v[0] = Faces[f][0];
         v[1] = Faces[f][1];
         v[2] = Faces[f][2];
 
-        RXMESH::cross_product(
-            Verts[v[1]][0] - Verts[v[0]][0], Verts[v[1]][1] - Verts[v[0]][1],
-            Verts[v[1]][2] - Verts[v[0]][2], Verts[v[2]][0] - Verts[v[0]][0],
-            Verts[v[2]][1] - Verts[v[0]][1], Verts[v[2]][2] - Verts[v[0]][2],
-            fn[0], fn[1], fn[2]);
+        cross_product(Verts[v[1]][0] - Verts[v[0]][0],
+                      Verts[v[1]][1] - Verts[v[0]][1],
+                      Verts[v[1]][2] - Verts[v[0]][2],
+                      Verts[v[2]][0] - Verts[v[0]][0],
+                      Verts[v[2]][1] - Verts[v[0]][1],
+                      Verts[v[2]][2] - Verts[v[0]][2],
+                      fn[0],
+                      fn[1],
+                      fn[2]);
 
-        edge_len[0] =
-            RXMESH::l2_norm_sq(Verts[v[0]][0], Verts[v[0]][1], Verts[v[0]][2],
-                               Verts[v[1]][0], Verts[v[1]][1],
-                               Verts[v[1]][2]);  // v0-v1
+        edge_len[0] = l2_norm_sq(Verts[v[0]][0],
+                                 Verts[v[0]][1],
+                                 Verts[v[0]][2],
+                                 Verts[v[1]][0],
+                                 Verts[v[1]][1],
+                                 Verts[v[1]][2]);  // v0-v1
 
-        edge_len[1] =
-            RXMESH::l2_norm_sq(Verts[v[1]][0], Verts[v[1]][1], Verts[v[1]][2],
-                               Verts[v[2]][0], Verts[v[2]][1],
-                               Verts[v[2]][2]);  // v1-v2
+        edge_len[1] = l2_norm_sq(Verts[v[1]][0],
+                                 Verts[v[1]][1],
+                                 Verts[v[1]][2],
+                                 Verts[v[2]][0],
+                                 Verts[v[2]][1],
+                                 Verts[v[2]][2]);  // v1-v2
 
-        edge_len[2] =
-            RXMESH::l2_norm_sq(Verts[v[2]][0], Verts[v[2]][1], Verts[v[2]][2],
-                               Verts[v[0]][0], Verts[v[0]][1],
-                               Verts[v[0]][2]);  // v2-v0
+        edge_len[2] = l2_norm_sq(Verts[v[2]][0],
+                                 Verts[v[2]][1],
+                                 Verts[v[2]][2],
+                                 Verts[v[0]][0],
+                                 Verts[v[0]][1],
+                                 Verts[v[0]][2]);  // v2-v0
 
 
         for (uint32_t i = 0; i < 3; ++i) {
-            uint32_t k = (i + 2) % 3;
+            uint32_t k    = (i + 2) % 3;
             uint32_t base = 3 * v[i];
 
             for (uint32_t l = 0; l < 3; ++l) {
                 vertex_normal[base + l] += fn[l] / (edge_len[i] + edge_len[k]);
             }
         }
-    }
-
-    /*for (T v = 0; v < num_vertices; ++v) {
-        T base = 3 * v;
-        normalize_vector(vertex_normal[base], vertex_normal[base + 1],
-            vertex_normal[base + 2]);
-    }*/
+    }
 }
\ No newline at end of file
diff --git a/apps/common/openmesh_report.h b/apps/common/openmesh_report.h
index 446d41f5..60de6b92 100644
--- a/apps/common/openmesh_report.h
+++ b/apps/common/openmesh_report.h
@@ -1,12 +1,12 @@
 #include "rxmesh/util/report.h"
 
-class OpenMeshReport : public RXMESH::Report
+class OpenMeshReport : public rxmesh::Report
 {
    public:
-    OpenMeshReport() : RXMESH::Report()
+    OpenMeshReport() : rxmesh::Report()
     {
     }
-    OpenMeshReport(const std::string& record_name) : RXMESH::Report(record_name)
+    OpenMeshReport(const std::string& record_name) : rxmesh::Report(record_name)
     {
     }
 
@@ -16,8 +16,8 @@ class OpenMeshReport : public RXMESH::Report
         subdoc.SetObject();
 
         add_member("model_name", model_name, subdoc);
-        add_member("num_vertices", static_cast<uint32_t>(mesh.n_vertices()),
-                   subdoc);
+        add_member(
+            "num_vertices", static_cast<uint32_t>(mesh.n_vertices()), subdoc);
         add_member("num_edges", static_cast<uint32_t>(mesh.n_edges()), subdoc);
         add_member("num_faces", static_cast<uint32_t>(mesh.n_faces()), subdoc);
 
diff --git a/apps/common/openmesh_trimesh.h b/apps/common/openmesh_trimesh.h
index dc9fb700..049b7ddd 100644
--- a/apps/common/openmesh_trimesh.h
+++ b/apps/common/openmesh_trimesh.h
@@ -5,7 +5,7 @@
 #include <OpenMesh/Core/Mesh/TriMesh_ArrayKernelT.hh>
 
 struct MyTraits : public OpenMesh::DefaultTraits
-    //DefaultTraitsDouble
+// DefaultTraitsDouble
 {
     VertexAttributes(OpenMesh::Attributes::Normal);
 
diff --git a/cmake/AutoDetectCudaArch.cmake b/cmake/AutoDetectCudaArch.cmake
index 4165d93a..a9dbf914 100644
--- a/cmake/AutoDetectCudaArch.cmake
+++ b/cmake/AutoDetectCudaArch.cmake
@@ -48,20 +48,24 @@ int main() {
 		if(CUDA_RETURN_CODE EQUAL 0)			
 			set(CUDA_ARCHS ${fprintf_output} CACHE STRING "CUDA Arch")			
 		else()
-			message(STATUS "GPU architectures auto-detect failed. Will build for all possible architectures.")      
-			set(CUDA_ARCHS "--generate-code=arch=compute_35,code=sm_35;"
-						   "--generate-code=arch=compute_37,code=sm_37;"
-			               "--generate-code=arch=compute_50,code=sm_50;"
-			               "--generate-code=arch=compute_52,code=sm_52;"
-			               "--generate-code=arch=compute_60,code=sm_60;"
-			               "--generate-code=arch=compute_61,code=sm_61;"
-			               "--generate-code=arch=compute_70,code=sm_70;"
-			               "--generate-code=arch=compute_72,code=sm_72;"
-			               "--generate-code=arch=compute_75,code=sm_75;"
+			message(STATUS "GPU architectures auto-detect failed. Will build for sm_70.")      
+			set(CUDA_ARCHS #"--generate-code=arch=compute_35,code=sm_35;"
+						   #"--generate-code=arch=compute_37,code=sm_37;"
+			               #"--generate-code=arch=compute_50,code=sm_50;"
+			               #"--generate-code=arch=compute_52,code=sm_52;"
+			               #"--generate-code=arch=compute_60,code=sm_60;"
+			               #"--generate-code=arch=compute_61,code=sm_61;"
+			               --generate-code=arch=compute_70,code=sm_70;
+			               #"--generate-code=arch=compute_72,code=sm_72;"
+			               #"--generate-code=arch=compute_75,code=sm_75;"
 						   CACHE STRING "CUDA Arch")			
 		endif()  
 	endif()	
 	message(STATUS "CUDA_ARCHS= " ${CUDA_ARCHS})	
+	if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+		#https://gitlab.kitware.com/cmake/cmake/-/issues/18265	
+		list(APPEND CMAKE_CUDA_FLAGS "${CUDA_ARCHS}")	
+	endif ()	
 endif()
 ###################################################################################
 
diff --git a/include/rxmesh/attribute.h b/include/rxmesh/attribute.h
new file mode 100644
index 00000000..22d413a2
--- /dev/null
+++ b/include/rxmesh/attribute.h
@@ -0,0 +1,884 @@
+#pragma once
+
+#include <assert.h>
+#include <utility>
+
+#include "rxmesh/handle.h"
+#include "rxmesh/kernels/attribute.cuh"
+#include "rxmesh/kernels/collective.cuh"
+#include "rxmesh/kernels/util.cuh"
+#include "rxmesh/patch_info.h"
+#include "rxmesh/types.h"
+#include "rxmesh/util/cuda_query.h"
+#include "rxmesh/util/log.h"
+#include "rxmesh/util/util.h"
+#include "rxmesh/util/vector.h"
+
+class RXMeshTest;
+
+
+namespace rxmesh {
+
+
+/**
+ * @brief Base untyped attribute used as an interface for the attribute container
+ */
+class AttributeBase
+{
+    // our friend tester class
+    friend class ::RXMeshTest;
+
+   public:
+    AttributeBase() = default;
+
+    virtual const char* get_name() const = 0;
+
+    virtual void release(locationT location = LOCATION_ALL) = 0;
+
+    virtual ~AttributeBase() = default;
+};
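+
+// A minimal usage sketch of the attribute API below (illustrative only; the
+// apps above go through the RXMeshStatic wrappers, e.g. add_vertex_attribute,
+// instead of constructing Attribute directly):
+//
+//   auto attr = rxmesh.add_vertex_attribute<float>("attr", 3, rxmesh::DEVICE);
+//   attr->reset(0.f, rxmesh::DEVICE);
+//   // ... kernels read/write (*attr)(vertex_handle, i) ...
+//   attr->move(rxmesh::DEVICE, rxmesh::HOST);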
+
+/**
+ * @brief  Here we manage the attributes on top of the mesh. An attribute is
+ * attached to a mesh element (e.g., vertices, edges, or faces).
+ * Largely inspired by
+ * https://github.com/gunrock/gunrock/blob/master/gunrock/util/array_utils.cuh
+ * It is discouraged to use Attribute directly in favor of using
+ * add_X_attributes() from RXMeshStatic where X is vertex, edge, or face. This
+ * way, the user does not have to specify the number of mesh elements or
+ * deallocate/release the Attribute (attribute garbage collection is managed by
+ * RXMeshStatic)
+ * @tparam T type of the attribute
+ */
+template <class T>
+class Attribute : public AttributeBase
+{
+    template <typename S>
+    friend class ReduceHandle;
+
+   public:
+    /**
+     * @brief Default constructor which initializes all pointers to nullptr
+     */
+    Attribute()
+        : AttributeBase(),
+          m_name(nullptr),
+          m_num_attributes(0),
+          m_allocated(LOCATION_NONE),
+          m_h_attr(nullptr),
+          m_h_ptr_on_device(nullptr),
+          m_d_attr(nullptr),
+          m_num_patches(0),
+          m_d_element_per_patch(nullptr),
+          m_h_element_per_patch(nullptr),
+          m_layout(AoS)
+    {
+
+        this->m_name    = (char*)malloc(sizeof(char) * 1);
+        this->m_name[0] = '\0';
+    }
+
+    /**
+     * @brief Main constructor
+     * @param name attribute name
+     */
+    Attribute(const char* name)
+        : AttributeBase(),
+          m_name(nullptr),
+          m_num_attributes(0),
+          m_allocated(LOCATION_NONE),
+          m_h_attr(nullptr),
+          m_h_ptr_on_device(nullptr),
+          m_d_attr(nullptr),
+          m_num_patches(0),
+          m_d_element_per_patch(nullptr),
+          m_h_element_per_patch(nullptr),
+          m_layout(AoS)
+    {
+        if (name != nullptr) {
+            this->m_name = (char*)malloc(sizeof(char) * (strlen(name) + 1));
+            strcpy(this->m_name, name);
+        }
+    }
+
+    Attribute(const Attribute& rhs) = default;
+
+    virtual ~Attribute() = default;
+
+    /**
+     * @brief Get the name of the attribute
+     */
+    const char* get_name() const
+    {
+        return m_name;
+    }
+
+    /**
+     * @brief get the number of attributes per mesh element
+     */
+    __host__ __device__ __forceinline__ uint32_t get_num_attributes() const
+    {
+        return this->m_num_attributes;
+    }
+
+    /**
+     * @brief Flag that indicates where the memory is allocated
+     */
+    __host__ __device__ __forceinline__ locationT get_allocated() const
+    {
+        return this->m_allocated;
+    }
+
+    /**
+     * @brief Check if attribute is allocated on device
+     */
+    __host__ __device__ __forceinline__ bool is_device_allocated() const
+    {
+        return ((m_allocated & DEVICE) == DEVICE);
+    }
+
+    /**
+     * @brief Check if attribute is allocated on host
+     */
+    __host__ __device__ __forceinline__ bool is_host_allocated() const
+    {
+        return ((m_allocated & HOST) == HOST);
+    }
+
+    /**
+     * @brief Reset the attribute to a certain value
+     * @param value the value to be set
+     * @param location the location(s) (device, host, or both) where the
+     * attribute will be set
+     * @param stream in case of DEVICE, this is the stream that will be used to
+     * launch the reset kernel
+     */
+    void reset(const T value, locationT location, cudaStream_t stream = NULL)
+    {
+        if ((location & DEVICE) == DEVICE) {
+
+            assert((m_allocated & DEVICE) == DEVICE);
+
+            const int threads = 256;
+            detail::template memset_attribute<T>
+                <<<m_num_patches, threads, 0, stream>>>(*this,
+                                                        value,
+                                                        m_d_element_per_patch,
+                                                        m_num_patches,
+                                                        m_num_attributes);
+        }
+
+
+        if ((location & HOST) == HOST) {
+            assert((m_allocated & HOST) == HOST);
+#pragma omp parallel for
+            for (int p = 0; p < static_cast<int>(m_num_patches); ++p) {
+                for (int e = 0; e < m_h_element_per_patch[p]; ++e) {
+                    m_h_attr[p][e] = value;
+                }
+            }
+        }
+    }
+
+    /**
+     * @brief Allocate memory for attribute. This is meant to be used by
+     * RXMeshStatic
+     * @param element_per_patch indicates the number of mesh elements owned by
+     * each patch
+     * @param num_attributes number of attributes per mesh element
+     * @param location where the memory should reside (host, device, or both)
+     * @param layout memory layout in case num_attributes>1
+     */
+    void init(const std::vector<uint16_t>& element_per_patch,
+              const uint32_t               num_attributes,
+              locationT                    location = LOCATION_ALL,
+              const layoutT                layout   = AoS)
+    {
+        release();
+        m_num_patches    = element_per_patch.size();
+        m_num_attributes = num_attributes;
+        m_layout         = layout;
+
+        if (m_num_patches == 0) {
+            return;
+        }
+
+        allocate(element_per_patch.data(), location);
+    }
+
+    /**
+     * @brief Copy memory from one location to another. If target is not
+     * allocated, it will be allocated first before copying the memory.
+     * @param source the source location
+     * @param target the destination location
+     * @param stream to be used to launch the kernel
+     * TODO it would be better to launch a kernel that does the memcpy rather
+     * than relying on the CUDA host API since all these small memcpys are
+     * enqueued in the same stream and thus serialized
+     */
+    void move(locationT source, locationT target, cudaStream_t stream = NULL)
+    {
+        if (source == target) {
+            RXMESH_WARN(
+                "Attribute::move() source ({}) and target ({}) "
+                "are the same.",
+                location_to_string(source),
+                location_to_string(target));
+            return;
+        }
+
+        if ((source == HOST || source == DEVICE) &&
+            ((source & m_allocated) != source)) {
+            RXMESH_ERROR(
+                "Attribute::move() moving source is not valid"
+                " because it was not allocated on source i.e., {}",
+                location_to_string(source));
+        }
+
+        if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) &&
+            ((target & m_allocated) != target)) {
+            RXMESH_WARN(
+                "Attribute::move() allocating target before moving to {}",
+                location_to_string(target));
+            allocate(m_h_element_per_patch, target);
+        }
+
+        if (this->m_num_patches == 0) {
+            return;
+        }
+
+        if (source == HOST && target == DEVICE) {
+            for (uint32_t p = 0; p < m_num_patches; ++p) {
+                CUDA_ERROR(cudaMemcpyAsync(
+                    m_h_ptr_on_device[p],
+                    m_h_attr[p],
+                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes,
+                    cudaMemcpyHostToDevice,
+                    stream));
+            }
+        } else if (source == DEVICE && target == HOST) {
+            for (uint32_t p = 0; p < m_num_patches; ++p) {
+                CUDA_ERROR(cudaMemcpyAsync(
+                    m_h_attr[p],
+                    m_h_ptr_on_device[p],
+                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes,
+                    cudaMemcpyDeviceToHost,
+                    stream));
+            }
+        }
+    }
+
+    /**
+     * @brief Release allocated memory in a certain location
+     * @param location where memory will be released
+     */
+    void release(locationT location = LOCATION_ALL)
+    {
+        if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) {
+            for (uint32_t p = 0; p < m_num_patches; ++p) {
+                free(m_h_attr[p]);
+            }
+            free(m_h_attr);
+            m_h_attr = nullptr;
+            free(m_h_element_per_patch);
+            m_h_element_per_patch = nullptr;
+            m_allocated           = m_allocated & (~HOST);
+        }
+
+        if (((location & DEVICE) == DEVICE) &&
+            ((m_allocated & DEVICE) == DEVICE)) {
+            for (uint32_t p = 0; p < m_num_patches; ++p) {
+                GPU_FREE(m_h_ptr_on_device[p]);
+            }
+            GPU_FREE(m_d_attr);
+            GPU_FREE(m_d_element_per_patch);
+            m_allocated = m_allocated & (~DEVICE);
+        }
+    }
+
+    /**
+     * @brief Deep copy from a source attribute. If source_flag and dst_flag are
+     * both set to LOCATION_ALL, then we copy what is on host to host, and what
+     * is on device to device. If source_flag is set to HOST (or DEVICE) and
+     * dst_flag is set to LOCATION_ALL, then we copy source's HOST (or
+     * DEVICE) to both HOST and DEVICE. Setting source_flag to
+     * LOCATION_ALL while dst_flag is NOT set to LOCATION_ALL is invalid
+     * because we don't know which source to copy from
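+     * For example (illustrative): copy_from(other, HOST, LOCATION_ALL) copies
+     * other's host data to both this attribute's host and device buffers,
+     * while copy_from(other, LOCATION_ALL, HOST) is rejected.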
+     * @param source attribute to copy from
+     * @param source_flag defines where we will copy from
+     * @param dst_flag defines where we will copy to
+     * @param stream used to launch kernel/memcpy
+     */
+    void copy_from(Attribute<T>& source,
+                   locationT     source_flag,
+                   locationT     dst_flag,
+                   cudaStream_t  stream = NULL)
+    {
+
+
+        if (source.m_layout != m_layout) {
+            RXMESH_ERROR(
+                "Attribute::copy_from() does not support copy from "
+                "source of different layout!");
+        }
+
+        if ((source_flag & LOCATION_ALL) == LOCATION_ALL &&
+            (dst_flag & LOCATION_ALL) != LOCATION_ALL) {
+            RXMESH_ERROR("Attribute::copy_from() Invalid configuration!");
+        }
+
+        if (m_num_attributes != source.get_num_attributes()) {
+            RXMESH_ERROR(
+                "Attribute::copy_from() number of attributes is "
+                "different!");
+        }
+
+        if (this->is_empty() || this->m_num_patches == 0) {
+            return;
+        }
+
+        // 1) copy from HOST to HOST
+        if ((source_flag & HOST) == HOST && (dst_flag & HOST) == HOST) {
+            if ((source_flag & source.m_allocated) != source_flag) {
+                RXMESH_ERROR(
+                    "Attribute::copy() copying source is not valid"
+                    " because it was not allocated on host");
+            }
+            if ((dst_flag & m_allocated) != dst_flag) {
+                RXMESH_ERROR(
+                    "Attribute::copy() copying source is not valid"
+                    " because location (this) was not allocated on host");
+            }
+
+            for (uint32_t p = 0; p < m_num_patches; ++p) {
+                assert(m_h_element_per_patch[p] ==
+                       source.m_h_element_per_patch[p]);
+                std::memcpy(
+                    m_h_attr[p],
+                    source.m_h_attr[p],
+                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes);
+            }
+        }
+
+
+        // 2) copy from DEVICE to DEVICE
+        if ((source_flag & DEVICE) == DEVICE && (dst_flag & DEVICE) == DEVICE) {
+            if ((source_flag & source.m_allocated) != source_flag) {
+                RXMESH_ERROR(
+                    "Attribute::copy() copying source is not valid"
+                    " because it was not allocated on device");
+            }
+            if ((dst_flag & m_allocated) != dst_flag) {
+                RXMESH_ERROR(
+                    "Attribute::copy() copying source is not valid"
+                    " because location (this) was not allocated on device");
+            }
+
+            for (uint32_t p = 0; p < m_num_patches; ++p) {
+                assert(m_h_element_per_patch[p] ==
+                       source.m_h_element_per_patch[p]);
+                CUDA_ERROR(cudaMemcpyAsync(
+                    m_h_ptr_on_device[p],
+                    source.m_h_ptr_on_device[p],
+                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes,
+                    cudaMemcpyDeviceToDevice,
+                    stream));
+            }
+        }
+
+
+        // 3) copy from DEVICE to HOST
+        if ((source_flag & DEVICE) == DEVICE && (dst_flag & HOST) == HOST) {
+            if ((source_flag & source.m_allocated) != source_flag) {
+                RXMESH_ERROR(
+                    "Attribute::copy() copying source is not valid"
+                    " because it was not allocated on host");
+            }
+            if ((dst_flag & m_allocated) != dst_flag) {
+                RXMESH_ERROR(
+                    "Attribute::copy() copying source is not valid"
+                    " because location (this) was not allocated on device");
+            }
+
+
+            for (uint32_t p = 0; p < m_num_patches; ++p) {
+                assert(m_h_element_per_patch[p] ==
+                       source.m_h_element_per_patch[p]);
+                CUDA_ERROR(cudaMemcpyAsync(
+                    m_h_attr[p],
+                    source.m_h_ptr_on_device[p],
+                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes,
+                    cudaMemcpyDeviceToHost,
+                    stream));
+            }
+        }
+
+
+        // 4) copy from HOST to DEVICE
+        if ((source_flag & HOST) == HOST && (dst_flag & DEVICE) == DEVICE) {
+            if ((source_flag & source.m_allocated) != source_flag) {
+                RXMESH_ERROR(
+                    "Attribute::copy() copying source is not valid"
+                    " because it was not allocated on device");
+            }
+            if ((dst_flag & m_allocated) != dst_flag) {
+                RXMESH_ERROR(
+                    "Attribute::copy() copying source is not valid"
+                    " because location (this) was not allocated on host");
+            }
+
+
+            for (uint32_t p = 0; p < m_num_patches; ++p) {
+                assert(m_h_element_per_patch[p] ==
+                       source.m_h_element_per_patch[p]);
+                CUDA_ERROR(cudaMemcpyAsync(
+                    m_h_ptr_on_device[p],
+                    source.m_h_attr[p],
+                    sizeof(T) * m_h_element_per_patch[p] * m_num_attributes,
+                    cudaMemcpyHostToDevice,
+                    stream));
+            }
+        }
+    }
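+
+    // Usage sketch: given two attributes with the same layout and per-patch
+    // sizes, copy what `other` holds on the device into this attribute's
+    // device memory (asynchronously on `stream`):
+    //   attr.copy_from(other, DEVICE, DEVICE, stream);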
+
+    /**
+     * @brief Access the attribute value using patch and local index in the
+     * patch. This is meant to be used by XXAttribute not directly by the user
+     * @param patch_id patch to be accessed
+     * @param local_id the local id in the patch
+     * @param attr the attribute id
+     * @return const reference to the attribute
+     */
+    __host__ __device__ __forceinline__ T& operator()(const uint32_t patch_id,
+                                                      const uint16_t local_id,
+                                                      const uint32_t attr) const
+    {
+        assert(patch_id < m_num_patches);
+        assert(attr < m_num_attributes);
+
+        const uint32_t pitch_x = (m_layout == AoS) ? m_num_attributes : 1;
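+        // Indexing note: with AoS, an element's attributes are contiguous, so
+        // the index is local_id * num_attributes + attr; with the other
+        // (struct-of-arrays) layout, each attribute spans the patch
+        // contiguously, so the index is attr * element_per_patch + local_id.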
+#ifdef __CUDA_ARCH__
+        const uint32_t pitch_y =
+            (m_layout == AoS) ? 1 : m_d_element_per_patch[patch_id];
+        return m_d_attr[patch_id][local_id * pitch_x + attr * pitch_y];
+#else
+        const uint32_t pitch_y =
+            (m_layout == AoS) ? 1 : m_h_element_per_patch[patch_id];
+        return m_h_attr[patch_id][local_id * pitch_x + attr * pitch_y];
+#endif
+    }
+
+    /**
+     * @brief Access the attribute value using patch and local index in the
+     * patch. This is meant to be used by XXAttribute not directly by the user
+     * @param patch_id patch to be accessed
+     * @param local_id the local id in the patch
+     * @param attr the attribute id
+     * @return non-const reference to the attribute
+     */
+    __host__ __device__ __forceinline__ T& operator()(const uint32_t patch_id,
+                                                      const uint16_t local_id,
+                                                      const uint32_t attr)
+    {
+        assert(patch_id < m_num_patches);
+        assert(attr < m_num_attributes);
+
+        const uint32_t pitch_x = (m_layout == AoS) ? m_num_attributes : 1;
+#ifdef __CUDA_ARCH__
+        const uint32_t pitch_y =
+            (m_layout == AoS) ? 1 : m_d_element_per_patch[patch_id];
+        return m_d_attr[patch_id][local_id * pitch_x + attr * pitch_y];
+#else
+        const uint32_t pitch_y =
+            (m_layout == AoS) ? 1 : m_h_element_per_patch[patch_id];
+        return m_h_attr[patch_id][local_id * pitch_x + attr * pitch_y];
+#endif
+    }
+
+    /**
+     * @brief Check if the attribute is empty
+     */
+    __host__ __device__ __forceinline__ bool is_empty() const
+    {
+        return m_num_patches == 0;
+    }
+
+
+   private:
+    /**
+     * @brief allocate internal memory
+     */
+    void allocate(const uint16_t* element_per_patch, locationT location)
+    {
+
+        if (m_num_patches != 0) {
+
+            if ((location & HOST) == HOST) {
+                release(HOST);
+                m_h_element_per_patch = static_cast<uint16_t*>(
+                    malloc(sizeof(uint16_t) * m_num_patches));
+
+                m_h_attr = static_cast<T**>(malloc(sizeof(T*) * m_num_patches));
+
+                std::memcpy(m_h_element_per_patch,
+                            element_per_patch,
+                            sizeof(uint16_t) * m_num_patches);
+
+                for (uint32_t p = 0; p < m_num_patches; ++p) {
+                    m_h_attr[p] = static_cast<T*>(malloc(
+                        sizeof(T) * element_per_patch[p] * m_num_attributes));
+                }
+
+                m_allocated = m_allocated | HOST;
+            }
+
+            if ((location & DEVICE) == DEVICE) {
+                release(DEVICE);
+
+                m_h_element_per_patch = static_cast<uint16_t*>(
+                    malloc(sizeof(uint16_t) * m_num_patches));
+
+                std::memcpy(m_h_element_per_patch,
+                            element_per_patch,
+                            sizeof(uint16_t) * m_num_patches);
+
+                CUDA_ERROR(cudaMalloc((void**)&(m_d_element_per_patch),
+                                      sizeof(uint16_t) * m_num_patches));
+
+
+                CUDA_ERROR(cudaMalloc((void**)&(m_d_attr),
+                                      sizeof(T*) * m_num_patches));
+                m_h_ptr_on_device =
+                    static_cast<T**>(malloc(sizeof(T*) * m_num_patches));
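+                // m_h_ptr_on_device keeps the per-patch device pointers on the
+                // host (so they can be copied/freed individually), while
+                // m_d_attr mirrors the same pointer array on the device for
+                // use inside kernels.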
+
+                CUDA_ERROR(cudaMemcpy(m_d_element_per_patch,
+                                      element_per_patch,
+                                      sizeof(uint16_t) * m_num_patches,
+                                      cudaMemcpyHostToDevice));
+
+                for (uint32_t p = 0; p < m_num_patches; ++p) {
+                    CUDA_ERROR(cudaMalloc((void**)&(m_h_ptr_on_device[p]),
+                                          sizeof(T) * m_h_element_per_patch[p] *
+                                              m_num_attributes));
+                }
+                CUDA_ERROR(cudaMemcpy(m_d_attr,
+                                      m_h_ptr_on_device,
+                                      sizeof(T*) * m_num_patches,
+                                      cudaMemcpyHostToDevice));
+                m_allocated = m_allocated | DEVICE;
+            }
+        }
+    }
+
+
+    char*     m_name;
+    uint32_t  m_num_attributes;
+    locationT m_allocated;
+    T**       m_h_attr;
+    T**       m_h_ptr_on_device;
+    T**       m_d_attr;
+    uint32_t  m_num_patches;
+    uint16_t* m_d_element_per_patch;
+    uint16_t* m_h_element_per_patch;
+    layoutT   m_layout;
+
+    constexpr static uint32_t m_block_size = 256;
+};
+
+/**
+ * @brief Attributes for faces
+ * @tparam T the attribute type
+ */
+template <class T>
+class FaceAttribute : public Attribute<T>
+{
+   public:
+    /**
+     * @brief Default constructor
+     */
+    FaceAttribute() = default;
+
+    /**
+     * @brief Main constructor to be used by RXMeshStatic not directly by the
+     * user
+     * @param name of the attribute
+     * @param face_per_patch number of faces owned per patch
+     * @param num_attributes number of attribute per face
+     * @param location where the attribute to be allocated
+     * @param layout memory layout in case of num_attributes>1
+     */
+    FaceAttribute(const char*                  name,
+                  const std::vector<uint16_t>& face_per_patch,
+                  const uint32_t               num_attributes,
+                  locationT                    location,
+                  const layoutT                layout)
+        : Attribute<T>(name)
+    {
+        this->init(face_per_patch, num_attributes, location, layout);
+    }
+
+    /**
+     * @brief Accessing face attribute using FaceHandle
+     * @param f_handle input face handle
+     * @param attr the attribute id
+     * @return const reference to the attribute
+     */
+    __host__ __device__ __forceinline__ T& operator()(
+        const FaceHandle f_handle,
+        const uint32_t   attr = 0) const
+    {
+        auto pl = f_handle.unpack();
+        return Attribute<T>::operator()(pl.first, pl.second, attr);
+    }
+
+
+    /**
+     * @brief Accessing face attribute using FaceHandle
+     * @param f_handle input face handle
+     * @param attr the attribute id
+     * @return non-const reference to the attribute
+     */
+    __host__ __device__ __forceinline__ T& operator()(const FaceHandle f_handle,
+                                                      const uint32_t   attr = 0)
+    {
+        auto pl = f_handle.unpack();
+        return Attribute<T>::operator()(pl.first, pl.second, attr);
+    }
+};
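+
+// Usage sketch (hypothetical): FaceAttribute is meant to be created through
+// RXMeshStatic rather than directly; assuming a factory such as
+// rx.add_face_attribute<T>(...) and a FaceHandle `fh` obtained from a query or
+// a per-face for_each, access looks like:
+//   auto f_attr = rx.add_face_attribute<float>("f_attr", 1);
+//   (*f_attr)(fh, 0) = 1.0f;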
+
+
+/**
+ * @brief Attributes for edges
+ * @tparam T the attribute type
+ */
+template <class T>
+class EdgeAttribute : public Attribute<T>
+{
+   public:
+    /**
+     * @brief Default constructor
+     */
+    EdgeAttribute() = default;
+
+    /**
+     * @brief Main constructor to be used by RXMeshStatic not directly by the
+     * user
+     * @param name of the attribute
+     * @param edge_per_patch number of edges owned per patch
+     * @param num_attributes number of attribute per edge
+     * @param location where the attribute to be allocated
+     * @param layout memory layout in case of num_attributes>1
+     */
+    EdgeAttribute(const char*                  name,
+                  const std::vector<uint16_t>& edge_per_patch,
+                  const uint32_t               num_attributes,
+                  locationT                    location,
+                  const layoutT                layout)
+        : Attribute<T>(name)
+    {
+        this->init(edge_per_patch, num_attributes, location, layout);
+    }
+
+    /**
+     * @brief Accessing edge attribute using EdgeHandle
+     * @param e_handle input edge handle
+     * @param attr the attribute id
+     * @return const reference to the attribute
+     */
+    __host__ __device__ __forceinline__ T& operator()(
+        const EdgeHandle e_handle,
+        const uint32_t   attr = 0) const
+    {
+        auto pl = e_handle.unpack();
+        return Attribute<T>::operator()(pl.first, pl.second, attr);
+    }
+
+    /**
+     * @brief Accessing edge attribute using EdgeHandle
+     * @param e_handle input edge handle
+     * @param attr the attribute id
+     * @return non-const reference to the attribute
+     */
+    __host__ __device__ __forceinline__ T& operator()(const EdgeHandle e_handle,
+                                                      const uint32_t   attr = 0)
+    {
+        auto pl = e_handle.unpack();
+        return Attribute<T>::operator()(pl.first, pl.second, attr);
+    }
+};
+
+
+/**
+ * @brief Attributes for vertices
+ * @tparam T the attribute type
+ */
+template <class T>
+class VertexAttribute : public Attribute<T>
+{
+   public:
+    /**
+     * @brief Default constructor
+     */
+    VertexAttribute() = default;
+
+    /**
+     * @brief Main constructor to be used by RXMeshStatic not directly by the
+     * user
+     * @param name of the attribute
+     * @param vertex_per_patch number of vertices owned per patch
+     * @param num_attributes number of attribute per vertex
+     * @param location where the attribute to be allocated
+     * @param layout memory layout in case of num_attributes > 1
+     */
+    VertexAttribute(const char*                  name,
+                    const std::vector<uint16_t>& vertex_per_patch,
+                    const uint32_t               num_attributes,
+                    locationT                    location,
+                    const layoutT                layout)
+        : Attribute<T>(name)
+    {
+        this->init(vertex_per_patch, num_attributes, location, layout);
+    }
+
+
+    /**
+     * @brief Accessing vertex attribute using VertexHandle
+     * @param v_handle input vertex handle
+     * @param attr the attribute id
+     * @return const reference to the attribute
+     */
+    __host__ __device__ __forceinline__ T& operator()(
+        const VertexHandle v_handle,
+        const uint32_t     attr = 0) const
+    {
+        auto pl = v_handle.unpack();
+        return Attribute<T>::operator()(pl.first, pl.second, attr);
+    }
+
+    /**
+     * @brief Accessing vertex attribute using VertexHandle
+     * @param v_handle input vertex handle
+     * @param attr the attribute id
+     * @return non-const reference to the attribute
+     */
+    __host__ __device__ __forceinline__ T& operator()(
+        const VertexHandle v_handle,
+        const uint32_t     attr = 0)
+    {
+        auto pl = v_handle.unpack();
+        return Attribute<T>::operator()(pl.first, pl.second, attr);
+    }
+};
+
+/**
+ * @brief Attribute container used by RXMeshStatic to manage a collection of
+ * attributes
+ */
+class AttributeContainer
+{
+   public:
+    /**
+     * @brief Default constructor
+     */
+    AttributeContainer() = default;
+
+    /**
+     * @brief Destructor which releases all attributes managed by this container
+     */
+    virtual ~AttributeContainer()
+    {
+        while (!m_attr_container.empty()) {
+            m_attr_container.back()->release();
+            m_attr_container.pop_back();
+        }
+    }
+
+    /**
+     * @brief Number of attributes managed by this container
+     */
+    size_t size()
+    {
+        return m_attr_container.size();
+    }
+
+    /**
+     * @brief get a list of the names of the attributes managed by this container
+     * @return a vector of the attribute names
+     */
+    std::vector<std::string> get_attribute_names() const
+    {
+        std::vector<std::string> names;
+        for (size_t i = 0; i < m_attr_container.size(); ++i) {
+            names.push_back(m_attr_container[i]->get_name());
+        }
+        return names;
+    }
+
+    /**
+     * @brief add a new attribute to be managed by this container
+     * @tparam AttrT attribute type
+     * @param name unique name given to the attribute
+     * @param element_per_patch number of mesh elements owned by each patch
+     * @param num_attributes number of attributes per mesh element
+     * @param location where the attributes will be allocated
+     * @param layout memory layout in case of num_attributes > 1
+     * @return a shared pointer to the attribute
+     */
+    template <typename AttrT>
+    std::shared_ptr<AttrT> add(const char*            name,
+                               std::vector<uint16_t>& element_per_patch,
+                               uint32_t               num_attributes,
+                               locationT              location,
+                               layoutT                layout)
+    {
+        if (does_exist(name)) {
+            RXMESH_WARN(
+                "AttributeContainer::add() adding an attribute with "
+                "name {} already exists!",
+                std::string(name));
+        }
+
+        auto new_attr = std::make_shared<AttrT>(
+            name, element_per_patch, num_attributes, location, layout);
+        m_attr_container.push_back(
+            std::dynamic_pointer_cast<AttributeBase>(new_attr));
+
+        return new_attr;
+    }
+
+    /**
+     * @brief Check if an attribute exists
+     * @param name of the attribute
+     */
+    bool does_exist(const char* name)
+    {
+        for (size_t i = 0; i < m_attr_container.size(); ++i) {
+            if (!strcmp(m_attr_container[i]->get_name(), name)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * @brief remove an attribute and release its memory
+     * @param name of the attribute
+     */
+    void remove(const char* name)
+    {
+        for (auto it = m_attr_container.begin(); it != m_attr_container.end();
+             ++it) {
+
+            if (!strcmp((*it)->get_name(), name)) {
+                (*it)->release(LOCATION_ALL);
+                m_attr_container.erase(it);
+                break;
+            }
+        }
+    }
+
+   private:
+    std::vector<std::shared_ptr<AttributeBase>> m_attr_container;
+};
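+
+// Usage sketch (the container is normally owned by RXMeshStatic; the name and
+// the per-patch count vector below are illustrative):
+//   AttributeContainer container;
+//   auto v_attr = container.add<VertexAttribute<float>>(
+//       "v_color", vertex_per_patch, 3, LOCATION_ALL, AoS);
+//   bool exists = container.does_exist("v_color");  // true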
+
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/context.h b/include/rxmesh/context.h
new file mode 100644
index 00000000..d93a2d72
--- /dev/null
+++ b/include/rxmesh/context.h
@@ -0,0 +1,111 @@
+#pragma once
+
+#include <stdint.h>
+#include "rxmesh/patch_info.h"
+#include "rxmesh/util/macros.h"
+
+namespace rxmesh {
+
+/**
+ * @brief Context for the mesh parameters and pointers. Everything is allocated
+ * and managed by RXMesh. This class is meant to be a vehicle for copying various
+ * parameters to the device kernels.
+ */
+class Context
+{
+   public:
+    /**
+     * @brief Default constructor
+     */
+    Context()
+        : m_num_edges(0),
+          m_num_faces(0),
+          m_num_vertices(0),
+          m_num_patches(0),
+          m_patches_info(nullptr)
+    {
+    }
+
+    /**
+     * @brief initialize various members
+     * @param num_edges total number of edges in the mesh
+     * @param num_faces total number of faces in the mesh
+     * @param num_vertices total number of vertices in the mesh
+     * @param num_patches number of patches
+     * @param patches pointer to PatchInfo that contains different info about
+     * the patches
+     */
+    void init(const uint32_t num_edges,
+              const uint32_t num_faces,
+              const uint32_t num_vertices,
+              const uint32_t num_patches,
+              PatchInfo*     patches)
+    {
+
+        m_num_edges    = num_edges;
+        m_num_faces    = num_faces;
+        m_num_vertices = num_vertices;
+        m_num_patches  = num_patches;
+        m_patches_info = patches;
+    }
+
+    /**
+     * @brief Total number of edges in mesh
+     */
+    __device__ __forceinline__ uint32_t get_num_edges() const
+    {
+        return m_num_edges;
+    }
+
+    /**
+     * @brief Total number of faces in mesh
+     */
+    __device__ __forceinline__ uint32_t get_num_faces() const
+    {
+        return m_num_faces;
+    }
+
+    /**
+     * @brief Total number of vertices in mesh
+     */
+    __device__ __forceinline__ uint32_t get_num_vertices() const
+    {
+        return m_num_vertices;
+    }
+
+    /**
+     * @brief Total number of patches in mesh
+     */
+    __device__ __forceinline__ uint32_t get_num_patches() const
+    {
+        return m_num_patches;
+    }
+
+    /**
+     * @brief A pointer to device PatchInfo used to store various information
+     * about the patches
+     */
+    __device__ __forceinline__ PatchInfo* get_patches_info() const
+    {
+        return m_patches_info;
+    }
+
+    /**
+     * @brief Unpack an edge to its edge ID and direction
+     * @param edge_dir The input packed edge as stored in PatchInfo and
+     * internally in RXMesh
+     * @param edge The unpacked edge ID
+     * @param dir The unpacked edge direction
+     */
+    static __device__ __host__ __forceinline__ void
+    unpack_edge_dir(const uint16_t edge_dir, uint16_t& edge, flag_t& dir)
+    {
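+        // e.g., edge_dir = 13 (0b1101) gives edge = 6 and dir = 1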
+        dir  = (edge_dir & 1) != 0;
+        edge = edge_dir >> 1;
+    }
+
+   private:
+    uint32_t   m_num_edges, m_num_faces, m_num_vertices, m_num_patches;
+    PatchInfo* m_patches_info;
+};
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/handle.h b/include/rxmesh/handle.h
new file mode 100644
index 00000000..19f0b7de
--- /dev/null
+++ b/include/rxmesh/handle.h
@@ -0,0 +1,284 @@
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "rxmesh/local.h"
+#include "rxmesh/patch_info.h"
+#include "rxmesh/util/macros.h"
+
+namespace rxmesh {
+
+namespace detail {
+/**
+ * @brief Return the unique index of a local mesh element composed of the
+ * patch id and the local index
+ *
+ * @param local_id the local within-patch mesh element id
+ * @param patch_id the patch owning the mesh element
+ * @return the 64-bit unique id (patch id in the high 32 bits, local id in the
+ * low bits)
+ */
+uint64_t __device__ __host__ __forceinline__ unique_id(const uint16_t local_id,
+                                                       const uint32_t patch_id)
+{
+    uint64_t ret = patch_id;
+    ret          = (ret << 32);
+    ret |= local_id;
+    return ret;
+}
+
+/**
+ * @brief unpack a 64-bit uint into its high and low 32 bits. The low 32 bits
+ * are cast to 16 bits. This is used to convert the unique id to its local id
+ * (low 16 bits) and patch id (high 32 bits)
+ * @param uid unique id
+ * @return a std::pair storing the patch id and local id
+ */
+std::pair<uint32_t, uint16_t> __device__ __host__ __forceinline__
+unpack(uint64_t uid)
+{
+    uint16_t local_id = uid & ((1 << 16) - 1);
+    uint32_t patch_id = uid >> 32;
+    return std::make_pair(patch_id, local_id);
+}
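+
+// For example, unique_id(7, 3) packs to 0x0000000300000007, and unpack() of
+// that value returns {3, 7}.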
+
+}  // namespace detail
+
+/**
+ * @brief vertex identifier. It is a unique handle for each vertex equipped with
+ * operator==. It can be used to access mesh (vertex) attributes
+ */
+struct VertexHandle
+{
+    using LocalT = LocalVertexT;
+
+    /**
+     * @brief Default constructor
+     */
+    __device__ __host__ VertexHandle() : m_handle(INVALID64)
+    {
+    }
+
+    /**
+     * @brief Constructor meant to be used internally by RXMesh and
+     * query_dispatcher
+     * @param patch_id the patch where the vertex belongs
+     * @param vertex_local_id the vertex local index within the patch
+     */
+    __device__ __host__ VertexHandle(uint32_t     patch_id,
+                                     LocalVertexT vertex_local_id)
+        : m_handle(detail::unique_id(vertex_local_id.id, patch_id))
+    {
+    }
+
+    /**
+     * @brief Operator ==
+     */
+    bool __device__ __host__ __inline__ operator==(
+        const VertexHandle& rhs) const
+    {
+        return m_handle == rhs.m_handle;
+    }
+
+    /**
+     * @brief Operator !=
+     */
+    bool __device__ __host__ __inline__ operator!=(
+        const VertexHandle& rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    /**
+     * @brief Check if the vertex is valid i.e., has been initialized by RXMesh
+     */
+    bool __device__ __host__ __inline__ is_valid() const
+    {
+        return m_handle != INVALID64;
+    }
+
+    /**
+     * @brief The unique identifier that represents the vertex
+     */
+    uint64_t __device__ __host__ __inline__ unique_id() const
+    {
+        return m_handle;
+    }
+
+    /**
+     * @brief Unpack the handle to its patch id and vertex local index within
+     * the patch
+     */
+    std::pair<uint32_t, uint16_t> __device__ __host__ __inline__ unpack() const
+    {
+        return detail::unpack(m_handle);
+    }
+
+   private:
+    uint64_t m_handle;
+};
+
+/**
+ * @brief print vertex unique_id to ostream
+ */
+inline std::ostream& operator<<(std::ostream& os, VertexHandle v_handle)
+{
+    return (os << 'v' << v_handle.unique_id());
+}
+
+/**
+ * @brief edge identifier. It is a unique handle for each edge equipped with
+ * operator==. It can be used to access mesh (edge) attributes
+ */
+struct EdgeHandle
+{
+    using LocalT = LocalEdgeT;
+
+    /**
+     * @brief Default constructor
+     */
+    __device__ __host__ EdgeHandle() : m_handle(INVALID64)
+    {
+    }
+
+    /**
+     * @brief Constructor meant to be used internally by RXMesh and
+     * query_dispatcher
+     * @param patch_id the patch where the edge belongs
+     * @param edge_local_id the edge local index within the patch
+     */
+    __device__ __host__ EdgeHandle(uint32_t patch_id, LocalEdgeT edge_local_id)
+        : m_handle(detail::unique_id(edge_local_id.id, patch_id))
+    {
+    }
+
+    /**
+     * @brief Operator ==
+     */
+    bool __device__ __host__ __inline__ operator==(const EdgeHandle& rhs) const
+    {
+        return m_handle == rhs.m_handle;
+    }
+
+
+    /**
+     * @brief Operator !=
+     */
+    bool __device__ __host__ __inline__ operator!=(const EdgeHandle& rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    /**
+     * @brief Check if the edge is valid i.e., has been initialized by RXMesh
+     */
+    bool __device__ __host__ __inline__ is_valid() const
+    {
+        return m_handle != INVALID64;
+    }
+
+    /**
+     * @brief The unique identifier that represents the edge
+     */
+    uint64_t __device__ __host__ __inline__ unique_id() const
+    {
+        return m_handle;
+    }
+
+    /**
+     * @brief Unpack the handle to its patch id and edge local index within
+     * the patch
+     */
+    std::pair<uint32_t, uint16_t> __device__ __host__ __inline__ unpack() const
+    {
+        return detail::unpack(m_handle);
+    }
+
+   private:
+    uint64_t m_handle;
+};
+
+/**
+ * @brief print edge unique_id to ostream
+ */
+inline std::ostream& operator<<(std::ostream& os, EdgeHandle e_handle)
+{
+    return (os << 'e' << e_handle.unique_id());
+}
+
+/**
+ * @brief face identifier. It is a unique handle for each face equipped with
+ * operator==. It can be used to access mesh (face) attributes
+ */
+struct FaceHandle
+{
+    using LocalT = LocalFaceT;
+
+    /**
+     * @brief Default constructor
+     */
+    __device__ __host__ FaceHandle() : m_handle(INVALID64)
+    {
+    }
+
+    /**
+     * @brief Constructor meant to be used internally by RXMesh and
+     * query_dispatcher
+     * @param patch_id the patch where the face belongs
+     * @param face_local_id the face local index within the patch
+     */
+    __device__ __host__ FaceHandle(uint32_t patch_id, LocalFaceT face_local_id)
+        : m_handle(detail::unique_id(face_local_id.id, patch_id))
+    {
+    }
+
+    /**
+     * @brief Operator ==
+     */
+    bool __device__ __host__ __inline__ operator==(const FaceHandle& rhs) const
+    {
+        return m_handle == rhs.m_handle;
+    }
+
+    /**
+     * @brief Operator !=
+     */
+    bool __device__ __host__ __inline__ operator!=(const FaceHandle& rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    /**
+     * @brief Check if the face is valid i.e., has been initialized by RXMesh
+     */
+    bool __device__ __host__ __inline__ is_valid() const
+    {
+        return m_handle != INVALID64;
+    }
+
+    /**
+     * @brief The unique identifier that represents the face
+     */
+    uint64_t __device__ __host__ __inline__ unique_id() const
+    {
+        return m_handle;
+    }
+
+    /**
+     * @brief Unpack the handle to its patch id and face local index within
+     * the patch
+     */
+    std::pair<uint32_t, uint16_t> __device__ __host__ __inline__ unpack() const
+    {
+        return detail::unpack(m_handle);
+    }
+
+   private:
+    uint64_t m_handle;
+};
+
+/**
+ * @brief print face unique_id to ostream
+ */
+inline std::ostream& operator<<(std::ostream& os, FaceHandle f_handle)
+{
+    return (os << 'f' << f_handle.unique_id());
+}
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/iterator.cuh b/include/rxmesh/iterator.cuh
new file mode 100644
index 00000000..5eda0450
--- /dev/null
+++ b/include/rxmesh/iterator.cuh
@@ -0,0 +1,142 @@
+#pragma once
+#include <stdint.h>
+#include "rxmesh/handle.h"
+
+namespace rxmesh {
+
+template <typename HandleT>
+struct Iterator
+{
+    using LocalT = typename HandleT::LocalT;
+
+    __device__ Iterator(const uint16_t  local_id,
+                        const LocalT*   patch_output,
+                        const uint16_t* patch_offset,
+                        const uint32_t  offset_size,
+                        const uint32_t  patch_id,
+                        const uint32_t  num_owned,
+                        const uint32_t* not_owned_patch,
+                        const uint16_t* not_owned_local_id,
+                        int             shift = 0)
+        : m_patch_output(patch_output),
+          m_patch_offset(patch_offset),
+          m_patch_id(patch_id),
+          m_num_owned(num_owned),
+          m_not_owned_patch(not_owned_patch),
+          m_not_owned_local_id(not_owned_local_id),
+          m_shift(shift)
+    {
+        set(local_id, offset_size);
+    }
+
+    Iterator(const Iterator& orig) = default;
+
+
+    __device__ uint16_t size() const
+    {
+        return m_end - m_begin;
+    }
+
+    __device__ HandleT operator[](const uint16_t i) const
+    {
+        assert(m_patch_output);
+        assert(i + m_begin < m_end);
+        uint16_t lid = (m_patch_output[m_begin + i].id) >> m_shift;
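+        // Local ids below m_num_owned are owned by this patch; larger ids
+        // index the not-owned tables, which map back to the owning patch and
+        // the element's local id there.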
+        if (lid < m_num_owned) {
+            return {m_patch_id, lid};
+        } else {
+            lid -= m_num_owned;
+            return {m_not_owned_patch[lid], m_not_owned_local_id[lid]};
+        }
+    }
+
+    __device__ HandleT operator*() const
+    {
+        assert(m_patch_output);
+        return ((*this)[m_current]);
+    }
+
+    __device__ HandleT back() const
+    {
+        return ((*this)[size() - 1]);
+    }
+
+    __device__ HandleT front() const
+    {
+        return ((*this)[0]);
+    }
+
+    __device__ Iterator& operator++()
+    {
+        // pre
+        m_current = (m_current + 1) % size();
+        return *this;
+    }
+    __device__ Iterator operator++(int)
+    {
+        // post
+        Iterator pre(*this);
+        m_current = (m_current + 1) % size();
+        return pre;
+    }
+
+    __device__ Iterator& operator--()
+    {
+        // pre
+        m_current = (m_current == 0) ? size() - 1 : m_current - 1;
+        return *this;
+    }
+
+    __device__ Iterator operator--(int)
+    {
+        // post
+        Iterator pre(*this);
+        m_current = (m_current == 0) ? size() - 1 : m_current - 1;
+        return pre;
+    }
+
+    __device__ bool operator==(const Iterator& rhs) const
+    {
+        return rhs.m_local_id == m_local_id && rhs.m_patch_id == m_patch_id &&
+               rhs.m_current == m_current;
+    }
+
+    __device__ bool operator!=(const Iterator& rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+
+   private:
+    const LocalT*   m_patch_output;
+    const uint16_t* m_patch_offset;
+    const uint32_t  m_patch_id;
+    const uint32_t* m_not_owned_patch;
+    const uint16_t* m_not_owned_local_id;
+    uint16_t        m_num_owned;
+    uint16_t        m_local_id;
+    uint16_t        m_begin;
+    uint16_t        m_end;
+    uint16_t        m_current;
+    int             m_shift;
+
+    __device__ void set(const uint16_t local_id, const uint32_t offset_size)
+    {
+        m_current  = 0;
+        m_local_id = local_id;
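+        // A zero offset_size means the query has a variable fan-out (e.g., a
+        // vertex's one-ring), so ranges come from the prefix-sum offsets in
+        // m_patch_offset; a non-zero offset_size means a fixed fan-out (e.g.,
+        // 3 vertices per face), so ranges are computed directly.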
+        if (offset_size == 0) {
+            m_begin = m_patch_offset[m_local_id];
+            m_end   = m_patch_offset[m_local_id + 1];
+        } else {
+            m_begin = m_local_id * offset_size;
+            m_end   = (m_local_id + 1) * offset_size;
+        }
+        assert(m_end > m_begin);
+    }
+};
+
+using VertexIterator = Iterator<VertexHandle>;
+using EdgeIterator   = Iterator<EdgeHandle>;
+using FaceIterator   = Iterator<FaceHandle>;
+
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/kernels/attribute.cuh b/include/rxmesh/kernels/attribute.cuh
new file mode 100644
index 00000000..6a4b3e90
--- /dev/null
+++ b/include/rxmesh/kernels/attribute.cuh
@@ -0,0 +1,93 @@
+#pragma once
+#include <cub/block/block_reduce.cuh>
+#include "rxmesh/util/macros.h"
+
+
+namespace rxmesh {
+
+template <typename T>
+class Attribute;
+
+namespace detail {
+
+template <class T, uint32_t blockSize>
+__device__ __forceinline__ void cub_block_sum(const T thread_val,
+                                              T*      d_block_output)
+{
+    typedef cub::BlockReduce<T, blockSize>       BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+    T block_sum = BlockReduce(temp_storage).Sum(thread_val);
+    if (threadIdx.x == 0) {
+        d_block_output[blockIdx.x] = block_sum;
+    }
+}
+
+template <class T, uint32_t blockSize>
+__launch_bounds__(blockSize) __global__
+    void norm2_kernel(const Attribute<T> X,
+                      const uint16_t*    d_element_per_patch,
+                      const uint32_t     num_patches,
+                      const uint32_t     num_attributes,
+                      T*                 d_block_output)
+{
+    uint32_t p_id = blockIdx.x;
+    if (p_id < num_patches) {
+        const uint16_t element_per_patch = d_element_per_patch[p_id];
+        T              thread_val        = 0;
+        for (uint16_t i = threadIdx.x; i < element_per_patch; i += blockSize) {
+            for (uint32_t j = 0; j < num_attributes; ++j) {
+                const T val = X(p_id, i, j);
+                thread_val += val * val;
+            }
+        }
+
+        cub_block_sum<T, blockSize>(thread_val, d_block_output);
+    }
+}
+
+
+template <typename T, uint32_t blockSize>
+__launch_bounds__(blockSize) __global__
+    void dot_kernel(const Attribute<T> X,
+                    const Attribute<T> Y,
+                    const uint16_t*    d_element_per_patch,
+                    const uint32_t     num_patches,
+                    const uint32_t     num_attributes,
+                    T*                 d_block_output)
+{
+    assert(X.get_num_attributes() == Y.get_num_attributes());
+
+    uint32_t p_id = blockIdx.x;
+    if (p_id < num_patches) {
+        const uint16_t element_per_patch = d_element_per_patch[p_id];
+        T              thread_val        = 0;
+        for (uint16_t i = threadIdx.x; i < element_per_patch; i += blockSize) {
+            for (uint32_t j = 0; j < num_attributes; ++j) {
+                thread_val += X(p_id, i, j) * Y(p_id, i, j);
+            }
+        }
+
+        cub_block_sum<T, blockSize>(thread_val, d_block_output);
+    }
+}
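+
+// Note: norm2_kernel and dot_kernel each leave one partial sum per patch in
+// d_block_output; the caller is expected to run a second reduction over these
+// partial results to obtain the final scalar.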
+
+template <typename T>
+__global__ void memset_attribute(const Attribute<T> attr,
+                                 const T            value,
+                                 const uint16_t*    d_element_per_patch,
+                                 const uint32_t     num_patches,
+                                 const uint32_t     num_attributes)
+{
+    uint32_t p_id = blockIdx.x;
+    if (p_id < num_patches) {
+        const uint16_t element_per_patch = d_element_per_patch[p_id];
+        for (uint16_t i = threadIdx.x; i < element_per_patch; i += blockDim.x) {
+            for (uint32_t j = 0; j < num_attributes; ++j) {
+                attr(p_id, i, j) = value;
+            }
+        }
+    }
+}
+
+}  // namespace detail
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/kernels/collective.cuh b/include/rxmesh/kernels/collective.cuh
index 1b152e93..43baec8c 100644
--- a/include/rxmesh/kernels/collective.cuh
+++ b/include/rxmesh/kernels/collective.cuh
@@ -4,9 +4,9 @@
 #include <cub/cub.cuh>
 #include "rxmesh/util/macros.h"
 
-namespace RXMESH {
+namespace rxmesh {
 /**
- * cub_block_exclusive_sum()
+ * @brief Compute block-wide exclusive sum using CUB
  */
 template <typename T, uint32_t blockThreads>
 __device__ __forceinline__ void cub_block_exclusive_sum(T*             data,
@@ -94,4 +94,4 @@ __device__ __forceinline__ void cub_block_exclusive_sum(T*             data,
     }*/
 }
 
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/kernels/debug.cuh b/include/rxmesh/kernels/debug.cuh
index 6224702d..2cc4754a 100644
--- a/include/rxmesh/kernels/debug.cuh
+++ b/include/rxmesh/kernels/debug.cuh
@@ -2,7 +2,7 @@
 #include <stdio.h>
 #include "cuda_runtime.h"
 
-namespace RXMESH {
+namespace rxmesh {
 
 /**
  * print_arr_uint()
@@ -11,7 +11,7 @@ template <typename T>
 __device__ void print_arr_uint(char     msg[],
                                uint32_t size,
                                T*       arr,
-                               uint32_t block_id = 0,
+                               uint32_t block_id  = 0,
                                uint32_t thread_id = 0)
 {
     if (blockIdx.x == block_id && threadIdx.x == thread_id) {
@@ -59,4 +59,4 @@ __device__ __forceinline__ unsigned total_smem_size()
     asm volatile("mov.u32 %0, %total_smem_size;" : "=r"(ret));
     return ret;
 }
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/kernels/for_each.cuh b/include/rxmesh/kernels/for_each.cuh
new file mode 100644
index 00000000..3606eac7
--- /dev/null
+++ b/include/rxmesh/kernels/for_each.cuh
@@ -0,0 +1,54 @@
+#pragma once
+#include "rxmesh/patch_info.h"
+
+namespace rxmesh {
+namespace detail {
+template <typename LambdaT>
+__global__ void for_each_vertex(const uint32_t   num_patches,
+                                const PatchInfo* patch_info,
+                                LambdaT          apply)
+{
+    const uint32_t p_id = blockIdx.x;
+    if (p_id < num_patches) {
+        const uint16_t num_v = patch_info[p_id].num_owned_vertices;
+        for (uint16_t v = threadIdx.x; v < num_v; v += blockDim.x) {
+            const VertexHandle v_handle(p_id, v);
+            apply(v_handle);
+        }
+    }
+}
+
+
+template <typename LambdaT>
+__global__ void for_each_edge(const uint32_t   num_patches,
+                              const PatchInfo* patch_info,
+                              LambdaT          apply)
+{
+    const uint32_t p_id = blockIdx.x;
+    if (p_id < num_patches) {
+        const uint16_t num_e = patch_info[p_id].num_owned_edges;
+        for (uint16_t e = threadIdx.x; e < num_e; e += blockDim.x) {
+            const EdgeHandle e_handle(p_id, e);
+            apply(e_handle);
+        }
+    }
+}
+
+
+template <typename LambdaT>
+__global__ void for_each_face(const uint32_t   num_patches,
+                              const PatchInfo* patch_info,
+                              LambdaT          apply)
+{
+    const uint32_t p_id = blockIdx.x;
+    if (p_id < num_patches) {
+        const uint16_t num_f = patch_info[p_id].num_owned_faces;
+        for (uint16_t f = threadIdx.x; f < num_f; f += blockDim.x) {
+            const FaceHandle f_handle(p_id, f);
+            apply(f_handle);
+        }
+    }
+}
+
+}  // namespace detail
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/kernels/get_arch.cuh b/include/rxmesh/kernels/get_arch.cuh
index 769ab88f..9dd323f8 100644
--- a/include/rxmesh/kernels/get_arch.cuh
+++ b/include/rxmesh/kernels/get_arch.cuh
@@ -2,7 +2,7 @@
 #include <cuda_runtime.h>
 #include "rxmesh/util/macros.h"
 
-namespace RXMESH {
+namespace rxmesh {
 __global__ static void get_cude_arch_k(int* d_arch)
 {
 
@@ -24,4 +24,4 @@ inline int cuda_arch()
     cudaFree(d_arch);
     return h_arch;
 }
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/kernels/loader.cuh b/include/rxmesh/kernels/loader.cuh
new file mode 100644
index 00000000..9f4fb62c
--- /dev/null
+++ b/include/rxmesh/kernels/loader.cuh
@@ -0,0 +1,285 @@
+#pragma once
+
+#include <assert.h>
+#include <stdint.h>
+#include "rxmesh/context.h"
+#include "rxmesh/local.h"
+#include "rxmesh/types.h"
+
+namespace rxmesh {
+
+template <uint32_t blockThreads>
+__device__ __forceinline__ void load_uint16(const uint16_t* in,
+                                            const uint16_t  size,
+                                            uint16_t*       out)
+{
+    const uint32_t  size32    = size / 2;
+    const uint32_t  remainder = size % 2;
+    const uint32_t* in32     = reinterpret_cast<const uint32_t*>(in);
+    uint32_t*       out32    = reinterpret_cast<uint32_t*>(out);
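+
+    // Copy pairs of uint16 values as single uint32 loads/stores for better
+    // memory throughput; a trailing odd element is handled separately below.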
+
+    for (uint32_t i = threadIdx.x; i < size32; i += blockThreads) {
+        uint32_t a = in32[i];
+        out32[i]   = a;
+    }
+
+    if (remainder != 0) {
+        if (threadIdx.x == 0) {
+            out[size - 1] = in[size - 1];
+        }
+    }
+}
+
+
+/**
+ * @brief load the patch FE
+ * @param patch_info input patch info
+ * @param fe output FE
+ * @return
+ */
+template <uint32_t blockThreads>
+__device__ __forceinline__ void load_patch_FE(const PatchInfo& patch_info,
+                                              LocalEdgeT*      fe)
+{
+    load_uint16<blockThreads>(reinterpret_cast<const uint16_t*>(patch_info.fe),
+                              patch_info.num_faces * 3,
+                              reinterpret_cast<uint16_t*>(fe));
+}
+
+/**
+ * @brief load the patch EV
+ * @param patch_info input patch info
+ * @param ev output EV
+ * @return
+ */
+template <uint32_t blockThreads>
+__device__ __forceinline__ void load_patch_EV(const PatchInfo& patch_info,
+                                              LocalVertexT*    ev)
+{
+    const uint32_t  num_edges = patch_info.num_edges;
+    const uint32_t* input_ev32 =
+        reinterpret_cast<const uint32_t*>(patch_info.ev);
+    uint32_t* output_ev32 = reinterpret_cast<uint32_t*>(ev);
+#pragma unroll 2
+    for (uint32_t i = threadIdx.x; i < num_edges; i += blockThreads) {
+        uint32_t a     = input_ev32[i];
+        output_ev32[i] = a;
+    }
+}
+
+/**
+ * @brief load the patch topology i.e., EV and FE
+ * @param patch_info input patch info
+ * @param load_ev input indicates if we should load EV
+ * @param load_fe input indicates if we should load FE
+ * @param s_ev where EV will be loaded
+ * @param s_fe where FE will be loaded
+ * @return
+ */
+template <uint32_t blockThreads>
+__device__ __forceinline__ void load_mesh(const PatchInfo& patch_info,
+                                          const bool       load_ev,
+                                          const bool       load_fe,
+                                          LocalVertexT*&   s_ev,
+                                          LocalEdgeT*&     s_fe)
+{
+
+    if (load_ev) {
+        load_patch_EV<blockThreads>(patch_info, s_ev);
+    }
+    // load patch faces
+    if (load_fe) {
+        if (load_ev) {
+            // if we loaded the edges, then we need to move where
+            // s_fe is pointing at to avoid overwrite
+            s_fe =
+                reinterpret_cast<LocalEdgeT*>(&s_ev[patch_info.num_edges * 2]);
+        }
+        load_patch_FE<blockThreads>(patch_info, s_fe);
+    }
+}
+
+template <uint32_t blockThreads>
+__device__ __forceinline__ void load_not_owned_local_id(
+    const uint16_t  num_not_owned,
+    uint16_t*       output_not_owned_local_id,
+    const uint16_t* input_not_owned_local_id)
+{
+    load_uint16<blockThreads>(
+        input_not_owned_local_id, num_not_owned, output_not_owned_local_id);
+}
+
+template <uint32_t blockThreads>
+__device__ __forceinline__ void load_not_owned_patch(
+    const uint16_t  num_not_owned,
+    uint32_t*       output_not_owned_patch,
+    const uint32_t* input_not_owned_patch)
+{
+    for (uint32_t i = threadIdx.x; i < num_not_owned; i += blockThreads) {
+        output_not_owned_patch[i] = input_not_owned_patch[i];
+    }
+}
+
+/**
+ * @brief Load the local id and patch of the not-owned vertices, edges, or faces
+ * based on query op.
+ * @param patch_info input patch info
+ * @param not_owned_local_id output local id
+ * @param not_owned_patch output patch id
+ * @param num_owned output number of owned mesh elements
+ */
+template <Op op, uint32_t blockThreads>
+__device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info,
+                                               uint16_t*& not_owned_local_id,
+                                               uint32_t*& not_owned_patch,
+                                               uint16_t&  num_owned)
+{
+    uint32_t num_not_owned = 0;
+    switch (op) {
+        case Op::VV: {
+            num_owned     = patch_info.num_owned_vertices;
+            num_not_owned = patch_info.num_vertices - num_owned;
+
+            // should be 4*patch_info.num_edges but VV (offset and values) are
+            // stored as uint16_t and not_owned_patch is uint32_t* so we need to
+            // shift the pointer only by half this amount
+            not_owned_patch = not_owned_patch + 2 * patch_info.num_edges;
+            not_owned_local_id =
+                reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
+            load_not_owned_patch<blockThreads>(
+                num_not_owned, not_owned_patch, patch_info.not_owned_patch_v);
+            load_not_owned_local_id<blockThreads>(
+                num_not_owned,
+                not_owned_local_id,
+                reinterpret_cast<uint16_t*>(patch_info.not_owned_id_v));
+            break;
+        }
+        case Op::VE: {
+            num_owned     = patch_info.num_owned_edges;
+            num_not_owned = patch_info.num_edges - num_owned;
+
+            // should be 4*patch_info.num_edges but VE (offset and values) are
+            // stored as uint16_t and not_owned_patch is uint32_t* so we need to
+            // shift the pointer only by half this amount
+            not_owned_patch = not_owned_patch + 2 * patch_info.num_edges;
+            not_owned_local_id =
+                reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
+            load_not_owned_patch<blockThreads>(
+                num_not_owned, not_owned_patch, patch_info.not_owned_patch_e);
+            load_not_owned_local_id<blockThreads>(
+                num_not_owned,
+                not_owned_local_id,
+                reinterpret_cast<uint16_t*>(patch_info.not_owned_id_e));
+            break;
+        }
+        case Op::VF: {
+            num_owned     = patch_info.num_owned_faces;
+            num_not_owned = patch_info.num_faces - num_owned;
+
+            uint32_t shift = DIVIDE_UP(
+                3 * patch_info.num_faces + std::max(3 * patch_info.num_faces,
+                                                    2 * patch_info.num_edges),
+                2);
+            not_owned_patch = not_owned_patch + shift;
+            not_owned_local_id =
+                reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
+            load_not_owned_patch<blockThreads>(
+                num_not_owned, not_owned_patch, patch_info.not_owned_patch_f);
+            load_not_owned_local_id<blockThreads>(
+                num_not_owned,
+                not_owned_local_id,
+                reinterpret_cast<uint16_t*>(patch_info.not_owned_id_f));
+            break;
+        }
+        case Op::FV: {
+            num_owned     = patch_info.num_owned_vertices;
+            num_not_owned = patch_info.num_vertices - num_owned;
+
+            assert(2 * patch_info.num_edges >= (1 + 2) * num_not_owned);
+            not_owned_local_id =
+                reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
+            load_not_owned_patch<blockThreads>(
+                num_not_owned, not_owned_patch, patch_info.not_owned_patch_v);
+            load_not_owned_local_id<blockThreads>(
+                num_not_owned,
+                not_owned_local_id,
+                reinterpret_cast<uint16_t*>(patch_info.not_owned_id_v));
+            break;
+        }
+        case Op::FE: {
+            num_owned     = patch_info.num_owned_edges;
+            num_not_owned = patch_info.num_edges - num_owned;
+
+            // should be 3*patch_info.num_faces but FE is stored as uint16_t and
+            // not_owned_patch is uint32_t* so we need to shift the pointer only
+            // by half this amount
+            not_owned_patch =
+                not_owned_patch + DIVIDE_UP(3 * patch_info.num_faces, 2);
+            not_owned_local_id =
+                reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
+            load_not_owned_patch<blockThreads>(
+                num_not_owned, not_owned_patch, patch_info.not_owned_patch_e);
+            load_not_owned_local_id<blockThreads>(
+                num_not_owned,
+                not_owned_local_id,
+                reinterpret_cast<uint16_t*>(patch_info.not_owned_id_e));
+            break;
+        }
+        case Op::FF: {
+            num_owned     = patch_info.num_owned_faces;
+            num_not_owned = patch_info.num_faces - num_owned;
+
+            not_owned_local_id =
+                reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
+            load_not_owned_patch<blockThreads>(
+                num_not_owned, not_owned_patch, patch_info.not_owned_patch_f);
+            load_not_owned_local_id<blockThreads>(
+                num_not_owned,
+                not_owned_local_id,
+                reinterpret_cast<uint16_t*>(patch_info.not_owned_id_f));
+            break;
+        }
+        case Op::EV: {
+            num_owned     = patch_info.num_owned_vertices;
+            num_not_owned = patch_info.num_vertices - num_owned;
+
+            // should be 2*patch_info.num_edges but EV is stored as uint16_t and
+            // not_owned_patch is uint32_t* so we need to shift the pointer only
+            // by num_edges
+            not_owned_patch = not_owned_patch + patch_info.num_edges;
+            not_owned_local_id =
+                reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
+            load_not_owned_patch<blockThreads>(
+                num_not_owned, not_owned_patch, patch_info.not_owned_patch_v);
+            load_not_owned_local_id<blockThreads>(
+                num_not_owned,
+                not_owned_local_id,
+                reinterpret_cast<uint16_t*>(patch_info.not_owned_id_v));
+            break;
+        }
+        case Op::EF: {
+            num_owned     = patch_info.num_owned_faces;
+            num_not_owned = patch_info.num_faces - num_owned;
+
+            // should be 6*patch_info.num_faces but EF (offset and values) are
+            // stored as uint16_t and not_owned_patch is uint32_t* so we need to
+            // shift the pointer only by half this amount
+            not_owned_patch = not_owned_patch + 3 * patch_info.num_faces;
+            not_owned_local_id =
+                reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
+            load_not_owned_patch<blockThreads>(
+                num_not_owned, not_owned_patch, patch_info.not_owned_patch_f);
+            load_not_owned_local_id<blockThreads>(
+                num_not_owned,
+                not_owned_local_id,
+                reinterpret_cast<uint16_t*>(patch_info.not_owned_id_f));
+            break;
+        }
+        default: {
+            assert(1 != 1);
+            break;
+        }
+    }
+}
+
+}  // namespace rxmesh
diff --git a/include/rxmesh/kernels/prototype.cuh b/include/rxmesh/kernels/prototype.cuh
deleted file mode 100644
index 73ab7c24..00000000
--- a/include/rxmesh/kernels/prototype.cuh
+++ /dev/null
@@ -1,63 +0,0 @@
-#pragma once
-#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh"
-namespace RXMESH {
-namespace detail {
-
-/**
- * query_prototype() represents the minimal user function for op query.
- * This function is only used in order to calculate the static shared memory and
- * registers used
- */
-template <Op op, uint32_t blockThreads>
-__launch_bounds__(blockThreads) __global__
-    static void query_prototype(const RXMeshContext context,
-                                const bool          oriented = false)
-{
-    static_assert(op != Op::EE, "Op::EE is not supported!");
-
-    auto user_lambda = [&](uint32_t id, RXMeshIterator& iter) {
-        printf("\n iter.size() = %u", iter.size());
-        for (uint32_t i = 0; i < iter.size(); ++i) {
-            printf("\n iter[%u] = %u", i, iter[i]);
-        }
-    };
-
-    query_block_dispatcher<op, blockThreads>(context, user_lambda, oriented);
-}
-
-/**
- * higher_query_prototype() represents the minimal user function for higeher
- * queries. Higher we assume that all query of similar type. This function is
- * only used in order to calculate the static shared memory and registers used/
- */
-template <Op op, uint32_t blockThreads>
-__launch_bounds__(blockThreads) __global__
-    static void higher_query_prototype(const RXMeshContext context,
-                                       const bool          oriented = false)
-{
-    static_assert(op != Op::EE, "Op::EE is not supported!");
-
-    uint32_t thread_element;
-    auto     first_ring = [&](uint32_t id, RXMeshIterator& iter) {
-        thread_element = id;
-        printf("\n iter.size() = %u", iter.size());
-        for (uint32_t i = 0; i < iter.size(); ++i) {
-            printf("\n iter[%u] = %u", i, iter[i]);
-        }
-    };
-
-    query_block_dispatcher<op, blockThreads>(context, first_ring, oriented);
-
-    auto n_ring = [&](uint32_t id, RXMeshIterator& iter) {
-        printf("\n iter.size() = %u", iter.size());
-        for (uint32_t i = 0; i < iter.size(); ++i) {
-            printf("\n iter[%u] = %u", i, iter[i]);
-        }
-    };
-
-    query_block_dispatcher<op, blockThreads>(context, thread_element, n_ring,
-                                             oriented);
-}
-
-}  // namespace detail
-}  // namespace RXMESH
\ No newline at end of file
diff --git a/include/rxmesh/kernels/query_dispatcher.cuh b/include/rxmesh/kernels/query_dispatcher.cuh
new file mode 100644
index 00000000..ae3c1387
--- /dev/null
+++ b/include/rxmesh/kernels/query_dispatcher.cuh
@@ -0,0 +1,415 @@
+#pragma once
+#include <assert.h>
+#include <stdint.h>
+#include <cub/block/block_discontinuity.cuh>
+
+#include "rxmesh/context.h"
+#include "rxmesh/handle.h"
+#include "rxmesh/iterator.cuh"
+#include "rxmesh/kernels/collective.cuh"
+#include "rxmesh/kernels/debug.cuh"
+#include "rxmesh/kernels/loader.cuh"
+#include "rxmesh/kernels/rxmesh_queries.cuh"
+#include "rxmesh/types.h"
+#include "rxmesh/util/meta.h"
+
+namespace rxmesh {
+
+namespace detail {
+
+/**
+ * query_block_dispatcher()
+ */
+template <Op op, uint32_t blockThreads, typename activeSetT>
+__device__ __inline__ void query_block_dispatcher(const PatchInfo& patch_info,
+                                                  activeSetT compute_active_set,
+                                                  const bool oriented,
+                                                  uint32_t&  num_src_in_patch,
+                                                  uint16_t*& s_output_offset,
+                                                  uint16_t*& s_output_value,
+                                                  uint16_t&  num_owned,
+                                                  uint32_t*& not_owned_patch,
+                                                  uint16_t*& not_owned_local_id)
+{
+    static_assert(op != Op::EE, "Op::EE is not supported!");
+
+    constexpr bool load_fe = (op == Op::VF || op == Op::EE || op == Op::EF ||
+                              op == Op::FV || op == Op::FE || op == Op::FF);
+    constexpr bool load_ev = (op == Op::VV || op == Op::VE || op == Op::VF ||
+                              op == Op::EV || op == Op::FV);
+    static_assert(load_ev || load_fe,
+                  "At least faces or edges needs to be loaded");
+
+    // 1) Check if any of the mesh elements are in the active set
+    // input mapping does not need to be stored in shared memory since it will
+    // be read coalesced, we can rely on L1 cache here
+    num_src_in_patch = 0;
+    if constexpr (op == Op::VV || op == Op::VE || op == Op::VF) {
+        num_src_in_patch = patch_info.num_owned_vertices;
+    }
+    if constexpr (op == Op::EV || op == Op::EF) {
+        num_src_in_patch = patch_info.num_owned_edges;
+    }
+    if constexpr (op == Op::FV || op == Op::FE || op == Op::FF) {
+        num_src_in_patch = patch_info.num_owned_faces;
+    }
+
+    bool     is_active = false;
+    uint16_t local_id  = threadIdx.x;
+    while (local_id < num_src_in_patch) {
+        is_active =
+            is_active || compute_active_set({patch_info.patch_id, local_id});
+        local_id += blockThreads;
+    }
+
+    if (__syncthreads_or(is_active) == 0) {
+        // reset num_src_in_patch to zero to indicate that this block/patch has
+        // no work to do
+        num_src_in_patch = 0;
+        return;
+    }
+
+    // 2) Load the patch info
+    extern __shared__ uint16_t shrd_mem[];
+    LocalVertexT*              s_ev = reinterpret_cast<LocalVertexT*>(shrd_mem);
+    LocalEdgeT*                s_fe = reinterpret_cast<LocalEdgeT*>(shrd_mem);
+    load_mesh<blockThreads>(patch_info, load_ev, load_fe, s_ev, s_fe);
+
+    not_owned_patch    = reinterpret_cast<uint32_t*>(shrd_mem);
+    not_owned_local_id = shrd_mem;
+    num_owned          = 0;
+    // 3) Perform the query operation
+    if (oriented) {
+        assert(op == Op::VV);
+        if constexpr (op == Op::VV) {
+            __syncthreads();
+            v_v_oreinted<blockThreads>(patch_info,
+                                       s_output_offset,
+                                       s_output_value,
+                                       reinterpret_cast<uint16_t*>(s_ev));
+        }
+    } else {
+        if constexpr (!(op == Op::VV || op == Op::FV || op == Op::FF)) {
+            load_not_owned<op, blockThreads>(
+                patch_info, not_owned_local_id, not_owned_patch, num_owned);
+        }
+        __syncthreads();
+        query<blockThreads, op>(s_output_offset,
+                                s_output_value,
+                                reinterpret_cast<uint16_t*>(s_ev),
+                                reinterpret_cast<uint16_t*>(s_fe),
+                                patch_info.num_vertices,
+                                patch_info.num_edges,
+                                patch_info.num_faces);
+    }
+
+    // load not-owned local and patch id
+    if constexpr (op == Op::VV || op == Op::FV || op == Op::FF) {
+        // need to sync since we will overwrite things that are used in
+        // query
+        __syncthreads();
+        load_not_owned<op, blockThreads>(
+            patch_info, not_owned_local_id, not_owned_patch, num_owned);
+    }
+
+
+    __syncthreads();
+}
+
+
+/**
+ * query_block_dispatcher()
+ */
+template <Op op, uint32_t blockThreads, typename computeT, typename activeSetT>
+__device__ __inline__ void query_block_dispatcher(const Context& context,
+                                                  const uint32_t patch_id,
+                                                  computeT       compute_op,
+                                                  activeSetT compute_active_set,
+                                                  const bool oriented = false)
+{
+    // Extract the types of the input parameters of the compute lambda
+    // function. The first parameter should be Vertex/Edge/FaceHandle and the
+    // second parameter should be Vertex/Edge/FaceIterator
+
+    using ComputeTraits    = detail::FunctionTraits<computeT>;
+    using ComputeHandleT   = typename ComputeTraits::template arg<0>::type;
+    using ComputeIteratorT = typename ComputeTraits::template arg<1>::type;
+    using LocalT           = typename ComputeIteratorT::LocalT;
+
+    // Extract the type of the single input parameter of the active_set lambda
+    // function. It should be Vertex/Edge/FaceHandle and it should match the
+    // first parameter of the compute lambda function
+    using ActiveSetTraits  = detail::FunctionTraits<activeSetT>;
+    using ActiveSetHandleT = typename ActiveSetTraits::template arg<0>::type;
+    static_assert(
+        std::is_same_v<ActiveSetHandleT, ComputeHandleT>,
+        "First argument of compute_op lambda function should match the first "
+        "argument of active_set lambda function ");
+
+    static_assert(op != Op::EE, "Op::EE is not supported!");
+
+
+    assert(patch_id < context.get_num_patches());
+
+    uint32_t  num_src_in_patch = 0;
+    uint16_t* s_output_offset(nullptr);
+    uint16_t* s_output_value(nullptr);
+    uint16_t  num_owned;
+    uint32_t* not_owned_patch(nullptr);
+    uint16_t* not_owned_local_id(nullptr);
+
+    detail::template query_block_dispatcher<op, blockThreads>(
+        context.get_patches_info()[patch_id],
+        compute_active_set,
+        oriented,
+        num_src_in_patch,
+        s_output_offset,
+        s_output_value,
+        num_owned,
+        not_owned_patch,
+        not_owned_local_id);
+
+    // Call compute on the output in shared memory by looping over all
+    // source elements in this patch.
+
+    uint16_t local_id = threadIdx.x;
+    while (local_id < num_src_in_patch) {
+
+        assert(s_output_value);
+
+        if (compute_active_set({patch_id, local_id})) {
+            constexpr uint32_t fixed_offset =
+                ((op == Op::EV)                 ? 2 :
+                 (op == Op::FV || op == Op::FE) ? 3 :
+                                                  0);
+
+
+            ComputeHandleT   handle(patch_id, local_id);
+            ComputeIteratorT iter(local_id,
+                                  reinterpret_cast<LocalT*>(s_output_value),
+                                  s_output_offset,
+                                  fixed_offset,
+                                  patch_id,
+                                  num_owned,
+                                  not_owned_patch,
+                                  not_owned_local_id,
+                                  int(op == Op::FE));
+
+            compute_op(handle, iter);
+        }
+
+        local_id += blockThreads;
+    }
+}
+
+}  // namespace detail
+/**
+ * @brief The main query function to be called by the whole block. In this
+ * function, threads will be assigned to mesh elements which will be accessible
+ * through the input computation lambda function (compute_op). This function
+ * also provides a predicate to specify the active set, i.e., the set on which
+ * the query operation should be done. This is mainly used to skip the query
+ * on a subset of the input mesh elements, which may lead to better performance
+ * @tparam Op the type of query operation
+ * @tparam blockThreads the number of CUDA threads in the block
+ * @tparam computeT the type of compute lambda function (inferred)
+ * @tparam activeSetT the type of active set lambda function (inferred)
+ * @param context which stores various parameters needed for the query
+ * operation. The context can be obtained from RXMeshStatic
+ * @param compute_op the computation lambda function that will be executed by
+ * each thread in the block. This lambda function takes two input parameters:
+ * 1. Handle to the mesh element assigned to the thread. The handle type matches
+ * the source of the query (e.g., VertexHandle for VE query) 2. an iterator to
+ * the query output. The iterator type matches the type of the mesh element
+ * "iterated" on (e.g., EdgeIterator for VE query)
+ * @param compute_active_set a predicate used to specify the active set. This
+ * lambda function takes a single parameter, which is a handle of the same type
+ * as the input of the query operation (e.g., VertexHandle for VE query)
+ * @param oriented specifies if the query is oriented. Currently only the VV
+ * query supports oriented queries. FV, FE, and EV are oriented by default
+ */
+template <Op op, uint32_t blockThreads, typename computeT, typename activeSetT>
+__device__ __inline__ void query_block_dispatcher(const Context& context,
+                                                  computeT       compute_op,
+                                                  activeSetT compute_active_set,
+                                                  const bool oriented = false)
+{
+    if (blockIdx.x >= context.get_num_patches()) {
+        return;
+    }
+
+    detail::query_block_dispatcher<op, blockThreads>(
+        context, blockIdx.x, compute_op, compute_active_set, oriented);
+}
+
+
+/**
+ * @brief The main query function to be called by the whole block. In this
+ * function, threads will be assigned to mesh elements which will be accessible
+ * through the input computation lambda function (compute_op).
+ * @tparam Op the type of query operation
+ * @tparam blockThreads the number of CUDA threads in the block
+ * @tparam computeT the type of compute lambda function (inferred)
+ * @param context which stores various parameters needed for the query
+ * operation. The context can be obtained from RXMeshStatic
+ * @param compute_op the computation lambda function that will be executed by
+ * each thread in the block. This lambda function takes two input parameters:
+ * 1. Handle to the mesh element assigned to the thread. The handle type matches
+ * the source of the query (e.g., VertexHandle for VE query) 2. an iterator to
+ * the query output. The iterator type matches the type of the mesh element
+ * "iterated" on (e.g., EdgeIterator for VE query)
+ * @param oriented specifies if the query is oriented. Currently only the VV
+ * query supports oriented queries. FV, FE, and EV are oriented by default
+ */
+template <Op op, uint32_t blockThreads, typename computeT>
+__device__ __inline__ void query_block_dispatcher(const Context& context,
+                                                  computeT       compute_op,
+                                                  const bool oriented = false)
+{
+    // Extract the type of the first input parameters of the compute lambda
+    // function. It should be Vertex/Edge/FaceHandle
+    using ComputeTraits  = detail::FunctionTraits<computeT>;
+    using ComputeHandleT = typename ComputeTraits::template arg<0>::type;
+
+    query_block_dispatcher<op, blockThreads>(
+        context, compute_op, [](ComputeHandleT) { return true; }, oriented);
+}
+
+
+/**
+ * @brief This function is used to perform a query operation on a specific mesh
+ * element. This is only needed for higher query (e.g., 2-ring query) where the
+ * first query is done using query_block_dispatcher in which each thread is
+ * assigned to a mesh element. Subsequent queries should be handled by this
+ * function. This function should be called by the whole CUDA block.
+ * @tparam Op the type of query operation
+ * @tparam blockThreads the number of CUDA threads in the block
+ * @tparam computeT the type of compute lambda function (inferred)
+ * @tparam HandleT the type of input handle (inferred) which should match the
+ * input of the query operations (e.g., VertexHandle for VE query)
+ * @param context which stores various parameters needed for the query
+ * operation. The context can be obtained from RXMeshStatic
+ * @param src_id the input mesh element to the query. Inactive threads can
+ * simply pass HandleT() in which case they are skipped
+ * @param compute_op the computation lambda function that will be executed by
+ * the thread. This lambda function takes two input parameters:
+ * 1. HandleT which is the same as src_id 2. an iterator to the query output.
+ * The iterator type matches the type of the mesh element "iterated" on (e.g.,
+ * EdgeIterator for VE query)
+ * @param oriented specifies if the query is oriented. Currently only the VV
+ * query supports oriented queries. FV, FE, and EV are oriented by default
+ */
+template <Op op, uint32_t blockThreads, typename computeT, typename HandleT>
+__device__ __inline__ void higher_query_block_dispatcher(
+    const Context& context,
+    const HandleT  src_id,
+    computeT       compute_op,
+    const bool     oriented = false)
+{
+    using ComputeTraits    = detail::FunctionTraits<computeT>;
+    using ComputeIteratorT = typename ComputeTraits::template arg<1>::type;
+
+    // The whole block should be calling this function. If one thread is not
+    // participating, its src_id should be INVALID32
+
+    auto compute_active_set = [](HandleT) { return true; };
+
+    // the source and local id of the source mesh element
+    std::pair<uint32_t, uint16_t> pl = src_id.unpack();
+
+    // Here, we want to identify the set of unique patches for this thread
+    // block. We do this by first sorting the patches, computing the
+    // discontinuity head flags, and then letting threads with head flag == 1
+    // add their patches to the shared memory buffer that will contain the
+    // unique patches
+
+    __shared__ uint32_t s_block_patches[blockThreads];
+    __shared__ uint32_t s_num_patches;
+    if (threadIdx.x == 0) {
+        s_num_patches = 0;
+    }
+    typedef cub::BlockRadixSort<uint32_t, blockThreads, 1>  BlockRadixSort;
+    typedef cub::BlockDiscontinuity<uint32_t, blockThreads> BlockDiscontinuity;
+    union TempStorage
+    {
+        typename BlockRadixSort::TempStorage     sort_storage;
+        typename BlockDiscontinuity::TempStorage discont_storage;
+    };
+    __shared__ TempStorage all_temp_storage;
+    uint32_t               thread_data[1], thread_head_flags[1];
+    thread_data[0]       = pl.first;
+    thread_head_flags[0] = 0;
+    BlockRadixSort(all_temp_storage.sort_storage).Sort(thread_data);
+    BlockDiscontinuity(all_temp_storage.discont_storage)
+        .FlagHeads(thread_head_flags, thread_data, cub::Inequality());
+
+    if (thread_head_flags[0] == 1 && thread_data[0] != INVALID32) {
+        uint32_t id         = ::atomicAdd(&s_num_patches, uint32_t(1));
+        s_block_patches[id] = thread_data[0];
+    }
+
+    // We could eliminate the discontinuity operation and atomicAdd and instead
+    // use thrust::unique. However, this method causes an illegal memory
+    // access, which looks like a bug in thrust
+    /*__syncthreads();
+    // uniquify
+    uint32_t* new_end = thrust::unique(thrust::device, s_block_patches,
+                                       s_block_patches + blockThreads);
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+        s_num_patches = new_end - s_block_patches - 1;
+    }*/
+    __syncthreads();
+
+
+    for (uint32_t p = 0; p < s_num_patches; ++p) {
+
+        uint32_t patch_id = s_block_patches[p];
+
+        assert(patch_id < context.get_num_patches());
+
+        uint32_t  num_src_in_patch = 0;
+        uint16_t *s_output_offset(nullptr), *s_output_value(nullptr);
+        uint16_t  num_owned = 0;
+        uint16_t* not_owned_local_id(nullptr);
+        uint32_t* not_owned_patch(nullptr);
+
+        detail::template query_block_dispatcher<op, blockThreads>(
+            context.get_patches_info()[patch_id],
+            compute_active_set,
+            oriented,
+            num_src_in_patch,
+            s_output_offset,
+            s_output_value,
+            num_owned,
+            not_owned_patch,
+            not_owned_local_id);
+
+
+        if (pl.first == patch_id) {
+
+            constexpr uint32_t fixed_offset =
+                ((op == Op::EV)                 ? 2 :
+                 (op == Op::FV || op == Op::FE) ? 3 :
+                                                  0);
+
+            ComputeIteratorT iter(
+                pl.second,
+                reinterpret_cast<typename ComputeIteratorT::LocalT*>(
+                    s_output_value),
+                s_output_offset,
+                fixed_offset,
+                patch_id,
+                num_owned,
+                not_owned_patch,
+                not_owned_local_id,
+                int(op == Op::FE));
+
+            compute_op(src_id, iter);
+        }
+        __syncthreads();
+    }
+}
+
+
+}  // namespace rxmesh
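
For reference, a minimal usage sketch of the new dispatcher API above. It assumes the VertexHandle/VertexIterator names referenced in the documentation, that the iterator exposes size(), and that the kernel is launched with the block size and shared-memory size that RXMeshStatic computes (launch setup is omitted); it is a sketch under these assumptions, not a definitive implementation.

    #include "rxmesh/kernels/query_dispatcher.cuh"

    // Sum the valence of all owned vertices into d_total_valence
    template <uint32_t blockThreads>
    __global__ void total_valence(const rxmesh::Context context,
                                  uint32_t*             d_total_valence)
    {
        using namespace rxmesh;

        auto sum_valence = [&](VertexHandle vh, VertexIterator& iter) {
            // iter.size() is the number of vertices adjacent to vh
            ::atomicAdd(d_total_valence, uint32_t(iter.size()));
        };

        // VV query over the vertices owned by this block's patch; the overload
        // without an active-set predicate treats all elements as active
        query_block_dispatcher<Op::VV, blockThreads>(context, sum_valence);
    }

A hypothetical 2-ring sketch follows the same pattern as the deleted higher_query_prototype, but with the new handle-based API: each thread records its assigned vertex in the first query, then hands that handle to higher_query_block_dispatcher. Threads without an assigned vertex keep a default-constructed (invalid) handle and are skipped, as described in the documentation above.

    template <uint32_t blockThreads>
    __global__ void two_ring(const rxmesh::Context context)
    {
        using namespace rxmesh;

        // stays invalid for threads that are not assigned a vertex
        VertexHandle thread_vertex;

        auto first_ring = [&](VertexHandle vh, VertexIterator& iter) {
            // remember the (last) vertex assigned to this thread
            thread_vertex = vh;
        };
        query_block_dispatcher<Op::VV, blockThreads>(context, first_ring);

        auto second_ring = [&](VertexHandle vh, VertexIterator& iter) {
            printf("2-ring piece of size %u\n", uint32_t(iter.size()));
        };
        higher_query_block_dispatcher<Op::VV, blockThreads>(
            context, thread_vertex, second_ring);
    }
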
diff --git a/include/rxmesh/kernels/rxmesh_attribute.cuh b/include/rxmesh/kernels/rxmesh_attribute.cuh
deleted file mode 100644
index 4e6b319f..00000000
--- a/include/rxmesh/kernels/rxmesh_attribute.cuh
+++ /dev/null
@@ -1,87 +0,0 @@
-#pragma once
-#include <cub/block/block_reduce.cuh>
-#include "rxmesh/util/macros.h"
-namespace RXMESH {
-
-template <class T>
-class RXMeshAttribute;
-
-template <class T>
-__global__ void rxmesh_attribute_axpy(const RXMeshAttribute<T> X,
-                                      const T*                 alpha,
-                                      RXMeshAttribute<T>       Y,
-                                      const T*                 beta,
-                                      const uint32_t attribute_id = INVALID32)
-{
-    // Y = alpha*X + beta*Y
-    // if attribute is INVALID32, then the operation is applied to all
-    // attribute (one thread per mesh element on all attribute)
-    // otherwise, the operation is applied on only that attribute
-
-    // alpha and beta should be of size attributes per element if attribute ==
-    // INVALID32. Otherwise, they should point to a single variable
-
-    assert(X.get_num_mesh_elements() == Y.get_num_mesh_elements());
-    assert(X.get_num_attribute_per_element() ==
-           Y.get_num_attribute_per_element());
-
-    uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (idx < X.get_num_mesh_elements()) {
-
-        if (attribute_id == INVALID32) {
-            for (uint32_t attr = 0; attr < X.get_num_attribute_per_element();
-                 ++attr) {
-                Y(idx, attr) =
-                    alpha[attr] * X(idx, attr) + beta[attr] * Y(idx, attr);
-            }
-        } else {
-            Y(idx, attribute_id) = alpha[0] * X(idx, attribute_id) +
-                                   beta[0] * Y(idx, attribute_id);
-        }
-    }
-}
-
-
-template <class T, uint32_t blockSize>
-__global__ void rxmesh_attribute_norm2(const RXMeshAttribute<T> X,
-                                       const uint32_t           attribute_id,
-                                       T*                       d_block_output)
-{
-    uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
-    T        threa_val = 0;
-    if (idx < X.get_num_mesh_elements()) {
-        threa_val = X(idx, attribute_id);
-    }
-    threa_val *= threa_val;
-
-
-    typedef cub::BlockReduce<T, blockSize>       BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-    T block_sum = BlockReduce(temp_storage).Sum(threa_val);
-    if (threadIdx.x == 0) {
-        d_block_output[blockIdx.x] = block_sum;
-    }
-}
-
-
-template <class T, uint32_t blockSize>
-__global__ void rxmesh_attribute_dot(const RXMeshAttribute<T> X,
-                                     const RXMeshAttribute<T> Y,
-                                     const uint32_t           attribute_id,
-                                     T*                       d_block_output)
-{
-    uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
-    T        threa_val = 0;
-    if (idx < X.get_num_mesh_elements()) {
-        threa_val = X(idx, attribute_id) * Y(idx, attribute_id);
-    }
-
-    typedef cub::BlockReduce<T, blockSize>       BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-    T block_sum = BlockReduce(temp_storage).Sum(threa_val);
-    if (threadIdx.x == 0) {
-        d_block_output[blockIdx.x] = block_sum;
-    }
-}
-}  // namespace RXMESH
\ No newline at end of file
diff --git a/include/rxmesh/kernels/rxmesh_iterator.cuh b/include/rxmesh/kernels/rxmesh_iterator.cuh
deleted file mode 100644
index 486f7136..00000000
--- a/include/rxmesh/kernels/rxmesh_iterator.cuh
+++ /dev/null
@@ -1,128 +0,0 @@
-#pragma once
-#include <stdint.h>
-namespace RXMESH {
-
-struct RXMeshIterator
-{
-    __device__ RXMeshIterator(const uint16_t  local_id,
-                              const uint16_t* patch_output,
-                              const uint16_t* patch_offset,
-                              const uint32_t* output_ltog_map,
-                              const uint32_t  offset_size,
-                              const uint32_t  num_src_in_patch,
-                              int             shift = 0)
-        : m_patch_output(patch_output), m_patch_offset(patch_offset),
-          m_output_ltog_map(output_ltog_map),
-          m_num_src_in_patch(num_src_in_patch), m_shift(shift)
-    {
-        set(local_id, offset_size);
-    }
-
-    RXMeshIterator(const RXMeshIterator& orig) = default;
-
-    __device__ uint16_t local_id() const
-    {
-        return m_local_id;
-    }
-
-    __device__ uint16_t size() const
-    {
-        return m_end - m_begin;
-    }
-
-    __device__ uint16_t neighbour_local_id(uint32_t i) const
-    {
-        return m_patch_output[m_begin + i];
-    }
-
-    __device__ uint32_t operator[](const uint32_t i) const
-    {
-        assert(m_patch_output);
-        assert(m_output_ltog_map);
-        assert(i + m_begin < m_end);
-        return m_output_ltog_map[((m_patch_output[m_begin + i]) >> m_shift)];
-    }
-
-    __device__ uint32_t operator*() const
-    {
-        assert(m_patch_output);
-        assert(m_output_ltog_map);
-        return ((*this)[m_current]);
-    }
-
-    __device__ uint32_t back() const
-    {
-        return ((*this)[size() - 1]);
-    }
-
-    __device__ uint32_t front() const
-    {
-        return ((*this)[0]);
-    }
-
-    __device__ RXMeshIterator& operator++()
-    {
-        // pre
-        m_current = (m_current + 1) % size();
-        return *this;
-    }
-    __device__ const RXMeshIterator operator++(int)
-    {
-        // post
-        RXMeshIterator pre(*this);
-        m_current = (m_current + 1) % size();
-        return pre;
-    }
-
-    __device__ RXMeshIterator& operator--()
-    {
-        // pre
-        m_current = (m_current == 0) ? size() - 1 : m_current - 1;
-        return *this;
-    }
-
-    __device__ const RXMeshIterator operator--(int)
-    {
-        // post
-        RXMeshIterator pre(*this);
-        m_current = (m_current == 0) ? size() - 1 : m_current - 1;
-        return pre;
-    }
-
-    __device__ bool operator==(const RXMeshIterator& rhs) const
-    {
-        return rhs.m_local_id == m_local_id && rhs.m_current == m_current;
-    }
-
-    __device__ bool operator!=(const RXMeshIterator& rhs) const
-    {
-        return !(*this == rhs);
-    }
-
-
-    // private:
-    const uint16_t* m_patch_output;
-    const uint16_t* m_patch_offset;
-    const uint32_t* m_output_ltog_map;
-    uint16_t        m_local_id;
-    uint16_t        m_begin;
-    uint16_t        m_end;
-    uint16_t        m_current;
-    int             m_shift;
-    uint32_t        m_num_src_in_patch;
-
-    __device__ void set(const uint16_t local_id, const uint32_t offset_size)
-    {
-        m_current = 0;
-        m_local_id = local_id;
-        if (offset_size == 0) {
-            m_begin = m_patch_offset[m_local_id];
-            m_end = m_patch_offset[m_local_id + 1];
-        } else {
-            m_begin = m_local_id * offset_size;
-            m_end = (m_local_id + 1) * offset_size;
-        }
-        assert(m_end > m_begin);
-    }
-};
-}  // namespace RXMESH
\ No newline at end of file
diff --git a/include/rxmesh/kernels/rxmesh_loader.cuh b/include/rxmesh/kernels/rxmesh_loader.cuh
deleted file mode 100644
index a93e912c..00000000
--- a/include/rxmesh/kernels/rxmesh_loader.cuh
+++ /dev/null
@@ -1,171 +0,0 @@
-#pragma once
-
-#include <assert.h>
-#include <stdint.h>
-
-#include "rxmesh/rxmesh.h"
-#include "rxmesh/rxmesh_context.h"
-
-namespace RXMESH {
-
-/**
-* load_patch_ad_size()
-*/
-__device__ __forceinline__ void load_patch_ad_size(const RXMeshContext& context,
-                                                   const uint32_t       p_id,
-                                                   uint4&               ad_size,
-                                                   uint2& ad_size_ltog_v,
-                                                   uint2& ad_size_ltog_e,
-                                                   uint2& ad_size_ltog_f)
-{
-
-    ad_size.x = context.get_ad_size()[p_id].x;
-    ad_size.y = context.get_ad_size()[p_id].y;
-    ad_size.z = context.get_ad_size()[p_id].z;
-    ad_size.w = context.get_ad_size()[p_id].w;
-
-    ad_size_ltog_v = context.get_ad_size_ltog_v()[p_id];
-    ad_size_ltog_e = context.get_ad_size_ltog_e()[p_id];
-    ad_size_ltog_f = context.get_ad_size_ltog_f()[p_id];
-    assert(ad_size.y % 2 == 0);
-    assert(ad_size.w % context.get_face_degree() == 0);
-
-    /*if (threadIdx.x == 0) {
-        printf("\n   blockIdx.x= %u, p_id = %u \n"
-            "   edges_add= %u, edges_size= %u \n"
-            "   faces_add= %u, faces_size= %u \n"
-            "   s_ad_size_ltog_v.x= %u, s_ad_size_ltog_v.y= %u \n"
-            "   s_ad_size_ltog_e.x= %u, s_ad_size_ltog_e.y= %u \n"
-            "   s_ad_size_ltog_f.x= %u, s_ad_size_ltog_f.y= %u \n",
-            blockIdx.x, p_id,
-            s_ad_size.x, s_ad_size.y, s_ad_size.z, s_ad_size.w,
-            s_ad_size_ltog_v.x, s_ad_size_ltog_v.y,
-            s_ad_size_ltog_e.x, s_ad_size_ltog_e.y,
-            s_ad_size_ltog_f.x, s_ad_size_ltog_f.y);
-    }*/
-}
-
-/**
- * load_patch_edges()
- */
-__device__ __forceinline__ void load_patch_edges(const RXMeshContext& context,
-                                                 uint16_t*    patch_edges,
-                                                 const uint4& ad_sz)
-{
-
-    // whole block should be calling this
-
-    // load edges
-    assert(ad_sz.y % 2 == 0);
-    uint32_t        size32 = ad_sz.y / 2;
-    const uint32_t* edges_ptr32 =
-        (const uint32_t*)(context.get_patches_edges() + ad_sz.x);
-    uint32_t* patch_edges32 = (uint32_t*)(patch_edges);
-#pragma unroll 2
-    for (uint32_t i = threadIdx.x; i < size32; i += blockDim.x) {
-        uint32_t a = edges_ptr32[i];
-        patch_edges32[i] = a;
-    }
-}
-
-/**
- * load_patch_faces()
- */
-__device__ __forceinline__ void load_patch_faces(const RXMeshContext& context,
-                                                 uint16_t*    patch_faces,
-                                                 const uint4& ad_sz)
-{
-
-    // whole block should be calling this
-
-    // load faces
-    assert(ad_sz.w % 3 == 0);
-
-    uint32_t        size32 = ad_sz.w / 2;
-    uint32_t        reminder = ad_sz.w % 2;
-    const uint32_t* faces_ptr32 =
-        (const uint32_t*)(context.get_patches_faces() + ad_sz.z);
-    uint32_t* patch_faces32 = (uint32_t*)(patch_faces);
-    //#pragma unroll 3
-    for (uint32_t i = threadIdx.x; i < size32; i += blockDim.x) {
-        uint32_t a = faces_ptr32[i];
-        patch_faces32[i] = a;
-    }
-
-    if (reminder != 0) {
-        if (threadIdx.x == 0) {
-            patch_faces[ad_sz.w - 1] =
-                context.get_patches_faces()[ad_sz.z + ad_sz.w - 1];
-        }
-    }
-}
-
-/**
- * load_mapping()
- */
-__device__ __forceinline__ void load_mapping(const RXMeshContext& context,
-                                             const ELEMENT        ele,
-                                             const uint2& s_ad_size_ltog,
-                                             uint32_t*    mapping,
-                                             const bool   keep_patch_bit)
-{
-    // whole block should be calling this
-    for (uint32_t i = threadIdx.x, start = s_ad_size_ltog.x;
-         i < s_ad_size_ltog.y; i += blockDim.x) {
-
-        switch (ele) {
-            case ELEMENT::VERTEX:
-                if (keep_patch_bit) {
-                    mapping[i] = context.get_patches_ltog_v()[i + start];
-                } else {
-                    mapping[i] = (context.get_patches_ltog_v()[i + start] >> 1);
-                }
-
-                break;
-            case ELEMENT::EDGE:
-                if (keep_patch_bit) {
-                    mapping[i] = context.get_patches_ltog_e()[i + start];
-                } else {
-                    mapping[i] = (context.get_patches_ltog_e()[i + start] >> 1);
-                }
-                break;
-            case ELEMENT::FACE:
-                if (keep_patch_bit) {
-                    mapping[i] = context.get_patches_ltog_f()[i + start];
-                } else {
-                    mapping[i] = (context.get_patches_ltog_f()[i + start] >> 1);
-                }
-                break;
-            default:
-                assert(1 != 1);
-                break;
-        }
-    }
-}
-
-/**
- * load_mesh()
- */
-__device__ __forceinline__ void load_mesh(const RXMeshContext& context,
-                                          const bool           load_edges,
-                                          const bool           load_faces,
-                                          uint16_t*&           s_patch_edges,
-                                          uint16_t*&           s_patch_faces,
-                                          const uint4&         ad_size)
-{
-
-    if (load_edges) {
-        load_patch_edges(context, s_patch_edges, ad_size);
-    }
-    // load patch faces
-    if (load_faces) {
-        if (load_edges) {
-            // if we loaded the edges, then we need to move where
-            // s_patch_faces is pointing at to avoid overwrite
-            s_patch_faces = &s_patch_edges[ad_size.y];
-        }
-        load_patch_faces(context, s_patch_faces, ad_size);
-    }
-}
-
-}  // namespace RXMESH
diff --git a/include/rxmesh/kernels/rxmesh_queries.cuh b/include/rxmesh/kernels/rxmesh_queries.cuh
index cf6c8d18..b77459c8 100644
--- a/include/rxmesh/kernels/rxmesh_queries.cuh
+++ b/include/rxmesh/kernels/rxmesh_queries.cuh
@@ -3,14 +3,13 @@
 #include <assert.h>
 #include <stdint.h>
 
+#include "rxmesh/context.h"
 #include "rxmesh/kernels/collective.cuh"
-#include "rxmesh/kernels/rxmesh_loader.cuh"
+#include "rxmesh/kernels/loader.cuh"
 #include "rxmesh/kernels/util.cuh"
-#include "rxmesh/rxmesh.h"
-#include "rxmesh/rxmesh_context.h"
+#include "rxmesh/types.h"
 
-namespace RXMESH {
-//********************** Tools
+namespace rxmesh {
 template <uint32_t rowOffset,
           uint32_t blockThreads,
           uint32_t itemPerThread = TRANSPOSE_ITEM_PER_THREAD>
@@ -33,7 +32,7 @@ __device__ __forceinline__ void block_mat_transpose(const uint32_t num_rows,
         // INVALID16;
         if (index < nnz) {
             thread_data[i] = mat[index] >> shift;
-            mat[index] = 0;
+            mat[index]     = 0;
         } else {
             thread_data[i] = INVALID16;
         }
@@ -63,7 +62,7 @@ __device__ __forceinline__ void block_mat_transpose(const uint32_t num_rows,
     __syncthreads();
     for (uint32_t i = threadIdx.x; i < num_cols; i += blockThreads) {
         uint16_t val = uint16_t(mat_half[i]);
-        mat[i] = val;
+        mat[i]       = val;
     }
 #else
     for (uint32_t i = 0; i < itemPerThread; ++i) {
@@ -85,43 +84,43 @@ __device__ __forceinline__ void block_mat_transpose(const uint32_t num_rows,
         uint16_t item = thread_data[i];
         if (item != INVALID16) {
             uint16_t offset = mat[item] + local_offset[i];
-            uint16_t row = (itemPerThread * threadIdx.x + i) / rowOffset;
-            output[offset] = row;
+            uint16_t row    = (itemPerThread * threadIdx.x + i) / rowOffset;
+            output[offset]  = row;
         } else {
             break;
         }
     }
 }
-//*************************************************************************
 
 template <uint32_t blockThreads>
-__device__ __forceinline__ void v_v_oreinted(uint16_t*& s_offset_all_patches,
-                                             uint16_t*& s_output_all_patches,
-                                             uint16_t*  s_patch_edges,
-                                             const RXMeshContext& context,
-                                             const uint4&         ad_size,
-                                             const uint16_t       num_vertices,
-                                             const uint16_t num_owned_vertices)
+__device__ __forceinline__ void v_v_oreinted(const PatchInfo& patch_info,
+                                             uint16_t*&       s_output_offset,
+                                             uint16_t*&       s_output_value,
+                                             uint16_t*        s_ev)
 {
-    const uint32_t num_faces = ad_size.w / 3;
-    const uint32_t num_edges = ad_size.y / 2;
 
-    s_offset_all_patches = &s_patch_edges[0];
-    s_output_all_patches =
-        &s_patch_edges[num_vertices + 1 + (num_vertices + 1) % 2];
+    const uint16_t num_edges          = patch_info.num_edges;
+    const uint16_t num_faces          = patch_info.num_faces;
+    const uint16_t num_vertices       = patch_info.num_vertices;
+    const uint16_t num_owned_vertices = patch_info.num_owned_vertices;
+
+    s_output_offset = &s_ev[0];
+    s_output_value  = &s_ev[num_vertices + 1 + (num_vertices + 1) % 2];
 
     // start by loading the faces while also doing transposing EV (might
     // increase ILP)
-    uint16_t* s_patch_FE = &s_output_all_patches[2 * num_edges];
-    uint16_t* s_patch_EF = &s_patch_FE[3 * num_faces + (3 * num_faces) % 2];
-    load_patch_faces(context, s_patch_FE, ad_size);
+    uint16_t*   s_fe    = &s_output_value[2 * num_edges];
+    uint16_t*   s_ef    = &s_fe[3 * num_faces + (3 * num_faces) % 2];
+    LocalEdgeT* temp_fe = reinterpret_cast<LocalEdgeT*>(s_fe);
+    load_patch_FE<blockThreads>(patch_info, temp_fe);
+
 
     for (uint32_t i = threadIdx.x; i < num_edges * 2; i += blockThreads) {
-        s_patch_EF[i] = INVALID16;
+        s_ef[i] = INVALID16;
     }
 
     block_mat_transpose<2u, blockThreads>(
-        num_edges, num_vertices, s_offset_all_patches, s_output_all_patches);
+        num_edges, num_vertices, s_output_offset, s_output_value);
 
     // block_mat_transpose<2u, blockThreads>(
     //    num_faces, num_edges, s_patch_EF_offset, s_patch_EF_output);
@@ -131,17 +130,17 @@ __device__ __forceinline__ void v_v_oreinted(uint16_t*& s_offset_all_patches,
     // that we are working on manifold so it is only two edges per face. We
     // also wanna keep FE for quick look up on a face's three edges.
 
-    // We need to sync here to make sure that s_patch_FE is loaded but there is
+    // We need to sync here to make sure that s_fe is loaded but there is
     // a sync in block_mat_transpose that takes care of this
 
 
     for (uint16_t e = threadIdx.x; e < 3 * num_faces; e += blockThreads) {
-        uint16_t edge = s_patch_FE[e] >> 1;
+        uint16_t edge    = s_fe[e] >> 1;
         uint16_t face_id = e / 3;
 
-        auto ret = atomicCAS(s_patch_EF + 2 * edge, INVALID16, face_id);
+        auto ret = atomicCAS(s_ef + 2 * edge, INVALID16, face_id);
         if (ret != INVALID16) {
-            ret = atomicCAS(s_patch_EF + 2 * edge + 1, INVALID16, face_id);
+            ret = atomicCAS(s_ef + 2 * edge + 1, INVALID16, face_id);
             assert(ret == INVALID16);
         }
     }
@@ -156,13 +155,13 @@ __device__ __forceinline__ void v_v_oreinted(uint16_t*& s_offset_all_patches,
         // if the vertex is not owned by this patch, then there is no reason
         // to orient its edges because no serious computation is done on it
 
-        uint16_t start = s_offset_all_patches[v];
-        uint16_t end = s_offset_all_patches[v + 1];
+        uint16_t start = s_output_offset[v];
+        uint16_t end   = s_output_offset[v + 1];
 
 
         for (uint16_t e_id = start; e_id < end - 1; ++e_id) {
-            uint16_t e_0 = s_output_all_patches[e_id];
-            uint16_t f0(s_patch_EF[2 * e_0]), f1(s_patch_EF[2 * e_0 + 1]);
+            uint16_t e_0 = s_output_value[e_id];
+            uint16_t f0(s_ef[2 * e_0]), f1(s_ef[2 * e_0 + 1]);
 
             // we don't do it for boundary faces
             assert(f0 != INVALID16 && f1 != INVALID16 && f0 < num_faces &&
@@ -172,33 +171,33 @@ __device__ __forceinline__ void v_v_oreinted(uint16_t*& s_offset_all_patches,
             // candidate next edge (only one of them will win)
             uint16_t e_candid_0, e_candid_1;
 
-            if ((s_patch_FE[3 * f0 + 0] >> 1) == e_0) {
-                e_candid_0 = s_patch_FE[3 * f0 + 2] >> 1;
+            if ((s_fe[3 * f0 + 0] >> 1) == e_0) {
+                e_candid_0 = s_fe[3 * f0 + 2] >> 1;
             }
-            if ((s_patch_FE[3 * f0 + 1] >> 1) == e_0) {
-                e_candid_0 = s_patch_FE[3 * f0 + 0] >> 1;
+            if ((s_fe[3 * f0 + 1] >> 1) == e_0) {
+                e_candid_0 = s_fe[3 * f0 + 0] >> 1;
             }
-            if ((s_patch_FE[3 * f0 + 2] >> 1) == e_0) {
-                e_candid_0 = s_patch_FE[3 * f0 + 1] >> 1;
+            if ((s_fe[3 * f0 + 2] >> 1) == e_0) {
+                e_candid_0 = s_fe[3 * f0 + 1] >> 1;
             }
 
-            if ((s_patch_FE[3 * f1 + 0] >> 1) == e_0) {
-                e_candid_1 = s_patch_FE[3 * f1 + 2] >> 1;
+            if ((s_fe[3 * f1 + 0] >> 1) == e_0) {
+                e_candid_1 = s_fe[3 * f1 + 2] >> 1;
             }
-            if ((s_patch_FE[3 * f1 + 1] >> 1) == e_0) {
-                e_candid_1 = s_patch_FE[3 * f1 + 0] >> 1;
+            if ((s_fe[3 * f1 + 1] >> 1) == e_0) {
+                e_candid_1 = s_fe[3 * f1 + 0] >> 1;
             }
-            if ((s_patch_FE[3 * f1 + 2] >> 1) == e_0) {
-                e_candid_1 = s_patch_FE[3 * f1 + 1] >> 1;
+            if ((s_fe[3 * f1 + 2] >> 1) == e_0) {
+                e_candid_1 = s_fe[3 * f1 + 1] >> 1;
             }
 
             for (uint16_t vn = e_id + 1; vn < end; ++vn) {
-                uint16_t e_winning_candid = s_output_all_patches[vn];
+                uint16_t e_winning_candid = s_output_value[vn];
                 if (e_candid_0 == e_winning_candid ||
                     e_candid_1 == e_winning_candid) {
-                    uint16_t temp = s_output_all_patches[e_id + 1];
-                    s_output_all_patches[e_id + 1] = e_winning_candid;
-                    s_output_all_patches[vn] = temp;
+                    uint16_t temp            = s_output_value[e_id + 1];
+                    s_output_value[e_id + 1] = e_winning_candid;
+                    s_output_value[vn]       = temp;
                     break;
                 }
             }
@@ -207,31 +206,33 @@ __device__ __forceinline__ void v_v_oreinted(uint16_t*& s_offset_all_patches,
 
     __syncthreads();
 
-    // Load EV into s_patch_EF since both has the same size (2*#E)
-    s_patch_edges = &s_patch_EF[0];
-    load_patch_edges(context, s_patch_edges, ad_size);
+    // Load EV into s_ef since both have the same size (2*#E)
+    s_ev                  = s_ef;
+    LocalVertexT* temp_ev = reinterpret_cast<LocalVertexT*>(s_ef);
+    load_patch_EV<blockThreads>(patch_info, temp_ev);
+
     __syncthreads();
 
     for (uint32_t v = threadIdx.x; v < num_vertices; v += blockThreads) {
-        uint32_t start = s_offset_all_patches[v];
-        uint32_t end = s_offset_all_patches[v + 1];
+        uint32_t start = s_output_offset[v];
+        uint32_t end   = s_output_offset[v + 1];
 
 
         for (uint32_t e = start; e < end; ++e) {
-            uint16_t edge = s_output_all_patches[e];
-            uint16_t v0 = s_patch_edges[2 * edge];
-            uint16_t v1 = s_patch_edges[2 * edge + 1];
+            uint16_t edge = s_output_value[e];
+            uint16_t v0   = s_ev[2 * edge];
+            uint16_t v1   = s_ev[2 * edge + 1];
 
             assert(v0 == v || v1 == v);
             // d_output[e] = (v0 == v) ? v1 : v0;
-            s_output_all_patches[e] = (v0 == v) * v1 + (v1 == v) * v0;
+            s_output_value[e] = (v0 == v) * v1 + (v1 == v) * v0;
         }
     }
 }
-//********************** 1) vertex incident edges
+
 template <uint32_t blockThreads>
-__device__ __forceinline__ void v_e(const uint32_t num_vertices,
-                                    const uint32_t num_edges,
+__device__ __forceinline__ void v_e(const uint16_t num_vertices,
+                                    const uint16_t num_edges,
                                     uint16_t*      d_edges,
                                     uint16_t*      d_output)
 {
@@ -243,15 +244,13 @@ __device__ __forceinline__ void v_e(const uint32_t num_vertices,
     // num_edges*2 (zero is stored and the end can be inferred). Thus,
     // d_output should be allocated to size = num_edges*2
 
-    block_mat_transpose<2u, blockThreads>(num_edges, num_vertices, d_edges,
-                                          d_output);
+    block_mat_transpose<2u, blockThreads>(
+        num_edges, num_vertices, d_edges, d_output);
 }
-//*************************************************************************
 
-//********************** 0) Vertex adjacent vertices
 template <uint32_t blockThreads>
-__device__ __forceinline__ void v_v(const uint32_t num_vertices,
-                                    const uint32_t num_edges,
+__device__ __forceinline__ void v_v(const uint16_t num_vertices,
+                                    const uint16_t num_edges,
                                     uint16_t*      d_edges,
                                     uint16_t*      d_output)
 {
@@ -280,12 +279,12 @@ __device__ __forceinline__ void v_v(const uint32_t num_vertices,
 
     for (uint32_t v = threadIdx.x; v < num_vertices; v += blockThreads) {
         uint32_t start = d_edges[v];
-        uint32_t end = d_edges[v + 1];
+        uint32_t end   = d_edges[v + 1];
 
         for (uint32_t e = start; e < end; ++e) {
             uint16_t edge = d_output[e];
-            uint16_t v0 = s_edges_duplicate[2 * edge];
-            uint16_t v1 = s_edges_duplicate[2 * edge + 1];
+            uint16_t v0   = s_edges_duplicate[2 * edge];
+            uint16_t v1   = s_edges_duplicate[2 * edge + 1];
 
             assert(v0 == v || v1 == v);
             // d_output[e] = (v0 == v) ? v1 : v0;
@@ -293,13 +292,10 @@ __device__ __forceinline__ void v_v(const uint32_t num_vertices,
         }
     }
 }
-//*************************************************************************
 
-
-//********************** 3) Face incident vertices
-__device__ __forceinline__ void f_v(const uint32_t  num_edges,
+__device__ __forceinline__ void f_v(const uint16_t  num_edges,
                                     const uint16_t* d_edges,
-                                    const uint32_t  num_faces,
+                                    const uint16_t  num_faces,
                                     uint16_t*       d_faces)
 {
     // M_FV = M_FE \dot M_EV
@@ -315,7 +311,7 @@ __device__ __forceinline__ void f_v(const uint32_t  num_edges,
         for (uint32_t i = 0; i < 3; i++) {
             uint16_t e = d_faces[f_id + i];
             flag_t   e_dir(0);
-            RXMeshContext::unpack_edge_dir(e, e, e_dir);
+            Context::unpack_edge_dir(e, e, e_dir);
             // if the direction is flipped, we take the second vertex
             uint16_t e_id = (2 * e) + (1 * e_dir);
             assert(e_id < 2 * num_edges);
@@ -326,14 +322,11 @@ __device__ __forceinline__ void f_v(const uint32_t  num_edges,
         }
     }
 }
-//*************************************************************************
-
 
-//********************** 2) Vertex incident faces
 template <uint32_t blockThreads>
-__device__ __forceinline__ void v_f(const uint32_t num_faces,
-                                    const uint32_t num_edges,
-                                    const uint32_t num_vertices,
+__device__ __forceinline__ void v_f(const uint16_t num_faces,
+                                    const uint16_t num_edges,
+                                    const uint16_t num_vertices,
                                     uint16_t*      d_edges,
                                     uint16_t*      d_faces)
 {
@@ -350,15 +343,13 @@ __device__ __forceinline__ void v_f(const uint32_t num_faces,
     f_v(num_edges, d_edges, num_faces, d_faces);
     __syncthreads();
 
-    block_mat_transpose<3u, blockThreads>(num_faces, num_vertices, d_faces,
-                                          d_edges);
+    block_mat_transpose<3u, blockThreads>(
+        num_faces, num_vertices, d_faces, d_edges);
 }
-//*************************************************************************
 
-//********************** 8) Edge incident faces
 template <uint32_t blockThreads>
-__device__ __forceinline__ void e_f(const uint32_t num_edges,
-                                    const uint32_t num_faces,
+__device__ __forceinline__ void e_f(const uint16_t num_edges,
+                                    const uint16_t num_faces,
                                     uint16_t*      d_faces,
                                     uint16_t*      d_output,
                                     int            shift = 1)
@@ -372,16 +363,13 @@ __device__ __forceinline__ void e_f(const uint32_t num_edges,
     // num_faces*3 (zero is stored and the end can be inferred). Thus,
     // d_output should be allocated to size = num_faces*3
 
-    block_mat_transpose<3u, blockThreads>(num_faces, num_edges, d_faces,
-                                          d_output, shift);
+    block_mat_transpose<3u, blockThreads>(
+        num_faces, num_edges, d_faces, d_output, shift);
 }
-//*************************************************************************
-
 
-//********************** 5) Face adjacent faces
 template <uint32_t blockThreads>
-__device__ __forceinline__ void f_f(const uint32_t num_edges,
-                                    const uint32_t num_faces,
+__device__ __forceinline__ void f_f(const uint16_t num_edges,
+                                    const uint16_t num_faces,
                                     uint16_t*      s_FE,
                                     uint16_t*      s_FF_offset,
                                     uint16_t*      s_FF_output)
@@ -395,9 +383,9 @@ __device__ __forceinline__ void f_f(const uint32_t num_edges,
     // losing FE
     for (uint16_t i = threadIdx.x; i < num_faces * 3; i += blockThreads) {
         flag_t   dir(0);
-        uint16_t e = s_FE[i] >> 1;
+        uint16_t e     = s_FE[i] >> 1;
         s_EF_offset[i] = e;
-        s_FE[i] = e;
+        s_FE[i]        = e;
     }
     __syncthreads();
 
@@ -458,83 +446,74 @@ __device__ __forceinline__ void f_f(const uint32_t num_edges,
         }
     }*/
 }
-//*************************************************************************
 
-
-//**********************
 template <uint32_t blockThreads, Op op>
-__device__ __forceinline__ void query(uint16_t*&     s_offset_all_patches,
-                                      uint16_t*&     s_output_all_patches,
-                                      uint16_t*      s_patch_edges,
-                                      uint16_t*      s_patch_faces,
-                                      const uint32_t num_vertices,
-                                      const uint32_t num_edges,
-                                      const uint32_t num_faces)
+__device__ __forceinline__ void query(uint16_t*&     s_output_offset,
+                                      uint16_t*&     s_output_value,
+                                      uint16_t*      s_ev,
+                                      uint16_t*      s_fe,
+                                      const uint16_t num_vertices,
+                                      const uint16_t num_edges,
+                                      const uint16_t num_faces)
 {
 
 
     switch (op) {
         case Op::VV: {
             assert(num_vertices <= 2 * num_edges);
-            s_offset_all_patches = &s_patch_edges[0];
-            s_output_all_patches = &s_patch_edges[num_vertices + 1];
-            v_v<blockThreads>(num_vertices, num_edges, s_patch_edges,
-                              s_output_all_patches);
+            s_output_offset = &s_ev[0];
+            s_output_value  = &s_ev[num_vertices + 1];
+            v_v<blockThreads>(num_vertices, num_edges, s_ev, s_output_value);
             break;
         }
         case Op::VE: {
             assert(num_vertices <= 2 * num_edges);
-            s_offset_all_patches = &s_patch_edges[0];
-            s_output_all_patches = &s_patch_edges[num_vertices + 1];
-            v_e<blockThreads>(num_vertices, num_edges, s_patch_edges,
-                              s_output_all_patches);
+            s_output_offset = &s_ev[0];
+            s_output_value  = &s_ev[num_vertices + 1];
+            v_e<blockThreads>(num_vertices, num_edges, s_ev, s_output_value);
             break;
         }
         case Op::VF: {
             assert(num_vertices <= 2 * num_edges);
-            s_output_all_patches = &s_patch_edges[0];
-            s_offset_all_patches = &s_patch_faces[0];
-            v_f<blockThreads>(num_faces, num_edges, num_vertices, s_patch_edges,
-                              s_patch_faces);
+            s_output_offset = &s_fe[0];
+            s_output_value  = &s_ev[0];
+            v_f<blockThreads>(num_faces, num_edges, num_vertices, s_ev, s_fe);
             break;
         }
         case Op::EV: {
-            s_output_all_patches = s_patch_edges;
+            s_output_value = s_ev;
             break;
         }
         case Op::EF: {
             assert(num_edges <= 3 * num_faces);
-            s_offset_all_patches = &s_patch_faces[0];
-            s_output_all_patches = &s_patch_faces[num_edges + 1];
-            e_f<blockThreads>(num_edges, num_faces, s_patch_faces,
-                              s_output_all_patches);
+            s_output_offset = &s_fe[0];
+            s_output_value  = &s_fe[num_edges + 1];
+            e_f<blockThreads>(num_edges, num_faces, s_fe, s_output_value);
             break;
         }
         case Op::FV: {
-            s_output_all_patches = s_patch_faces;
-            f_v(num_edges, s_patch_edges, num_faces, s_patch_faces);
+            s_output_value = s_fe;
+            f_v(num_edges, s_ev, num_faces, s_fe);
             break;
         }
         case Op::FE: {
-            s_output_all_patches = s_patch_faces;
+            s_output_value = s_fe;
             break;
         }
         case Op::FF: {
             assert(num_edges <= 3 * num_faces);
-            s_offset_all_patches =
-                &s_patch_faces[3 * num_faces + 2 * 3 * num_faces];
-            //                    ^^^^FE             ^^^^^EF
-            s_output_all_patches = &s_offset_all_patches[num_faces + 1];
-            f_f<blockThreads>(num_edges, num_faces, s_patch_faces,
-                              s_offset_all_patches, s_output_all_patches);
+            s_output_offset = &s_fe[3 * num_faces + 2 * 3 * num_faces];
+            //                      ^^^^FE             ^^^^^EF
+            s_output_value = &s_output_offset[num_faces + 1];
+            f_f<blockThreads>(
+                num_edges, num_faces, s_fe, s_output_offset, s_output_value);
 
             break;
         }
         default:
             assert(1 != 1);
             break;
-    }  // namespace RXMESH
+    }
 }
 
-//*************************************************************************
-}  // namespace RXMESH
+}  // namespace rxmesh
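
To make the transpose-based queries above concrete, here is a small worked illustration of v_e/v_v on a single triangle. This is a sketch of the intended result; the ordering of entries within each vertex's list may differ in practice.

    // One triangle with edges e0 = (v0,v1), e1 = (v1,v2), e2 = (v2,v0)
    // Input EV (two entries per edge)   : [v0 v1 | v1 v2 | v2 v0]
    // block_mat_transpose<2, ...> turns this into a CSR-like layout:
    //   offsets (per vertex)            : [0, 2, 4, 6]
    //   values  (VE)                    : [e0 e2 | e0 e1 | e1 e2]
    // v_v then replaces each incident edge by its other endpoint:
    //   values  (VV)                    : [v1 v2 | v0 v2 | v1 v0]
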
diff --git a/include/rxmesh/kernels/rxmesh_query_dispatcher.cuh b/include/rxmesh/kernels/rxmesh_query_dispatcher.cuh
deleted file mode 100644
index d37e9325..00000000
--- a/include/rxmesh/kernels/rxmesh_query_dispatcher.cuh
+++ /dev/null
@@ -1,404 +0,0 @@
-#pragma once
-#include <assert.h>
-#include <stdint.h>
-#include <cub/block/block_discontinuity.cuh>
-
-#include "rxmesh/kernels/collective.cuh"
-#include "rxmesh/kernels/rxmesh_iterator.cuh"
-#include "rxmesh/kernels/rxmesh_loader.cuh"
-#include "rxmesh/kernels/rxmesh_queries.cuh"
-#include "rxmesh/rxmesh.h"
-#include "rxmesh/rxmesh_context.h"
-#include "rxmesh/rxmesh_util.h"
-
-
-namespace RXMESH {
-
-namespace detail {
-
-/**
- * query_block_dispatcher()
- */
-template <Op op, uint32_t blockThreads, typename activeSetT>
-__device__ __inline__ void query_block_dispatcher(
-    const RXMeshContext& context,
-    const uint32_t       current_patch_id,
-    activeSetT           compute_active_set,
-    const bool           oriented,
-    const bool           output_needs_mapping,
-    uint32_t&            num_src_in_patch,
-    uint32_t*&           input_mapping,
-    uint32_t*&           s_output_mapping,
-    uint16_t*&           s_offset_all_patches,
-    uint16_t*&           s_output_all_patches)
-{
-    static_assert(op != Op::EE, "Op::EE is not supported!");
-    assert(current_patch_id < context.get_num_patches());
-
-
-    ELEMENT src_element, output_element;
-    io_elements(op, src_element, output_element);
-
-    extern __shared__ uint16_t shrd_mem[];
-
-
-    s_offset_all_patches = shrd_mem;
-    s_output_all_patches = shrd_mem;
-    uint16_t *s_patch_edges(shrd_mem), *s_patch_faces(shrd_mem);
-
-    constexpr bool load_faces = (op == Op::VF || op == Op::EE || op == Op::EF ||
-                                 op == Op::FV || op == Op::FE || op == Op::FF);
-    constexpr bool load_edges = (op == Op::VV || op == Op::VE || op == Op::VF ||
-                                 op == Op::EV || op == Op::FV);
-    static_assert(load_edges || load_faces,
-                  "At least faces or edges needs to be loaded");
-
-    constexpr bool is_fixed_offset =
-        (op == Op::EV || op == Op::FV || op == Op::FE);
-
-    __syncthreads();
-
-    // 1) load the patch addressed and size
-    uint4        ad_size;
-    uint2        ad_size_ltog_v, ad_size_ltog_e, ad_size_ltog_f;
-    const uint2& output_ele_ad_size =
-        ((output_element == ELEMENT::EDGE) ?
-             ad_size_ltog_e :
-             ((output_element == ELEMENT::FACE) ? ad_size_ltog_f :
-                                                  ad_size_ltog_v));
-    const uint2& src_element_ad_size =
-        ((src_element == ELEMENT::EDGE) ?
-             ad_size_ltog_e :
-             ((src_element == ELEMENT::FACE) ? ad_size_ltog_f :
-                                               ad_size_ltog_v));
-    load_patch_ad_size(context, current_patch_id, ad_size, ad_size_ltog_v,
-                       ad_size_ltog_e, ad_size_ltog_f);
-
-    // Check if any of the vertices are in the active set
-    // input mapping does not need to be stored in shared memory since it will
-    // be read coalesced, we can rely on L1 cache here
-    input_mapping = nullptr;
-    num_src_in_patch = 0;
-    switch (src_element) {
-        case RXMESH::ELEMENT::VERTEX: {
-            input_mapping =
-                context.get_patches_ltog_v() + src_element_ad_size.x;
-            num_src_in_patch = context.get_size_owned()[current_patch_id].z;
-            break;
-        }
-        case RXMESH::ELEMENT::EDGE: {
-            input_mapping =
-                context.get_patches_ltog_e() + src_element_ad_size.x;
-            num_src_in_patch = context.get_size_owned()[current_patch_id].y;
-            break;
-        }
-        case RXMESH::ELEMENT::FACE: {
-            input_mapping =
-                context.get_patches_ltog_f() + src_element_ad_size.x;
-            num_src_in_patch = context.get_size_owned()[current_patch_id].x;
-            break;
-        }
-    }
-
-
-    bool     is_active = false;
-    uint16_t local_id = threadIdx.x;
-    while (local_id < num_src_in_patch) {
-        is_active =
-            local_id || compute_active_set(input_mapping[local_id] >> 1);
-        local_id += blockThreads;
-    }
-
-
-    if (__syncthreads_or(is_active) == 0) {
-        return;
-    }
-
-    assert(ad_size.y == ad_size_ltog_e.y * 2);
-    assert(ad_size.w == ad_size_ltog_f.y * 3);
-
-
-    // 2) Load the patch info
-    load_mesh(context, load_edges, load_faces, s_patch_edges, s_patch_faces,
-              ad_size);
-    __syncthreads();
-
-    // 3)Perform the query operation
-    if (oriented) {
-        assert(op == Op::VV);
-        if constexpr (op == Op::VV) {
-            v_v_oreinted<blockThreads>(
-                s_offset_all_patches, s_output_all_patches, s_patch_edges,
-                context, ad_size, ad_size_ltog_v.y, num_src_in_patch);
-        }
-    } else {
-        query<blockThreads, op>(s_offset_all_patches, s_output_all_patches,
-                                s_patch_edges, s_patch_faces, ad_size_ltog_v.y,
-                                ad_size_ltog_e.y, ad_size_ltog_f.y);
-    }
-
-
-    // 4) load output mapping
-    s_output_mapping = nullptr;
-    if (output_needs_mapping) {
-        // Read comments in calc_shared_memory() to understand how we calculate
-        // s_output_mapping pointer location in shared memory such that it does
-        // not overwrite the results
-
-        // We add ad_size.w % 2 for padding in case ad_size.w  is not
-        // dividable by 2 in which case memory misalignment happens
-        if constexpr (op == Op::FE) {
-            s_output_mapping =
-                (uint32_t*)&shrd_mem[ad_size.w + (ad_size.w % 2)];
-        }
-        if constexpr (op == Op::EV) {
-            s_output_mapping = (uint32_t*)&shrd_mem[ad_size.y];
-        }
-        if constexpr (op == Op::FV) {
-            s_output_mapping =
-                (uint32_t*)&shrd_mem[ad_size.w + (ad_size.w % 2) + ad_size.y];
-        }
-        if constexpr (op == Op::VE) {
-            s_output_mapping = (uint32_t*)&shrd_mem[2 * ad_size.y];
-        }
-        if constexpr (op == Op::EF || op == Op::VF) {
-            s_output_mapping = (uint32_t*)&shrd_mem[2 * ad_size.w];
-        }
-        if constexpr (op == Op::FF) {
-            // FF uses a lot of shared memory and some of it can be overridden
-            // but we need to wait for the query to be done.
-            __syncthreads();
-            s_output_mapping = (uint32_t*)&shrd_mem[0];
-        }
-
-        if constexpr (op == Op::VV) {
-            // We use extra shared memory that is read only for VV which we can
-            // just use for loading ltog. The drawback is that we need to wait
-            // for the query to finish first before overwriting it with ltog
-            __syncthreads();
-            uint16_t last_vv = ad_size_ltog_v.y + 1 + 2 * ad_size_ltog_e.y;
-            s_output_mapping = (uint32_t*)&shrd_mem[last_vv + last_vv % 2];
-        }
-
-        load_mapping(context, output_element, output_ele_ad_size,
-                     s_output_mapping, false);
-    }
-    __syncthreads();
-}
-}  // namespace detail
-/**
- * query_block_dispatcher()
- */
-template <Op op, uint32_t blockThreads, typename computeT, typename activeSetT>
-__device__ __inline__ void query_block_dispatcher(
-    const RXMeshContext& context,
-    const uint32_t       current_patch_id,
-    computeT             compute_op,
-    activeSetT           compute_active_set,
-    const bool           oriented = false,
-    const bool           output_needs_mapping = true)
-{
-    static_assert(op != Op::EE, "Op::EE is not supported!");
-    assert(current_patch_id < context.get_num_patches());
-
-    uint32_t  num_src_in_patch = 0;
-    uint32_t *input_mapping(nullptr), *s_output_mapping(nullptr);
-    uint16_t *s_offset_all_patches(nullptr), *s_output_all_patches(nullptr);
-
-    detail::template query_block_dispatcher<op, blockThreads>(
-        context, current_patch_id, compute_active_set, oriented,
-        output_needs_mapping, num_src_in_patch, input_mapping, s_output_mapping,
-        s_offset_all_patches, s_output_all_patches);
-
-    assert(input_mapping);
-    assert(s_output_all_patches);
-
-    // 5) Call compute on the output in shared memory by looping over all
-    // source elements in this patch.
-
-    uint16_t local_id = threadIdx.x;
-    while (local_id < num_src_in_patch) {
-
-        uint32_t global_id = input_mapping[local_id] >> 1;
-
-        if (compute_active_set(global_id)) {
-            constexpr uint32_t fixed_offset =
-                ((op == Op::EV)                 ? 2 :
-                 (op == Op::FV || op == Op::FE) ? 3 :
-                                                  0);
-            RXMeshIterator iter(local_id, s_output_all_patches,
-                                s_offset_all_patches, s_output_mapping,
-                                fixed_offset, num_src_in_patch,
-                                int(op == Op::FE));
-
-            compute_op(global_id, iter);
-        }
-
-        local_id += blockThreads;
-    }
-}
-
-/**
- * query_block_dispatcher()
- */
-template <Op op, uint32_t blockThreads, typename computeT, typename activeSetT>
-__device__ __inline__ void query_block_dispatcher(
-    const RXMeshContext& context,
-    computeT             compute_op,
-    activeSetT           compute_active_set,
-    const bool           oriented = false,
-    const bool           output_needs_mapping = true)
-{
-    if (blockIdx.x >= context.get_num_patches()) {
-        return;
-    }
-    query_block_dispatcher<op, blockThreads>(context, blockIdx.x, compute_op,
-                                             compute_active_set, oriented,
-                                             output_needs_mapping);
-}
-
-/**
- * query_block_dispatcher()
- */
-template <Op op, uint32_t blockThreads, typename computeT>
-__device__ __inline__ void query_block_dispatcher(
-    const RXMeshContext& context,
-    computeT             compute_op,
-    const bool           oriented = false,
-    const bool           output_needs_mapping = true)
-{
-    if (blockIdx.x >= context.get_num_patches()) {
-        return;
-    }
-    query_block_dispatcher<op, blockThreads>(
-        context, blockIdx.x, compute_op, [](uint32_t) { return true; },
-        oriented, output_needs_mapping);
-}
-
-
-/**
- * query_block_dispatcher()
- */
-template <Op op, uint32_t blockThreads, typename computeT>
-__device__ __inline__ void query_block_dispatcher(const RXMeshContext& context,
-                                                  const uint32_t element_id,
-                                                  computeT       compute_op,
-                                                  const bool oriented = false)
-{
-    // The whole block should be calling this function. If one thread is not
-    // participating, its element_id should be INVALID32
-
-    auto compute_active_set = [](uint32_t) { return true; };
-
-    uint32_t element_patch = INVALID32;
-    if (element_id != INVALID32) {
-        switch (op) {
-            case RXMESH::Op::VV:
-            case RXMESH::Op::VE:
-            case RXMESH::Op::VF:
-                element_patch = context.get_vertex_patch()[element_id];
-                break;
-            case RXMESH::Op::FV:
-            case RXMESH::Op::FE:
-            case RXMESH::Op::FF:
-                element_patch = context.get_face_patch()[element_id];
-                break;
-            case RXMESH::Op::EV:
-            case RXMESH::Op::EE:
-            case RXMESH::Op::EF:
-                element_patch = context.get_edge_patch()[element_id];
-                break;
-        }
-    }
-
-    // Here, we want to identify the set of unique patches for this thread
-    // block. We do this by first sorting the patches, compute discontinuity
-    // head flag, then threads with head flag =1 can add their patches to the
-    // shared memory buffer that will contain the unique patches
-
-    __shared__ uint32_t s_block_patches[blockThreads];
-    __shared__ uint32_t s_num_patches;
-    if (threadIdx.x == 0) {
-        s_num_patches = 0;
-    }
-    typedef cub::BlockRadixSort<uint32_t, blockThreads, 1>  BlockRadixSort;
-    typedef cub::BlockDiscontinuity<uint32_t, blockThreads> BlockDiscontinuity;
-    union TempStorage
-    {
-        typename BlockRadixSort::TempStorage     sort_storage;
-        typename BlockDiscontinuity::TempStorage discont_storage;
-    };
-    __shared__ TempStorage all_temp_storage;
-    uint32_t               thread_data[1], thread_head_flags[1];
-    thread_data[0] = element_patch;
-    thread_head_flags[0] = 0;
-    BlockRadixSort(all_temp_storage.sort_storage).Sort(thread_data);
-    BlockDiscontinuity(all_temp_storage.discont_storage)
-        .FlagHeads(thread_head_flags, thread_data, cub::Inequality());
-
-    if (thread_head_flags[0] == 1 && thread_data[0] != INVALID32) {
-        uint32_t id = ::atomicAdd(&s_num_patches, uint32_t(1));
-        s_block_patches[id] = thread_data[0];
-    }
-
-    // We could eliminate the discontinuity operation and atomicAdd and instead
-    // use thrust::unique. However, this method causes illegal memory access
-    // and it looks like a bug in thrust
-    /*__syncthreads();
-    // uniquify
-    uint32_t* new_end = thrust::unique(thrust::device, s_block_patches,
-                                       s_block_patches + blockThreads);
-    __syncthreads();
-
-    if (threadIdx.x == 0) {
-        s_num_patches = new_end - s_block_patches - 1;
-    }*/
-    __syncthreads();
-
-
-    for (uint32_t p = 0; p < s_num_patches; ++p) {
-
-        uint32_t patch_id = s_block_patches[p];
-
-        assert(patch_id < context.get_num_patches());
-
-        uint32_t  num_src_in_patch = 0;
-        uint32_t *input_mapping(nullptr), *s_output_mapping(nullptr);
-        uint16_t *s_offset_all_patches(nullptr), *s_output_all_patches(nullptr);
-
-        detail::template query_block_dispatcher<op, blockThreads>(
-            context, patch_id, compute_active_set, oriented, true,
-            num_src_in_patch, input_mapping, s_output_mapping,
-            s_offset_all_patches, s_output_all_patches);
-
-        assert(input_mapping);
-        assert(s_output_all_patches);
-
-
-        if (element_patch == patch_id) {
-
-            uint16_t local_id = INVALID16;
-
-            for (uint16_t j = 0; j < num_src_in_patch; ++j) {
-                if (element_id == s_output_mapping[j]) {
-                    local_id = j;
-                    break;
-                }
-            }
-
-            constexpr uint32_t fixed_offset =
-                ((op == Op::EV)                 ? 2 :
-                 (op == Op::FV || op == Op::FE) ? 3 :
-                                                  0);
-
-            RXMeshIterator iter(local_id, s_output_all_patches,
-                                s_offset_all_patches, s_output_mapping,
-                                fixed_offset, num_src_in_patch,
-                                int(op == Op::FE));
-
-            compute_op(element_id, iter);
-        }
-    }
-}
-
-}  // namespace RXMESH
diff --git a/include/rxmesh/kernels/util.cuh b/include/rxmesh/kernels/util.cuh
index 9ccd97ce..7c2c88ce 100644
--- a/include/rxmesh/kernels/util.cuh
+++ b/include/rxmesh/kernels/util.cuh
@@ -2,87 +2,77 @@
 #include <cuda_runtime.h>
 #include <stdint.h>
 
-namespace RXMESH {
+namespace rxmesh {
 
-/**
- * memcpy()
- */
 template <typename attrT>
 __global__ void memcpy(attrT* d_dest, const attrT* d_src, const uint32_t length)
 {
     const uint32_t stride = blockDim.x * gridDim.x;
-    uint32_t       i = blockDim.x * blockIdx.x + threadIdx.x;
+    uint32_t       i      = blockDim.x * blockIdx.x + threadIdx.x;
     while (i < length) {
         d_dest[i] = d_src[i];
         i += stride;
     }
 }
 
-/**
- * memset()
- */
+
 template <typename attrT>
 __global__ void memset(attrT* d_dest, const attrT val, const uint32_t length)
 {
     const uint32_t stride = blockDim.x * gridDim.x;
-    uint32_t       i = blockDim.x * blockIdx.x + threadIdx.x;
+    uint32_t       i      = blockDim.x * blockIdx.x + threadIdx.x;
     while (i < length) {
         d_dest[i] = val;
         i += stride;
     }
 }
 
-/**
- * atomicAdd() on uint16_t
- */
 __device__ __forceinline__ uint16_t atomicAdd(uint16_t* address, uint16_t val)
 {
     // Taken from
     // https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCAtomics.cuh#L36
-    size_t    offset = (size_t)address & 2;
+    size_t    offset        = (size_t)address & 2;
     uint32_t* address_as_ui = (uint32_t*)((char*)address - offset);
-    bool      is_32_align = offset;
-    uint32_t  old = *address_as_ui;
+    bool      is_32_align   = offset;
+    uint32_t  old           = *address_as_ui;
     uint32_t  old_bytes;
     uint32_t  newval;
     uint32_t  assumed;
 
     do {
-        assumed = old;
+        assumed   = old;
         old_bytes = is_32_align ? old >> 16 : old & 0xffff;
         // preserve size in initial cast. Casting directly to uint32_t pads
         // negative signed values with 1's (e.g. signed -1 = unsigned ~0).
         newval = static_cast<uint16_t>(val + old_bytes);
         newval = is_32_align ? (old & 0xffff) | (newval << 16) :
                                (old & 0xffff0000) | newval;
-        old = atomicCAS(address_as_ui, assumed, newval);
+        old    = atomicCAS(address_as_ui, assumed, newval);
     } while (assumed != old);
     return (is_32_align) ? uint16_t(old >> 16) : uint16_t(old & 0xffff);
 }
 
-/**
- * atomicAdd() on uint8_t
- */
+
 __device__ __forceinline__ uint8_t atomicAdd(uint8_t* address, uint8_t val)
 {
     // Taken from
     // https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCAtomics.cuh#L14
-    size_t    offset = (size_t)address & 3;
+    size_t    offset        = (size_t)address & 3;
     uint32_t* address_as_ui = (uint32_t*)((char*)address - offset);
-    uint32_t  old = *address_as_ui;
-    uint32_t  shift = offset * 8;
+    uint32_t  old           = *address_as_ui;
+    uint32_t  shift         = offset * 8;
     uint32_t  old_byte;
     uint32_t  newval;
     uint32_t  assumed;
 
     do {
-        assumed = old;
+        assumed  = old;
         old_byte = (old >> shift) & 0xff;
         // preserve size in initial cast. Casting directly to uint32_t pads
         // negative signed values with 1's (e.g. signed -1 = unsigned ~0).
         newval = static_cast<uint8_t>(val + old_byte);
         newval = (old & ~(0x000000ff << shift)) | (newval << shift);
-        old = atomicCAS(address_as_ui, assumed, newval);
+        old    = atomicCAS(address_as_ui, assumed, newval);
     } while (assumed != old);
 
     return uint8_t((old >> shift) & 0xff);
@@ -111,7 +101,7 @@ __device__ __forceinline__ unsigned short int atomicCAS(
 #else
     // Taken from
     // https://github.com/rapidsai/cudf/blob/89b802e6cecffe2425048f1f70cd682b865730b8/cpp/include/cudf/detail/utilities/device_atomics.cuh
-    using T_int = unsigned int;
+    using T_int       = unsigned int;
     using T_int_short = unsigned short int;
 
     bool   is_32_align = (reinterpret_cast<size_t>(address) & 2) ? false : true;
@@ -132,7 +122,7 @@ __device__ __forceinline__ unsigned short int atomicCAS(
 
         T_int new_value = (is_32_align) ? (old & 0xffff0000) | u_val :
                                           (old & 0xffff) | (T_int(u_val) << 16);
-        old = ::atomicCAS(address_uint32, assumed, new_value);
+        old             = ::atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return target_value;
@@ -140,9 +130,7 @@ __device__ __forceinline__ unsigned short int atomicCAS(
 #endif
 }
 
-/**
- * dynamic_smem_size()
- */
+
 __device__ __forceinline__ unsigned dynamic_smem_size()
 {
     unsigned ret;
@@ -151,4 +139,4 @@ __device__ __forceinline__ unsigned dynamic_smem_size()
 }
 
 
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace rxmesh
\ No newline at end of file
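The 16-bit and 8-bit atomicAdd overloads above emulate sub-word atomics with an atomicCAS loop on the aligned 32-bit word that contains the target value: read the word, splice the updated byte or half-word back in, and retry until the CAS succeeds. A minimal usage sketch (the kernel, labels, and bins below are hypothetical names, not part of this patch):

    // assumes #include "rxmesh/kernels/util.cuh"
    __global__ void count_labels(const uint32_t* labels,
                                 uint16_t*       bins,
                                 uint32_t        num_labels)
    {
        const uint32_t stride = blockDim.x * gridDim.x;
        uint32_t       i      = blockDim.x * blockIdx.x + threadIdx.x;
        while (i < num_labels) {
            // resolves to the emulated 16-bit overload defined above
            rxmesh::atomicAdd(&bins[labels[i]], uint16_t(1));
            i += stride;
        }
    }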
diff --git a/include/rxmesh/launch_box.h b/include/rxmesh/launch_box.h
index b99198c1..70423d2e 100644
--- a/include/rxmesh/launch_box.h
+++ b/include/rxmesh/launch_box.h
@@ -1,12 +1,18 @@
+#pragma once
 #include <stdint.h>
 
-namespace RXMESH {
+namespace rxmesh {
 
+/**
+ * @brief Stores the parameters needed to launch kernels, i.e., the number of
+ * CUDA blocks and threads and the dynamic shared memory size. These parameters
+ * are meant to be calculated by RXMeshStatic and then used to launch kernels
+ */
 template <uint32_t blockThreads>
 struct LaunchBox
 {
-    uint32_t blocks, smem_bytes_dyn, smem_bytes_static,
-        expected_output_per_block;
+    uint32_t       blocks, num_registers_per_thread;
+    size_t         smem_bytes_dyn, smem_bytes_static;
     const uint32_t num_threads = blockThreads;
 };
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace rxmesh
\ No newline at end of file
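A minimal sketch of how a populated LaunchBox is consumed. The kernel name and its arguments are hypothetical, and the RXMeshStatic call that fills in the box is not part of this diff:

    // assumes #include "rxmesh/launch_box.h"
    constexpr uint32_t blockThreads = 256;
    rxmesh::LaunchBox<blockThreads> launch_box;
    // ... blocks and smem_bytes_dyn are assumed to have been computed
    // by RXMeshStatic before this point ...
    my_kernel<blockThreads><<<launch_box.blocks,
                              launch_box.num_threads,
                              launch_box.smem_bytes_dyn>>>(/* arguments */);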
diff --git a/include/rxmesh/local.h b/include/rxmesh/local.h
new file mode 100644
index 00000000..d8b0fadb
--- /dev/null
+++ b/include/rxmesh/local.h
@@ -0,0 +1,77 @@
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "rxmesh/util/macros.h"
+
+namespace rxmesh {
+
+/**
+ * @brief Local vertex type (wrapped around uint16_t)
+ */
+struct LocalVertexT
+{
+    /**
+     * @brief Default constructor
+     */
+    __device__ __host__ LocalVertexT() : id(INVALID16)
+    {
+    }
+
+    /**
+     * @brief Constructor using local index
+     * @param id vertex local index in the owner patch
+     * @return
+     */
+    __device__ __host__ LocalVertexT(uint16_t id) : id(id)
+    {
+    }
+    uint16_t id;
+};
+
+/**
+ * @brief Local edge type (wrapped around uint16_t)
+ */
+struct LocalEdgeT
+{
+    /**
+     * @brief Default constructor
+     */
+    __device__ __host__ LocalEdgeT() : id(INVALID16)
+    {
+    }
+
+    /**
+     * @brief Constructor using local index
+     * @param id edge local index in the owner patch
+     * @return
+     */
+    __device__ __host__ LocalEdgeT(uint16_t id) : id(id)
+    {
+    }
+    uint16_t id;
+};
+
+/**
+ * @brief Local face type (wrapped around uint16_t)
+ */
+struct LocalFaceT
+{
+    /**
+     * @brief Default constructor
+     */
+    __device__ __host__ LocalFaceT() : id(INVALID16)
+    {
+    }
+
+    /**
+     * @brief Constructor using local index
+     * @param id face local index in the owner patch
+     * @return
+     */
+    __device__ __host__ LocalFaceT(uint16_t id) : id(id)
+    {
+    }
+    uint16_t id;
+};
+
+}  // namespace rxmesh
\ No newline at end of file
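Because all three local handle types default-construct their id to INVALID16, an unassigned handle can be detected without a separate flag. A small sketch:

    rxmesh::LocalEdgeT e;                    // e.id == INVALID16
    bool is_assigned = (e.id != INVALID16);  // false
    e = rxmesh::LocalEdgeT(uint16_t(42));    // a local index within the owner patch
    is_assigned = (e.id != INVALID16);       // true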
diff --git a/include/rxmesh/patch_info.h b/include/rxmesh/patch_info.h
new file mode 100644
index 00000000..f7e3fdd4
--- /dev/null
+++ b/include/rxmesh/patch_info.h
@@ -0,0 +1,43 @@
+#pragma once
+#include <cuda_runtime_api.h>
+#include <stdint.h>
+#include <string>
+#include <utility>
+#include "rxmesh/local.h"
+#include "rxmesh/util/macros.h"
+
+namespace rxmesh {
+
+/**
+ * @brief PatchInfo stores the information needed for query operations in a
+ * patch
+ */
+struct ALIGN(16) PatchInfo
+{
+    // The topology information: edge incident vertices and face incident edges
+    LocalVertexT* ev;
+    LocalEdgeT*   fe;
+
+
+    // Non-owned mesh elements patch ID
+    uint32_t* not_owned_patch_v;
+    uint32_t* not_owned_patch_e;
+    uint32_t* not_owned_patch_f;
+
+
+    // Non-owned mesh elements local ID
+    LocalVertexT* not_owned_id_v;
+    LocalEdgeT*   not_owned_id_e;
+    LocalFaceT*   not_owned_id_f;
+
+    // Number of mesh elements in the patch
+    uint16_t num_vertices, num_edges, num_faces;
+
+    // Number of mesh elements owned by this patch
+    uint16_t num_owned_vertices, num_owned_edges, num_owned_faces;
+
+    // The index of this patch (relative to all other patches)
+    uint32_t patch_id;
+};
+
+}  // namespace rxmesh
\ No newline at end of file
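A minimal device-side sketch of reading the per-patch topology. It assumes the conventional packing of two LocalVertexT per edge in ev and three LocalEdgeT per face in fe, consistent with the 2x-edges/3x-faces sizing asserted in the old query dispatcher that this patch removes; the helper name is hypothetical:

    // assumes #include "rxmesh/patch_info.h"
    __device__ void face_edges(const rxmesh::PatchInfo& patch,
                               uint16_t                 f,
                               rxmesh::LocalEdgeT       out[3])
    {
        // every (triangle) face is bounded by three edges
        assert(f < patch.num_faces);
        out[0] = patch.fe[3 * f + 0];
        out[1] = patch.fe[3 * f + 1];
        out[2] = patch.fe[3 * f + 2];
    }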
diff --git a/include/rxmesh/patcher/patcher.cu b/include/rxmesh/patcher/patcher.cu
index b1599a63..576305cd 100644
--- a/include/rxmesh/patcher/patcher.cu
+++ b/include/rxmesh/patcher/patcher.cu
@@ -3,36 +3,61 @@
 #include <functional>
 #include <iomanip>
 #include <queue>
+#include <unordered_map>
 #include "cub/device/device_radix_sort.cuh"
 #include "cub/device/device_scan.cuh"
 #include "cuda_profiler_api.h"
 #include "rxmesh/kernels/util.cuh"
 #include "rxmesh/patcher/patcher.h"
 #include "rxmesh/patcher/patcher_kernel.cuh"
-#include "rxmesh/util/export_tools.h"
 #include "rxmesh/util/log.h"
 #include "rxmesh/util/macros.h"
 #include "rxmesh/util/timer.h"
 #include "rxmesh/util/util.h"
 
 
-extern std::vector<std::vector<float>> Verts;  // TODO remove this
-namespace RXMESH {
-
-
-namespace PATCHER {
-
-//********************** Constructors/Destructors
-Patcher::Patcher(uint32_t                                  patch_size,
-                 const std::vector<std::vector<uint32_t>>& fvn,
-                 const uint32_t                            num_vertices,
-                 const uint32_t                            num_edges,
-                 const bool is_multi_component /* = true*/,
-                 const bool quite /*=true*/)
-    : m_patch_size(patch_size), m_fvn(fvn), m_num_vertices(num_vertices),
-      m_num_edges(num_edges), m_num_faces(fvn.size()), m_num_seeds(0),
-      m_max_num_patches(0), m_is_multi_component(is_multi_component),
-      m_quite(quite), m_num_components(0), m_patching_time_ms(0)
+namespace rxmesh {
+
+
+namespace patcher {
+
+Patcher::Patcher(uint32_t                                        patch_size,
+                 const std::vector<uint32_t>&                    ff_offset,
+                 const std::vector<uint32_t>&                    ff_values,
+                 const std::vector<std::vector<uint32_t>>&       fv,
+                 const std::unordered_map<std::pair<uint32_t, uint32_t>,
+                                          uint32_t,
+                                          detail::edge_key_hash> edges_map,
+                 const uint32_t                                  num_vertices,
+                 const uint32_t                                  num_edges,
+                 const bool                                      quite)
+    : m_patch_size(patch_size),
+      m_num_patches(0),
+      m_num_vertices(num_vertices),
+      m_num_edges(num_edges),
+      m_num_faces(fv.size()),
+      m_num_seeds(0),
+      m_max_num_patches(0),
+      m_num_components(0),
+      m_num_lloyd_run(0),
+      m_d_face_patch(nullptr),
+      m_d_vertex_patch(nullptr),
+      m_d_edge_patch(nullptr),
+      m_d_patches_offset(nullptr),
+      m_d_patches_size(nullptr),
+      m_d_patches_val(nullptr),
+      m_patching_time_ms(0.0),
+      m_d_seeds(nullptr),
+      m_d_ff_values(nullptr),
+      m_d_ff_offset(nullptr),
+      m_d_queue(nullptr),
+      m_d_queue_ptr(nullptr),
+      m_d_new_num_patches(nullptr),
+      m_d_max_patch_size(nullptr),
+      m_d_cub_temp_storage_scan(nullptr),
+      m_d_cub_temp_storage_max(nullptr),
+      m_cub_scan_bytes(0),
+      m_cub_max_bytes(0)
 {
 
     m_num_patches =
@@ -42,11 +67,44 @@ Patcher::Patcher(uint32_t                                  patch_size,
 
     m_num_seeds = m_num_patches;
 
-    mem_alloc();
+    allocate_memory();
+
+    // degenerate cases
+    if (m_num_patches <= 1) {
+        m_patches_offset[0] = m_num_faces;
+        m_num_seeds         = 1;
+        m_num_components    = 1;
+        m_num_lloyd_run     = 0;
+        for (uint32_t i = 0; i < m_num_faces; ++i) {
+            m_face_patch[i]  = 0;
+            m_patches_val[i] = i;
+        }
+        allocate_device_memory(ff_offset, ff_values);
+        assign_patch(fv, edges_map);
+    } else {
+
+        initialize_random_seeds(ff_offset, ff_values);
+        allocate_device_memory(ff_offset, ff_values);
+        run_lloyd();
+        postprocess(fv, ff_offset, ff_values);
+        assign_patch(fv, edges_map);
+    }
+
+    if (!quite) {
+        print_statistics();
+    }
 }
 
-void Patcher::mem_alloc()
+Patcher::~Patcher()
 {
+    GPU_FREE(m_d_face_patch);
+    GPU_FREE(m_d_vertex_patch);
+    GPU_FREE(m_d_edge_patch);
+}
+
+void Patcher::allocate_memory()
+{
+    m_seeds.reserve(m_num_seeds);
 
     // patches assigned to each face, vertex, and edge
     m_face_patch.resize(m_num_faces);
@@ -61,14 +119,9 @@ void Patcher::mem_alloc()
     // explicit patches in compressed format
     m_patches_val.resize(m_num_faces);
 
-    // we allow upto double the number of faces due to patch bisecting
+    // we allow up to double the number of faces due to patch bisecting
     m_patches_offset.resize(m_max_num_patches);
 
-    // used to track the frontier and current seeds
-    m_frontier.resize(m_num_faces, INVALID32);
-    m_tf.resize(3);
-    m_seeds.reserve(m_num_seeds);
-
     // external ribbon. it assumes first that all faces will be in there and
     // then shrink to fit after the construction is done
     m_ribbon_ext_offset.resize(m_max_num_patches, 0);
@@ -76,13 +129,89 @@ void Patcher::mem_alloc()
     m_ribbon_ext_val.resize(m_num_faces);
 }
 
-Patcher::~Patcher()
+void Patcher::allocate_device_memory(const std::vector<uint32_t>& ff_offset,
+                                     const std::vector<uint32_t>& ff_values)
 {
-}
-//**************************************************************************
+    // ff
+    CUDA_ERROR(cudaMalloc((void**)&m_d_ff_values,
+                          ff_values.size() * sizeof(uint32_t)));
+    CUDA_ERROR(cudaMalloc((void**)&m_d_ff_offset,
+                          ff_offset.size() * sizeof(uint32_t)));
+
+    CUDA_ERROR(cudaMemcpy((void**)m_d_ff_values,
+                          ff_values.data(),
+                          ff_values.size() * sizeof(uint32_t),
+                          cudaMemcpyHostToDevice));
+
+    CUDA_ERROR(cudaMemcpy((void**)m_d_ff_offset,
+                          ff_offset.data(),
+                          ff_offset.size() * sizeof(uint32_t),
+                          cudaMemcpyHostToDevice));
+    // face/vertex/edge patch
+    CUDA_ERROR(
+        cudaMalloc((void**)&m_d_face_patch, m_num_faces * sizeof(uint32_t)));
+    CUDA_ERROR(cudaMalloc((void**)&m_d_vertex_patch,
+                          m_num_vertices * sizeof(uint32_t)));
+    CUDA_ERROR(
+        cudaMalloc((void**)&m_d_edge_patch, m_num_edges * sizeof(uint32_t)));
+
+    // seeds
+    CUDA_ERROR(
+        cudaMalloc((void**)&m_d_seeds, m_max_num_patches * sizeof(uint32_t)));
+
+    CUDA_ERROR(cudaMemcpy((void**)m_d_seeds,
+                          m_seeds.data(),
+                          m_num_patches * sizeof(uint32_t),
+                          cudaMemcpyHostToDevice));
 
+    // utility
+    // 0 -> queue start
+    // 1 -> queue end
+    // 2 -> next queue end
+    std::vector<uint32_t> h_queue_ptr{0, m_num_patches, m_num_patches};
+    CUDA_ERROR(cudaMalloc((void**)&m_d_queue, m_num_faces * sizeof(uint32_t)));
+    CUDA_ERROR(cudaMalloc((void**)&m_d_queue_ptr, 3 * sizeof(uint32_t)));
+    CUDA_ERROR(cudaMemcpy(m_d_queue_ptr,
+                          h_queue_ptr.data(),
+                          3 * sizeof(uint32_t),
+                          cudaMemcpyHostToDevice));
+
+    // patch offset/size/value and max patch size
+    CUDA_ERROR(cudaMalloc((void**)&m_d_patches_offset,
+                          m_max_num_patches * sizeof(uint32_t)));
+    CUDA_ERROR(cudaMalloc((void**)&m_d_patches_size,
+                          m_max_num_patches * sizeof(uint32_t)));
+    CUDA_ERROR(
+        cudaMalloc((void**)&m_d_patches_val, m_num_faces * sizeof(uint32_t)));
+    CUDA_ERROR(cudaMalloc((void**)&m_d_max_patch_size, sizeof(uint32_t)));
+
+    CUDA_ERROR(cudaMalloc((void**)&m_d_new_num_patches, sizeof(uint32_t)));
+
+    CUDA_ERROR(cudaMemcpy((void**)m_d_new_num_patches,
+                          &m_num_patches,
+                          sizeof(uint32_t),
+                          cudaMemcpyHostToDevice));
+
+    // CUB temp memory
+    m_d_cub_temp_storage_scan = nullptr;
+    m_d_cub_temp_storage_max  = nullptr;
+    m_cub_scan_bytes          = 0;
+    m_cub_max_bytes           = 0;
+    ::cub::DeviceScan::InclusiveSum(m_d_cub_temp_storage_scan,
+                                    m_cub_scan_bytes,
+                                    m_d_patches_size,
+                                    m_d_patches_offset,
+                                    m_max_num_patches);
+    ::cub::DeviceReduce::Max(m_d_cub_temp_storage_max,
+                             m_cub_max_bytes,
+                             m_d_patches_size,
+                             m_d_max_patch_size,
+                             m_max_num_patches);
+    CUDA_ERROR(
+        cudaMalloc((void**)&m_d_cub_temp_storage_scan, m_cub_scan_bytes));
+    CUDA_ERROR(cudaMalloc((void**)&m_d_cub_temp_storage_max, m_cub_max_bytes));
+}
 
-//********************** Exporters/Importer
 void Patcher::print_statistics()
 {
     RXMESH_TRACE("Patcher: num_patches = {}", m_num_patches);
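allocate_device_memory above relies on the standard two-phase CUB idiom: calling DeviceScan::InclusiveSum or DeviceReduce::Max with a null temporary-storage pointer only writes the required byte count, and the actual scan or reduction happens only when the call is repeated later with the allocated buffer. The generic pattern, with d_in, d_out, and num_items as placeholders:

    void*  d_temp_storage     = nullptr;
    size_t temp_storage_bytes = 0;
    // first call: size query only (d_temp_storage == nullptr)
    cub::DeviceScan::InclusiveSum(
        d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    CUDA_ERROR(cudaMalloc(&d_temp_storage, temp_storage_bytes));
    // second call: performs the inclusive scan
    cub::DeviceScan::InclusiveSum(
        d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);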
@@ -94,325 +223,101 @@ void Patcher::print_statistics()
     RXMESH_TRACE(
         "Patcher: Parallel patches construction time = {} (ms) and {} "
         "(ms/lloyd_run)",
-        m_patching_time_ms, m_patching_time_ms / float(m_num_lloyd_run));
+        m_patching_time_ms,
+        m_patching_time_ms / float(m_num_lloyd_run));
 
     // max-min patch size
     uint32_t max_patch_size(0), min_patch_size(m_num_faces), avg_patch_size(0);
     get_max_min_avg_patch_size(min_patch_size, max_patch_size, avg_patch_size);
     RXMESH_TRACE(
         "Patcher: max_patch_size= {}, min_patch_size= {}, avg_patch_size= {}",
-        max_patch_size, min_patch_size, avg_patch_size);
+        max_patch_size,
+        min_patch_size,
+        avg_patch_size);
 
     RXMESH_TRACE("Patcher: number external ribbon faces = {} ({:02.2f}%)",
-                 get_num_ext_ribbon_faces(), get_ribbon_overhead());
-
-    /*std::string filename = "patch_dist.txt";
-    filename = STRINGIFY(OUTPUT_DIR) + filename;
-    std::fstream file(filename.c_str(), std::ios::out);
-    file.precision(15);
-    for (uint32_t p = 0; p < m_num_patches; p++) {
-        uint32_t p_size =
-            m_patches_offset[p] - ((p == 0) ? 0 : m_patches_offset[p - 1]);
-        file << p_size << "\n";
-    }
-    file.close();*/
+                 get_num_ext_ribbon_faces(),
+                 get_ribbon_overhead());
 }
 
-template <class T_d>
-void Patcher::export_ext_ribbon(const std::vector<std::vector<T_d>>& Verts,
-                                int                                  patch_id)
+void Patcher::initialize_random_seeds(const std::vector<uint32_t>& ff_offset,
+                                      const std::vector<uint32_t>& ff_values)
 {
-    uint32_t start = ((patch_id == 0) ? 0 : m_ribbon_ext_offset[patch_id - 1]);
-    export_face_list("ribbon_ext" + std::to_string(patch_id) + ".obj", m_fvn,
-                     Verts, m_ribbon_ext_offset[patch_id] - start,
-                     m_ribbon_ext_val.data() + start);
-}
 
-template <class T_d>
-void Patcher::export_patches(const std::vector<std::vector<T_d>>& Verts)
-{
-    export_attribute_VTK("patches.vtk", m_fvn, Verts, 1, m_face_patch.data(),
-                         m_vertex_patch.data(), false);
-
-    /*if (!m_vertex_patch.empty()) {
-        export_as_cubes_VTK(
-            "patches_vertex.vtk", m_num_vertices, 0.05f, m_vertex_patch.data(),
-            [&Verts](uint32_t i) { return Verts[i][0]; },
-            [&Verts](uint32_t i) { return Verts[i][1]; },
-            [&Verts](uint32_t i) { return Verts[i][2]; }, m_num_patches, false);
-    }*/
-}
+    // 1) Identify the components i.e., for each component list the faces
+    // that belong to it
+    // 2) Generate a number of (random) seeds in each component
+    // proportional to the number of faces it contains
 
+    std::vector<std::vector<uint32_t>> components;
+    get_multi_components(components, ff_offset, ff_values);
 
-template <class T_d>
-void Patcher::export_components(
-    const std::vector<std::vector<T_d>>&      Verts,
-    const std::vector<std::vector<uint32_t>>& components)
-{
-
-    uint32_t           num_components = components.size();
-    std::vector<float> rand_color(num_components + 1);
-    for (uint32_t i = 0; i < num_components; ++i) {
-        rand_color[i] = float(rand()) / float(RAND_MAX);
-    }
-    rand_color[num_components] = 0.0f;
-    std::vector<uint32_t> face_component(m_num_faces, INVALID32);
-    uint32_t              comp_id = 0;
-    for (const auto& comp : components) {
-        for (const auto& cf : comp) {
-            assert(face_component[cf] == INVALID32);
-            face_component[cf] = comp_id;
-        }
-        ++comp_id;
-    }
-    export_attribute_VTK("components.vtk", m_fvn, Verts, 1,
-                         face_component.data(), face_component.data(),
-                         num_components, false, rand_color.data());
-}
-
-
-template <class T_d>
-void Patcher::export_single_patch(const std::vector<std::vector<T_d>>& Verts,
-                                  int                                  patch_id)
-{
-
-    std::vector<uint32_t> vf1(3);
-
-    std::string filename =
-        STRINGIFY(OUTPUT_DIR) + ("patch_" + std::to_string(patch_id) + ".obj");
-
-    std::fstream file(filename, std::ios::out);
-
-    for (uint32_t i = 0; i < Verts.size(); ++i) {
-        file << "v " << Verts[i][0] << " " << Verts[i][1] << " " << Verts[i][2]
-             << std::endl;
-    }
-
-    uint32_t p_start = (patch_id == 0) ? 0 : m_patches_offset[patch_id - 1];
-    uint32_t p_end = m_patches_offset[patch_id];
-
-    for (uint32_t fb = p_start; fb < p_end; ++fb) {
-        uint32_t face = m_patches_val[fb];
-
-        get_incident_vertices(face, vf1);
-
-        file << "f " << vf1[0] + 1 << " " << vf1[1] + 1 << " " << vf1[2] + 1
-             << std::endl;
-    }
-}
-
-template <class T_d, typename EdgeIDFunc>
-void Patcher::export_single_patch_edges(
-    const std::vector<std::vector<T_d>>& Verts,
-    int                                  patch_id,
-    EdgeIDFunc                           get_edge_id)
-{
-    // export edges of that are assigned to patch_id
-
-    std::string filename = STRINGIFY(OUTPUT_DIR) +
-                           ("patch_edges_" + std::to_string(patch_id) + ".obj");
-
-    std::fstream file(filename, std::ios::out);
-
-    for (uint32_t i = 0; i < Verts.size(); ++i) {
-        file << "v " << Verts[i][0] << " " << Verts[i][1] << " " << Verts[i][2]
-             << std::endl;
-    }
-
-
-    std::vector<uint32_t> vf1(3);
-
-    for (uint32_t f = 0; f < m_num_faces; ++f) {
-        get_incident_vertices(f, vf1);
-
-        uint32_t v1 = vf1.back();
-        for (uint32_t v = 0; v < vf1.size(); ++v) {
-            uint32_t v0 = vf1[v];
-
-            uint32_t edge_id = get_edge_id(v0, v1);
-
-            if (get_edge_patch_id(edge_id) == patch_id) {
-                file << "f " << v0 + 1 << " " << v1 + 1 << " " << v0 + 1
-                     << std::endl;
-            }
-            v1 = v0;
-        }
-    }
-}
-//**************************************************************************
-
-
-//********************** executer/internal utilities
-void Patcher::execute(std::function<uint32_t(uint32_t, uint32_t)> get_edge_id,
-                      const std::vector<std::vector<uint32_t>>&   ef)
-{
-
-    // degenerate cases
-    if (m_num_patches <= 1) {
-        m_patches_offset[0] = m_num_faces;
-
-        for (uint32_t i = 0; i < m_num_faces; ++i) {
-            m_face_patch[i] = 0;
-            m_patches_val[i] = i;
-        }        
-        m_neighbour_patches_offset.resize(1, 0);
-        assign_patch(get_edge_id);
-        if (!m_quite) {
-            print_statistics();
-        }
-        return;
-    }
-
-    parallel_execute(ef);
-
-    postprocess();
-
-    // export_patches(Verts);
-    // for (uint32_t i = 0; i < m_num_patches;++i){
-    //	export_ext_ribbon(Verts, i);
-    //}
-
-    m_ribbon_ext_val.resize(m_ribbon_ext_offset[m_num_patches - 1]);
-
-    // assign patches to vertices and edges
-    assign_patch(get_edge_id);
-
-    // export_single_patch_edges(Verts, 0, get_edge_id);
-
-
-    if (!m_quite) {
-        print_statistics();
-    }
-}
-
-void Patcher::initialize_cluster_seeds()
-{
-    // cluster i.e., start from one triangle and grow in bfs style from it
-    // for experiments only
-
-    double   r = double(rand()) / double(RAND_MAX);
-    uint32_t rand_face =
-        static_cast<uint32_t>(r * static_cast<double>(m_num_faces - 1));
-    std::queue<uint32_t> qu;
-    qu.push(rand_face);
-
-    std::vector<uint32_t> n_faces(3);
-    std::vector<uint32_t> taken;
-    taken.push_back(rand_face);
-
-    while (true) {
-        uint32_t current_face = qu.front();
-        qu.pop();
-
-        m_seeds.push_back(current_face);
-
-        if (m_seeds.size() == m_num_seeds) {
-            return;
-        }
-
-        get_adjacent_faces(current_face, n_faces);
-
-        for (uint32_t i = 0; i < n_faces.size(); i++) {
-            uint32_t ff = n_faces[i];
-            if (ff == SPECIAL || ff == INVALID32 ||
-                find_index(ff, taken) != std::numeric_limits<uint32_t>::max()) {
-                continue;
-            }
-            qu.push(ff);
-            taken.push_back(ff);
-        }
-    }
-}
-
-void Patcher::initialize_random_seeds()
-{
-    // random
-    if (!m_is_multi_component) {
+    m_num_components = components.size();
+    if (m_num_components == 1) {
         initialize_random_seeds_single_component();
     } else {
-        // if multi-component,
-        // 1) Identify the components i.e., for each component list the faces
-        // that belong to that it
-        // 2) Generate number of (random) seeds in each component
-        // proportional to the number of faces it contain
-
-        std::vector<std::vector<uint32_t>> components;
-        get_multi_components(components);
-
-        // export_components(Verts, components);
-
-        m_num_components = components.size();
-        if (m_num_components == 1) {
-            initialize_random_seeds_single_component();
+        if (m_num_seeds <= m_num_components) {
+            // we have too many components, so we increase the number of
+            // seeds. This case should not be encountered frequently
+            // since we generate only one seed per component
+            m_num_seeds = m_num_components;
+            for (auto& comp : components) {
+                generate_random_seed_from_component(comp, 1);
+            }
         } else {
-            if (m_num_seeds <= m_num_components) {
-                // we have too many components so we increase the number of
-                // seeds. this case should not be encountered frequently
-                // since we generate only one seed per component
-                m_num_seeds = m_num_components;
-                for (auto& comp : components) {
-                    generate_random_seed_from_component(comp, 1);
-                }
-            } else {
-                // if we have more seeds to give than the number of components,
-                // then first secure that we have at least one seed per
-                // component then we calculate the number of extra/remaining
-                // seeds that will need be added. Every component then will have
-                // a weight proportional to its size that tells how many of
-                // these remaining seeds it can take
-
-                uint32_t num_remaining_seeds = m_num_seeds - m_num_components;
-                uint32_t num_extra_seeds_inserted = 0;
-
-                // sort the order of the component to be processed by their size
-                std::vector<size_t> component_order(components.size());
-                fill_with_sequential_numbers(component_order.data(),
-                                             component_order.size());
-                std::sort(component_order.begin(), component_order.end(),
-                          [&components](const size_t& a, const size_t& b) {
-                              return components[a].size() >
-                                     components[b].size();
-                          });
-
-                // process components in descending order with repsect to their
-                // size
-                for (size_t c = 0; c < component_order.size(); ++c) {
-
-                    std::vector<uint32_t>& comp =
-                        components[component_order[c]];
-
-                    uint32_t size = comp.size();
-                    // this weight tells how many extra faces this component
-                    // have from num_remaining_seeds
-                    float weight = static_cast<float>(size) /
-                                   static_cast<float>(m_num_faces);
-                    uint32_t component_num_seeds =
-                        static_cast<uint32_t>(std::ceil(
-                            weight * static_cast<float>(num_remaining_seeds)));
-
-
-                    num_extra_seeds_inserted += component_num_seeds;
-                    if (num_extra_seeds_inserted > num_remaining_seeds) {
-                        if (num_extra_seeds_inserted - num_remaining_seeds >
-                            component_num_seeds) {
-                            component_num_seeds = 0;
-                        } else {
-                            component_num_seeds -= (num_extra_seeds_inserted -
-                                                    num_remaining_seeds);
-                        }
+            // if we have more seeds to give than the number of components,
+            // we first ensure that every component gets at least one seed,
+            // then we calculate the number of extra/remaining seeds that
+            // need to be added. Every component then gets a weight
+            // proportional to its size that tells how many of these
+            // remaining seeds it can take
+
+            uint32_t num_remaining_seeds      = m_num_seeds - m_num_components;
+            uint32_t num_extra_seeds_inserted = 0;
+
+            // sort the order of the component to be processed by their size
+            std::vector<size_t> component_order(components.size());
+            fill_with_sequential_numbers(component_order.data(),
+                                         component_order.size());
+            std::sort(component_order.begin(),
+                      component_order.end(),
+                      [&components](const size_t& a, const size_t& b) {
+                          return components[a].size() > components[b].size();
+                      });
+
+            // process components in descending order with respect to their
+            // size
+            for (size_t c = 0; c < component_order.size(); ++c) {
+
+                std::vector<uint32_t>& comp = components[component_order[c]];
+
+                uint32_t size = comp.size();
+                // this weight tells how many of the extra seeds this
+                // component gets from num_remaining_seeds
+                float weight =
+                    static_cast<float>(size) / static_cast<float>(m_num_faces);
+                uint32_t component_num_seeds = static_cast<uint32_t>(std::ceil(
+                    weight * static_cast<float>(num_remaining_seeds)));
+
+
+                num_extra_seeds_inserted += component_num_seeds;
+                if (num_extra_seeds_inserted > num_remaining_seeds) {
+                    if (num_extra_seeds_inserted - num_remaining_seeds >
+                        component_num_seeds) {
+                        component_num_seeds = 0;
+                    } else {
+                        component_num_seeds -=
+                            (num_extra_seeds_inserted - num_remaining_seeds);
                     }
-
-                    component_num_seeds += 1;
-                    generate_random_seed_from_component(comp,
-                                                        component_num_seeds);
                 }
+
+                component_num_seeds += 1;
+                generate_random_seed_from_component(comp, component_num_seeds);
             }
         }
     }
 
-
-    // export_face_list("seeds.obj", m_fvn, Verts, uint32_t(m_seeds.size()),
-    //                 m_seeds.data());
+    assert(m_num_patches == m_seeds.size());
 }
 
 void Patcher::initialize_random_seeds_single_component()
@@ -422,8 +327,8 @@ void Patcher::initialize_random_seeds_single_component()
     fill_with_sequential_numbers(rand_num.data(), rand_num.size());
     random_shuffle(rand_num.data(), rand_num.size());
     m_seeds.resize(m_num_seeds);
-    std::memcpy(m_seeds.data(), rand_num.data(),
-                m_num_seeds * sizeof(uint32_t));
+    std::memcpy(
+        m_seeds.data(), rand_num.data(), m_num_seeds * sizeof(uint32_t));
 }
 
 void Patcher::generate_random_seed_from_component(
@@ -441,16 +346,18 @@ void Patcher::generate_random_seed_from_component(
 
     random_shuffle(component.data(), component.size());
     m_seeds.resize(num_seeds_before + num_seeds);
-    std::memcpy(m_seeds.data() + num_seeds_before, component.data(),
+    std::memcpy(m_seeds.data() + num_seeds_before,
+                component.data(),
                 num_seeds * sizeof(uint32_t));
 }
 
 
 void Patcher::get_multi_components(
-    std::vector<std::vector<uint32_t>>& components)
+    std::vector<std::vector<uint32_t>>& components,
+    const std::vector<uint32_t>&        ff_offset,
+    const std::vector<uint32_t>&        ff_values)
 {
-    std::vector<bool>     visited(m_num_faces, false);
-    std::vector<uint32_t> ff(3);
+    std::vector<bool> visited(m_num_faces, false);
     for (uint32_t f = 0; f < m_num_faces; ++f) {
         if (!visited[f]) {
             std::vector<uint32_t> current_component;
@@ -461,15 +368,16 @@ void Patcher::get_multi_components(
             std::queue<uint32_t> face_queue;
             face_queue.push(f);
             while (!face_queue.empty()) {
-                uint32_t current_face = face_queue.front();
+                uint32_t face = face_queue.front();
                 face_queue.pop();
-                get_adjacent_faces(current_face, ff);
-
-                for (const auto& f : ff) {
-                    if (!visited[f]) {
-                        current_component.push_back(f);
-                        face_queue.push(f);
-                        visited[f] = true;
+                uint32_t start = (face == 0) ? 0 : ff_offset[face - 1];
+                uint32_t end   = ff_offset[face];
+                for (uint32_t f = start; f < end; ++f) {
+                    uint32_t n_face = ff_values[f];
+                    if (!visited[n_face]) {
+                        current_component.push_back(n_face);
+                        face_queue.push(n_face);
+                        visited[n_face] = true;
                     }
                 }
             }
@@ -479,10 +387,11 @@ void Patcher::get_multi_components(
     }
 }
 
-void Patcher::postprocess()
+void Patcher::postprocess(const std::vector<std::vector<uint32_t>>& fv,
+                          const std::vector<uint32_t>&              ff_offset,
+                          const std::vector<uint32_t>&              ff_values)
 {
-    // Post process the patches by extracting the ribbons and populate the
-    // neighbour patches storage
+    // Post-process the patches by extracting the ribbons
     //
     // For patch P, we start first by identifying boundary faces; faces that has
     // an edge on P's boundary. These faces are captured by querying the
@@ -491,13 +400,11 @@ void Patcher::postprocess()
     // faces we can extract boundary vertices. We also now know which patch is
     // neighbor to P. Then we can use the boundary vertices to find the faces
     // that are incident to these vertices on the neighbor patches
+    std::vector<uint32_t> frontier;
+    frontier.reserve(m_num_faces);
 
     std::vector<uint32_t> bd_vertices;
     bd_vertices.reserve(m_patch_size);
-    std::vector<uint32_t> vf1(3), vf2(3);
-
-    m_neighbour_patches_offset.resize(m_num_patches);
-    m_neighbour_patches.reserve(m_num_patches * 3);
 
     // build vertex incident faces
     std::vector<std::vector<uint32_t>> vertex_incident_faces(
@@ -506,23 +413,18 @@ void Patcher::postprocess()
         vertex_incident_faces[i].clear();
     }
     for (uint32_t face = 0; face < m_num_faces; ++face) {
-        get_incident_vertices(face, vf1);
-        for (uint32_t v = 0; v < vf1.size(); ++v) {
-            vertex_incident_faces[vf1[v]].push_back(face);
+        for (uint32_t v = 0; v < fv[face].size(); ++v) {
+            vertex_incident_faces[fv[face][v]].push_back(face);
         }
     }
 
     for (uint32_t cur_p = 0; cur_p < m_num_patches; ++cur_p) {
 
         uint32_t p_start = (cur_p == 0) ? 0 : m_patches_offset[cur_p - 1];
-        uint32_t p_end = m_patches_offset[cur_p];
-
-        m_neighbour_patches_offset[cur_p] =
-            (cur_p == 0) ? 0 : m_neighbour_patches_offset[cur_p - 1];
-        uint32_t neighbour_patch_start = m_neighbour_patches_offset[cur_p];
+        uint32_t p_end   = m_patches_offset[cur_p];
 
         bd_vertices.clear();
-        m_frontier.clear();
+        frontier.clear();
 
 
         //***** Pass One
@@ -531,49 +433,36 @@ void Patcher::postprocess()
         for (uint32_t fb = p_start; fb < p_end; ++fb) {
             uint32_t face = m_patches_val[fb];
 
-            get_adjacent_faces(face, m_tf);
+            bool     added = false;
+            uint32_t start = (face == 0) ? 0 : ff_offset[face - 1];
+            uint32_t end   = ff_offset[face];
 
-            bool added = false;
-            for (uint32_t g = 0; g < m_tf.size(); ++g) {
-                uint32_t n = m_tf[g];
+            for (uint32_t g = start; g < end; ++g) {
+                uint32_t n       = ff_values[g];
                 uint32_t n_patch = get_face_patch_id(n);
 
                 // n is boundary face if its patch is not the current patch we
                 // are processing
                 if (n_patch != cur_p) {
                     if (!added) {
-                        m_frontier.push_back(face);
+                        frontier.push_back(face);
                         added = true;
                     }
 
-                    // add n_patch as a neighbour patch to the current patch
-                    auto itt = std::find(
-                        m_neighbour_patches.begin() + neighbour_patch_start,
-                        m_neighbour_patches.end(), n_patch);
-
-                    if (itt == m_neighbour_patches.end()) {
-                        m_neighbour_patches.push_back(n_patch);
-                        ++m_neighbour_patches_offset[cur_p];
-                        assert(m_neighbour_patches_offset[cur_p] ==
-                               m_neighbour_patches.size());
-                    }
-
                     // find/add the boundary vertices; these are the vertices
                     // that are shared between face and n
-                    get_incident_vertices(face, vf1);
-                    get_incident_vertices(n, vf2);
-
-                    // add the common vertices in vf1 and vf2
-                    for (uint32_t i = 0; i < vf1.size(); ++i) {
-                        auto it_vf = std::find(vf2.begin(), vf2.end(), vf1[i]);
-                        if (it_vf != vf2.end()) {
-                            bd_vertices.push_back(vf1[i]);
+
+                    // add the common vertices in fv[face] and fv[n]
+                    for (uint32_t i = 0; i < fv[face].size(); ++i) {
+                        auto it_vf =
+                            std::find(fv[n].begin(), fv[n].end(), fv[face][i]);
+                        if (it_vf != fv[n].end()) {
+                            bd_vertices.push_back(fv[face][i]);
                         }
                     }
 
-
                     // we don't break out of this loop because we want to get
-                    // all the neighbour patches and boundary vertices
+                    // all the boundary vertices
                     // break;
                 }
             }
@@ -585,13 +474,6 @@ void Patcher::postprocess()
         inplace_remove_duplicates_sorted(bd_vertices);
 
 
-        // export_as_cubes("cubes" + std::to_string(cur_p) + ".obj",
-        //    bd_vertices.size(), 0.01f,
-        //    [&bd_vertices](uint32_t i) {return Verts[bd_vertices[i]][0]; },
-        //    [&bd_vertices](uint32_t i) {return Verts[bd_vertices[i]][1]; },
-        //    [&bd_vertices](uint32_t i) {return Verts[bd_vertices[i]][2]; });
-
-
         //***** Pass Two
 
         // 3) for every vertex on the patch boundary, we add all the faces
@@ -636,61 +518,37 @@ void Patcher::postprocess()
             }
         }
     }
-}
-
-void Patcher::get_adjacent_faces(uint32_t               face_id,
-                                 std::vector<uint32_t>& ff) const
-{
-    if (m_fvn.size() != 0) {
-        // We account here for non-manifold cases where a face might not be
-        // adjacent to just three faces
-        uint32_t size = m_fvn[face_id].size() - 3;
-        ff.resize(size);
-        std::memcpy(ff.data(), m_fvn[face_id].data() + 3,
-                    size * sizeof(uint32_t));
-    } else {
-        RXMESH_ERROR(
-            "Patcher::get_adjacent_faces() can not get adjacent faces!!");
-    }
-}
 
-void Patcher::get_incident_vertices(uint32_t face_id, std::vector<uint32_t>& fv)
-{
-    if (m_fvn.size() != 0) {
-        fv.resize(3);
-        std::memcpy(fv.data(), m_fvn[face_id].data(), 3 * sizeof(uint32_t));
-    } else {
-        RXMESH_ERROR(
-            "Patcher::get_incident_vertices() can not get adjacent faces!!");
-    }
+    m_ribbon_ext_val.resize(m_ribbon_ext_offset[m_num_patches - 1]);
 }
 
 void Patcher::assign_patch(
-    std::function<uint32_t(uint32_t, uint32_t)> get_edge_id)
+    const std::vector<std::vector<uint32_t>>&                 fv,
+    const std::unordered_map<std::pair<uint32_t, uint32_t>,
+                             uint32_t,
+                             ::rxmesh::detail::edge_key_hash> edges_map)
 {
     // For every patch p, for every face in the patch, find the three edges
     // that bound that face, and assign them to the patch. For boundary vertices
     // and edges assign them to one patch (TODO smallest face count). For now,
     // we assign it to the first patch
 
-    std::vector<uint32_t> vf1(3);
-
     for (uint32_t cur_p = 0; cur_p < m_num_patches; ++cur_p) {
 
         uint32_t p_start = (cur_p == 0) ? 0 : m_patches_offset[cur_p - 1];
-        uint32_t p_end = m_patches_offset[cur_p];
+        uint32_t p_end   = m_patches_offset[cur_p];
 
         for (uint32_t f = p_start; f < p_end; ++f) {
 
             uint32_t face = m_patches_val[f];
 
-            get_incident_vertices(face, vf1);
+            uint32_t v1 = fv[face].back();
+            for (uint32_t v = 0; v < fv[face].size(); ++v) {
+                uint32_t v0 = fv[face][v];
 
-            uint32_t v1 = vf1.back();
-            for (uint32_t v = 0; v < vf1.size(); ++v) {
-                uint32_t v0 = vf1[v];
-
-                uint32_t edge_id = get_edge_id(v0, v1);
+                std::pair<uint32_t, uint32_t> key =
+                    ::rxmesh::detail::edge_key(v0, v1);
+                uint32_t edge_id = edges_map.at(key);
 
                 if (m_vertex_patch[v0] == INVALID32) {
                     m_vertex_patch[v0] = cur_p;
@@ -704,170 +562,23 @@ void Patcher::assign_patch(
             }
         }
     }
-}
 
-//********************** Parallel Execute
-void Patcher::populate_ff(const std::vector<std::vector<uint32_t>>& ef,
-                          std::vector<uint32_t>&                    h_ff_values,
-                          std::vector<uint32_t>&                    h_ff_offset)
-{
-    assert(ef.size() == m_num_edges);
-    uint32_t                           total_ff_values = 0;
-    std::vector<std::vector<uint32_t>> h_ff_values_vec;
-    for (uint32_t f = 0; f < m_num_faces; ++f) {
-        std::vector<uint32_t> ff;
-        ff.reserve(3);
-        h_ff_values_vec.push_back(ff);
-    }
-    for (uint32_t e = 0; e < ef.size(); ++e) {
-        for (uint32_t f0 = 0; f0 < ef[e].size() - 1; ++f0) {
-            uint32_t face0 = ef[e][f0];
-            for (uint32_t f1 = f0 + 1; f1 < ef[e].size(); ++f1) {
-                uint32_t face1 = ef[e][f1];
-                total_ff_values += 2;
-                h_ff_values_vec[face0].push_back(face1);
-                h_ff_values_vec[face1].push_back(face0);
-            }
-        }
-    }
 
-    h_ff_offset.clear();
-    h_ff_offset.resize(m_num_faces);
-    for (uint32_t f = 0; f < m_num_faces; ++f) {
-        uint32_t s = 0;
-        if (f != 0) {
-            s = h_ff_offset[f - 1];
-        }
-        h_ff_offset[f] = s + h_ff_values_vec[f].size();
-    }
-    assert(h_ff_offset.back() == total_ff_values);
-    h_ff_values.clear();
-    h_ff_values.reserve(total_ff_values);
-    for (uint32_t f = 0; f < m_num_faces; ++f) {
-        for (uint32_t ff = 0; ff < h_ff_values_vec[f].size(); ff++) {
-            h_ff_values.push_back(h_ff_values_vec[f][ff]);
-        }
-    }
+    CUDA_ERROR(cudaMemcpy(m_d_edge_patch,
+                          m_edge_patch.data(),
+                          sizeof(uint32_t) * (m_num_edges),
+                          cudaMemcpyHostToDevice));
+    CUDA_ERROR(cudaMemcpy(m_d_vertex_patch,
+                          m_vertex_patch.data(),
+                          sizeof(uint32_t) * (m_num_vertices),
+                          cudaMemcpyHostToDevice));
 }
 
-void Patcher::parallel_execute(const std::vector<std::vector<uint32_t>>& ef)
+void Patcher::run_lloyd()
 {
-    // TODO use streams
-    // TODO we don't need ef. We only use it to compute FF which we already
-    // compute in RXMesh build_local method before invoking patcher.
-
-    // adjacent faces
-    uint32_t *d_ff_values(nullptr), *d_ff_offset(nullptr);
-    {
-        std::vector<uint32_t> h_ff_values, h_ff_offset;
-        populate_ff(ef, h_ff_values, h_ff_offset);
-        assert(h_ff_offset.size() == m_num_faces);
-        CUDA_ERROR(cudaMalloc((void**)&d_ff_values,
-                              h_ff_values.size() * sizeof(uint32_t)));
-        CUDA_ERROR(cudaMalloc((void**)&d_ff_offset,
-                              h_ff_offset.size() * sizeof(uint32_t)));
-
-        CUDA_ERROR(cudaMemcpy(d_ff_values, h_ff_values.data(),
-                              h_ff_values.size() * sizeof(uint32_t),
-                              cudaMemcpyHostToDevice));
-        CUDA_ERROR(cudaMemcpy(d_ff_offset, h_ff_offset.data(),
-                              h_ff_offset.size() * sizeof(uint32_t),
-                              cudaMemcpyHostToDevice));
-    }
-
-
-    // faces patch
-    uint32_t* d_face_patch = nullptr;
-    CUDA_ERROR(
-        cudaMalloc((void**)&d_face_patch, m_num_faces * sizeof(uint32_t)));
-
-    // seeds (allocate m_max_num_patches but copy only m_num_patches)
-    initialize_random_seeds();
-    uint32_t* d_seeds = nullptr;
-    assert(m_num_patches == m_seeds.size());
-    CUDA_ERROR(
-        cudaMalloc((void**)&d_seeds, m_max_num_patches * sizeof(uint32_t)));
-    CUDA_ERROR(cudaMemcpy(d_seeds, m_seeds.data(),
-                          m_num_patches * sizeof(uint32_t),
-                          cudaMemcpyHostToDevice));
-
-
-    // queue of size num_faces
-    // queue_start and queue_end
-    uint32_t* d_queue = nullptr;
-    CUDA_ERROR(cudaMalloc((void**)&d_queue, m_num_faces * sizeof(uint32_t)));
-
-    // 0 -> queue start
-    // 1-> queue end
-    // 2-> next queue end
     std::vector<uint32_t> h_queue_ptr{0, m_num_patches, m_num_patches};
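+    // h_queue_ptr layout: [0] queue start, [1] queue end, [2] next queue end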
-    uint32_t*             d_queue_ptr;
-    CUDA_ERROR(cudaMalloc((void**)&d_queue_ptr, 3 * sizeof(uint32_t)));
-    CUDA_ERROR(cudaMemcpy(d_queue_ptr, h_queue_ptr.data(), 3 * sizeof(uint32_t),
-                          cudaMemcpyHostToDevice));
 
-    // patches offset, values, and size
-    uint32_t *d_patches_offset, *d_patches_val, *d_patches_size,
-        *d_max_patch_size;
-    CUDA_ERROR(cudaMalloc((void**)&d_patches_offset,
-                          m_max_num_patches * sizeof(uint32_t)));
-    CUDA_ERROR(cudaMalloc((void**)&d_patches_size,
-                          m_max_num_patches * sizeof(uint32_t)));
-    CUDA_ERROR(
-        cudaMalloc((void**)&d_patches_val, m_num_faces * sizeof(uint32_t)));
-    CUDA_ERROR(cudaMalloc((void**)&d_max_patch_size, sizeof(uint32_t)));
-    void * d_cub_temp_storage_scan(nullptr), *d_cub_temp_storage_max(nullptr);
-    size_t cub_temp_storage_bytes_scan = 0;
-    size_t cub_temp_storage_bytes_max = 0;
-    ::cub::DeviceScan::InclusiveSum(d_cub_temp_storage_scan,
-                                    cub_temp_storage_bytes_scan, d_patches_size,
-                                    d_patches_offset, m_max_num_patches);
-    ::cub::DeviceReduce::Max(d_cub_temp_storage_max, cub_temp_storage_bytes_max,
-                             d_patches_size, d_max_patch_size,
-                             m_max_num_patches);
-    CUDA_ERROR(cudaMalloc((void**)&d_cub_temp_storage_scan,
-                          cub_temp_storage_bytes_scan));
-    CUDA_ERROR(cudaMalloc((void**)&d_cub_temp_storage_max,
-                          cub_temp_storage_bytes_max));
-
-    // Lloyd iterations loop
-    uint32_t* d_new_num_patches = nullptr;
-    CUDA_ERROR(cudaMalloc((void**)&d_new_num_patches, sizeof(uint32_t)));
-    CUDA_ERROR(cudaMemcpy(d_new_num_patches, &m_num_patches, sizeof(uint32_t),
-                          cudaMemcpyHostToDevice));
-
-    /* const char separator = ' ';
-    const int  numWidth = 15;
-    if (!m_quite) {
-        std::cout << std::endl;
-        std::cout << std::left << std::setw(numWidth) << std::setfill(separator)
-                  << "iter";
-        std::cout << std::left << std::setw(numWidth) << std::setfill(separator)
-                  << "#";
-        std::cout << std::left << std::setw(numWidth) << std::setfill(separator)
-                  << "avg";
-        std::cout << std::left << std::setw(numWidth) << std::setfill(separator)
-                  << "stddev";
-        std::cout << std::left << std::setw(numWidth) << std::setfill(separator)
-                  << "max";
-        std::cout << std::left << std::setw(numWidth) << std::setfill(separator)
-                  << "min" << std::endl
-                  << std::endl;
-    }*/
-
-    /*auto draw = [&]() {
-        CUDA_ERROR(cudaDeviceSynchronize());
-        CUDA_ERROR(cudaMemcpy(m_face_patch, d_face_patch,
-                              m_num_faces * sizeof(uint32_t),
-                              cudaMemcpyDeviceToHost));
-        for (uint32_t i = 0; i < m_num_faces; ++i) {
-            m_face_patch[i] = m_face_patch[i] >> 1;
-        }
-        export_patches(Verts);
-    };*/
-
-
-    CUDA_ERROR(cudaProfilerStart());
+    //CUDA_ERROR(cudaProfilerStart());
     GPUTimer timer;
     timer.start();
 
@@ -876,158 +587,101 @@ void Patcher::parallel_execute(const std::vector<std::vector<uint32_t>>& ef)
         ++m_num_lloyd_run;
 
         const uint32_t threads_s = 256;
-        const uint32_t blocks_s = DIVIDE_UP(m_num_patches, threads_s);
+        const uint32_t blocks_s  = DIVIDE_UP(m_num_patches, threads_s);
         const uint32_t threads_f = 256;
-        const uint32_t blocks_f = DIVIDE_UP(m_num_faces, threads_f);
+        const uint32_t blocks_f  = DIVIDE_UP(m_num_faces, threads_f);
 
         // add more seeds if needed
         if (m_num_lloyd_run % 5 == 0 && m_num_lloyd_run > 0) {
             uint32_t threshold = m_patch_size;
 
-            /*{
-            //add new seeds only to the top 10% large patches
-                CUDA_ERROR(cudaMemcpy(m_patches_offset.data(), d_patches_offset,
-                                      m_num_patches * sizeof(uint32_t),
-                                      cudaMemcpyDeviceToHost));
-                std::vector<uint32_t> sorted_patches(m_num_patches);
-                for (uint32_t p = 0; p < m_num_patches; ++p) {
-                    sorted_patches[p] = (p == 0) ? 0 : m_patches_offset[p - 1];
-                    sorted_patches[p] = m_patches_offset[p] - sorted_patches[p];
-                }
-                std::sort(sorted_patches.begin(), sorted_patches.end());
-                auto     dd = std::upper_bound(sorted_patches.begin(),
-                                           sorted_patches.end(), m_patch_size);
-                uint32_t large_patches_start = dd - sorted_patches.begin();
-                uint32_t large_patches_num =
-                    sorted_patches.size() - large_patches_start;
-                threshold = sorted_patches[sorted_patches.size() -
-                                           0.1 * large_patches_num] -
-                            1;
-            }*/
-
-            CUDA_ERROR(cudaMemcpy(d_new_num_patches, &m_num_patches,
-                                  sizeof(uint32_t), cudaMemcpyHostToDevice));
-            add_more_seeds<<<m_num_patches, 1>>>(
-                m_num_patches, d_new_num_patches, d_seeds, d_patches_offset,
-                d_patches_val, threshold);
-
-            CUDA_ERROR(cudaMemcpy(&m_num_patches, d_new_num_patches,
-                                  sizeof(uint32_t), cudaMemcpyDeviceToHost));
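+            // add_more_seeds launches one thread block per patch and plants a
+            // new seed inside any patch whose size exceeds the threshold, so
+            // oversized patches get split in later Lloyd iterations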
+            CUDA_ERROR(cudaMemcpy(m_d_new_num_patches,
+                                  &m_num_patches,
+                                  sizeof(uint32_t),
+                                  cudaMemcpyHostToDevice));
+            add_more_seeds<<<m_num_patches, 1>>>(m_num_patches,
+                                                 m_d_new_num_patches,
+                                                 m_d_seeds,
+                                                 m_d_patches_offset,
+                                                 m_d_patches_val,
+                                                 threshold);
+
+            CUDA_ERROR(cudaMemcpy(&m_num_patches,
+                                  m_d_new_num_patches,
+                                  sizeof(uint32_t),
+                                  cudaMemcpyDeviceToHost));
 
             if (m_num_patches >= m_max_num_patches) {
                 RXMESH_ERROR(
-                    "Patcher::parallel_execute() m_num_patches exceeds "
+                    "Patcher::run_lloyd() m_num_patches exceeds "
                     "m_max_num_patches");
             }
         }
         h_queue_ptr[0] = 0;
         h_queue_ptr[1] = m_num_patches;
         h_queue_ptr[2] = m_num_patches;
-        CUDA_ERROR(cudaMemcpy(d_queue_ptr, h_queue_ptr.data(),
-                              3 * sizeof(uint32_t), cudaMemcpyHostToDevice));
+        CUDA_ERROR(cudaMemcpy(m_d_queue_ptr,
+                              h_queue_ptr.data(),
+                              3 * sizeof(uint32_t),
+                              cudaMemcpyHostToDevice));
 
-        RXMESH::memset<<<blocks_f, threads_f>>>(d_face_patch, INVALID32,
-                                                m_num_faces);
+        rxmesh::memset<<<blocks_f, threads_f>>>(
+            m_d_face_patch, INVALID32, m_num_faces);
 
-        RXMESH::memcpy<<<blocks_s, threads_s>>>(d_queue, d_seeds,
-                                                m_num_patches);
+        rxmesh::memcpy<<<blocks_s, threads_s>>>(
+            m_d_queue, m_d_seeds, m_num_patches);
 
-        RXMESH::memset<<<blocks_s, threads_s>>>(d_patches_size, 0u,
-                                                m_num_patches);
+        rxmesh::memset<<<blocks_s, threads_s>>>(
+            m_d_patches_size, 0u, m_num_patches);
 
         write_initial_face_patch<<<blocks_s, threads_s>>>(
-            m_num_patches, d_face_patch, d_seeds, d_patches_size);
+            m_num_patches, m_d_face_patch, m_d_seeds, m_d_patches_size);
 
         // Cluster seed propagation
         while (true) {
             // Launch enough threads to cover all the faces. However, only
             // subset will do actual work depending on the queue size
-            cluster_seed_propagation<<<blocks_f, threads_f>>>(
-                m_num_faces, m_num_patches, d_queue_ptr, d_queue, d_face_patch,
-                d_patches_size, d_ff_offset, d_ff_values);
-
-            reset_queue_ptr<<<1, 1>>>(d_queue_ptr);
-
-            CUDA_ERROR(cudaMemcpy(h_queue_ptr.data(), d_queue_ptr,
-                                  sizeof(uint32_t), cudaMemcpyDeviceToHost));
+            cluster_seed_propagation<<<blocks_f, threads_f>>>(m_num_faces,
+                                                              m_num_patches,
+                                                              m_d_queue_ptr,
+                                                              m_d_queue,
+                                                              m_d_face_patch,
+                                                              m_d_patches_size,
+                                                              m_d_ff_offset,
+                                                              m_d_ff_values);
+
+            reset_queue_ptr<<<1, 1>>>(m_d_queue_ptr);
+
+            CUDA_ERROR(cudaMemcpy(h_queue_ptr.data(),
+                                  m_d_queue_ptr,
+                                  sizeof(uint32_t),
+                                  cudaMemcpyDeviceToHost));
 
             if (h_queue_ptr[0] >= m_num_faces) {
                 break;
             }
         }
 
-
-        uint32_t max_patch_size = construct_patches_compressed_parallel(
-            d_cub_temp_storage_max, cub_temp_storage_bytes_max, d_patches_size,
-            d_max_patch_size, d_cub_temp_storage_scan,
-            cub_temp_storage_bytes_scan, d_patches_offset, d_face_patch,
-            d_patches_val);
-
-        // draw();
-
-
-        /*uint32_t* d_second_queue;
-        {
-            CUDA_ERROR(cudaMalloc((void**)&d_second_queue,
-                                  m_num_faces * sizeof(uint32_t)));
-            CUDA_ERROR(
-                cudaMemset(d_second_queue, 0, m_num_faces * sizeof(uint32_t)));
-        }*/
+        uint32_t max_patch_size = construct_patches_compressed_format();
 
         // Interior
-        uint32_t threads_i = 512;
+        uint32_t threads_i   = 512;
         uint32_t shmem_bytes = max_patch_size * (sizeof(uint32_t));
-        RXMESH::memset<<<blocks_f, threads_f>>>(d_queue, INVALID32,
-                                                m_num_faces);
-        interior<<<m_num_patches, threads_i, shmem_bytes>>>(
-            m_num_patches, d_patches_offset, d_patches_val, d_face_patch,
-            d_seeds, d_ff_offset, d_ff_values, d_queue /*, d_second_queue*/);
-
-        /*{
-            std::vector<uint32_t> second_queue(m_num_faces);
-            CUDA_ERROR(cudaMemcpy(second_queue.data(), d_second_queue,
-                                  m_num_faces * sizeof(uint32_t),
-                                  cudaMemcpyDeviceToHost));
-            std::vector<uint32_t> face_list;
-            for (uint32_t i = 0; i < second_queue.size(); ++i) {
-                if (second_queue[i] == 1) {
-                    face_list.push_back(i);
-                }
-            }
-            export_face_list("second_queue.obj", m_fvn, Verts,
-                             uint32_t(face_list.size()), face_list.data());
-        }*/
-        /*{
-            printf("\n d_face_patch");
-            ::RXMESH::print_arr_host<<<1, 1>>>(m_num_faces, d_face_patch);
-            CUDA_ERROR(cudaDeviceSynchronize());
-        }*/
-
-        /* if (!m_quite) {
-            CUDA_ERROR(cudaDeviceSynchronize());
-            double   my_avg(0), my_stddev(0);
-            uint32_t my_max(0), my_min(0);
-            CUDA_ERROR(cudaMemcpy(m_patches_offset.data(), d_patches_offset,
-                                  m_num_patches * sizeof(uint32_t),
-                                  cudaMemcpyDeviceToHost));
-            compute_avg_stddev_max_min_rs(m_patches_offset.data(),
-                                          m_num_patches, my_avg, my_stddev,
-                                          my_max, my_min);
-            std::cout << std::left << std::setw(numWidth)
-                      << std::setfill(separator) << m_num_lloyd_run;
-            std::cout << std::left << std::setw(numWidth)
-                      << std::setfill(separator) << m_num_patches;
-            std::cout << std::left << std::setw(numWidth)
-                      << std::setfill(separator) << my_avg;
-            std::cout << std::left << std::setw(numWidth)
-                      << std::setfill(separator) << my_stddev;
-            std::cout << std::left << std::setw(numWidth)
-                      << std::setfill(separator) << my_max;
-            std::cout << std::left << std::setw(numWidth)
-                      << std::setfill(separator) << my_min << std::endl;
-        }*/
+        rxmesh::memset<<<blocks_f, threads_f>>>(
+            m_d_queue, INVALID32, m_num_faces);
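+        // Re-center each patch's seed: interior() floods inward from the
+        // patch's boundary faces and picks an interior face as the new seed
+        // (the Lloyd update step)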
+        interior<<<m_num_patches, threads_i, shmem_bytes>>>(m_num_patches,
+                                                            m_d_patches_offset,
+                                                            m_d_patches_val,
+                                                            m_d_face_patch,
+                                                            m_d_seeds,
+                                                            m_d_ff_offset,
+                                                            m_d_ff_values,
+                                                            m_d_queue);
 
         if (max_patch_size < m_patch_size) {
+            shift<<<blocks_f, threads_f>>>(
+                m_num_faces, m_d_face_patch, m_d_patches_val);
+
             break;
         }
     }
@@ -1037,105 +691,88 @@ void Patcher::parallel_execute(const std::vector<std::vector<uint32_t>>& ef)
     CUDA_ERROR(cudaDeviceSynchronize());
     CUDA_ERROR(cudaGetLastError());
     m_patching_time_ms = timer.elapsed_millis();
-    CUDA_ERROR(cudaProfilerStop());
+    //CUDA_ERROR(cudaProfilerStop());
 
 
     // move data to host
     m_num_seeds = m_num_patches;
     m_seeds.resize(m_num_seeds);
-    CUDA_ERROR(cudaMemcpy(m_seeds.data(), d_seeds,
+    CUDA_ERROR(cudaMemcpy(m_seeds.data(),
+                          m_d_seeds,
                           m_num_seeds * sizeof(uint32_t),
                           cudaMemcpyDeviceToHost));
-    CUDA_ERROR(cudaMemcpy(m_face_patch.data(), d_face_patch,
+    CUDA_ERROR(cudaMemcpy(m_face_patch.data(),
+                          m_d_face_patch,
                           sizeof(uint32_t) * m_num_faces,
                           cudaMemcpyDeviceToHost));
     m_patches_offset.resize(m_num_patches);
-    CUDA_ERROR(cudaMemcpy(m_patches_offset.data(), d_patches_offset,
+    CUDA_ERROR(cudaMemcpy(m_patches_offset.data(),
+                          m_d_patches_offset,
                           sizeof(uint32_t) * m_num_patches,
                           cudaMemcpyDeviceToHost));
-    CUDA_ERROR(cudaMemcpy(m_patches_val.data(), d_patches_val,
+    CUDA_ERROR(cudaMemcpy(m_patches_val.data(),
+                          m_d_patches_val,
                           sizeof(uint32_t) * m_num_faces,
                           cudaMemcpyDeviceToHost));
 
+    GPU_FREE(m_d_ff_values);
+    GPU_FREE(m_d_ff_offset);
 
-    // draw();
+    GPU_FREE(m_d_new_num_patches);
+    GPU_FREE(m_d_max_patch_size);
 
-    for (uint32_t i = 0; i < m_num_faces; ++i) {
-        m_face_patch[i] = m_face_patch[i] >> 1;
-        m_patches_val[i] = m_patches_val[i] >> 1;
-    }
+    GPU_FREE(m_d_cub_temp_storage_scan);
+    GPU_FREE(m_d_cub_temp_storage_max);
+    m_cub_max_bytes  = 0;
+    m_cub_scan_bytes = 0;
 
+    GPU_FREE(m_d_seeds);
+    GPU_FREE(m_d_queue);
+    GPU_FREE(m_d_queue_ptr);
 
-    GPU_FREE(d_ff_values);
-    GPU_FREE(d_ff_offset);
-    GPU_FREE(d_face_patch);
-    GPU_FREE(d_seeds);
-    GPU_FREE(d_queue);
-    GPU_FREE(d_patches_offset);
-    GPU_FREE(d_patches_size);
-    GPU_FREE(d_patches_val);
-    GPU_FREE(d_queue_ptr);
-    GPU_FREE(d_cub_temp_storage_scan);
-    GPU_FREE(d_cub_temp_storage_max);
-    GPU_FREE(d_max_patch_size);
-    GPU_FREE(d_new_num_patches);
+    GPU_FREE(m_d_patches_offset);
+    GPU_FREE(m_d_patches_size);
+    GPU_FREE(m_d_patches_val);
 }
 
-uint32_t Patcher::construct_patches_compressed_parallel(
-    void*     d_cub_temp_storage_max,
-    size_t    cub_temp_storage_bytes_max,
-    uint32_t* d_patches_size,
-    uint32_t* d_max_patch_size,
-    void*     d_cub_temp_storage_scan,
-    size_t    cub_temp_storage_bytes_scan,
-    uint32_t* d_patches_offset,
-    uint32_t* d_face_patch,
-    uint32_t* d_patches_val)
+uint32_t Patcher::construct_patches_compressed_format()
 {
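+    // "Compressed format": m_d_patches_offset holds the inclusive prefix sum
+    // of patch sizes, and m_d_patches_val lists the faces grouped by patch
+    // (with the boundary flag still stored in bit 0)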
     uint32_t       max_patch_size = 0;
-    const uint32_t threads_s = 256;
-    const uint32_t blocks_s = DIVIDE_UP(m_num_patches, threads_s);
-    const uint32_t threads_f = 256;
-    const uint32_t blocks_f = DIVIDE_UP(m_num_faces, threads_f);
+    const uint32_t threads_s      = 256;
+    const uint32_t blocks_s       = DIVIDE_UP(m_num_patches, threads_s);
+    const uint32_t threads_f      = 256;
+    const uint32_t blocks_f       = DIVIDE_UP(m_num_faces, threads_f);
 
     // Compute max patch size
     max_patch_size = 0;
-    ::cub::DeviceReduce::Max(d_cub_temp_storage_max, cub_temp_storage_bytes_max,
-                             d_patches_size, d_max_patch_size, m_num_patches);
-    CUDA_ERROR(cudaMemcpy(&max_patch_size, d_max_patch_size, sizeof(uint32_t),
+    ::cub::DeviceReduce::Max(m_d_cub_temp_storage_max,
+                             m_cub_max_bytes,
+                             m_d_patches_size,
+                             m_d_max_patch_size,
+                             m_num_patches);
+    CUDA_ERROR(cudaMemcpy(&max_patch_size,
+                          m_d_max_patch_size,
+                          sizeof(uint32_t),
                           cudaMemcpyDeviceToHost));
 
     // Construct compressed patches
-    ::cub::DeviceScan::InclusiveSum(d_cub_temp_storage_scan,
-                                    cub_temp_storage_bytes_scan, d_patches_size,
-                                    d_patches_offset, m_num_patches);
-    RXMESH::memset<<<blocks_s, threads_s>>>(d_patches_size, 0u, m_num_patches);
-
-    construct_patches_compressed<<<blocks_f, threads_f>>>(
-        m_num_faces, d_face_patch, m_num_patches, d_patches_offset,
-        d_patches_size, d_patches_val);
+    ::cub::DeviceScan::InclusiveSum(m_d_cub_temp_storage_scan,
+                                    m_cub_scan_bytes,
+                                    m_d_patches_size,
+                                    m_d_patches_offset,
+                                    m_num_patches);
+    rxmesh::memset<<<blocks_s, threads_s>>>(
+        m_d_patches_size, 0u, m_num_patches);
+
+    construct_patches_compressed<<<blocks_f, threads_f>>>(m_num_faces,
+                                                          m_d_face_patch,
+                                                          m_num_patches,
+                                                          m_d_patches_offset,
+                                                          m_d_patches_size,
+                                                          m_d_patches_val);
 
     return max_patch_size;
 }
-//**************************************************************************
-
-
-template void Patcher::export_single_patch(
-    const std::vector<std::vector<double>>& Verts,
-    int                                     patch_id);
-template void Patcher::export_single_patch(
-    const std::vector<std::vector<float>>& Verts,
-    int                                    patch_id);
-template void Patcher::export_patches(
-    const std::vector<std::vector<double>>& Verts);
-template void Patcher::export_patches(
-    const std::vector<std::vector<float>>& Verts);
-template void Patcher::export_ext_ribbon(
-    const std::vector<std::vector<double>>& Verts,
-    int                                     patch_id);
-template void Patcher::export_ext_ribbon(
-    const std::vector<std::vector<float>>& Verts,
-    int                                    patch_id);
-
-}  // namespace PATCHER
-}  // namespace RXMESH
\ No newline at end of file
+
+}  // namespace patcher
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/patcher/patcher.h b/include/rxmesh/patcher/patcher.h
index 028a8fb7..47ea1230 100644
--- a/include/rxmesh/patcher/patcher.h
+++ b/include/rxmesh/patcher/patcher.h
@@ -2,47 +2,37 @@
 
 #include <stdint.h>
 #include <functional>
-namespace RXMESH {
+#include <unordered_map>
+#include "rxmesh/util/util.h"
 
-namespace PATCHER {
+namespace rxmesh {
 
+namespace patcher {
+
+/**
+ * @brief Takes an input mesh and partitions it into patches using Lloyd's
+ * algorithm on the GPU
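+ *
+ * A minimal usage sketch (ff_offset/ff_values, fv, and edges_map are
+ * placeholders that the caller is assumed to have built already):
+ *   auto p = std::make_unique<patcher::Patcher>(
+ *       512, ff_offset, ff_values, fv, edges_map, num_vertices, num_edges,
+ *       true);
+ *   uint32_t num_patches = p->get_num_patches();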
+ */
 class Patcher
 {
    public:
+    Patcher() = default;
+
     Patcher(uint32_t                                  patch_size,
-            const std::vector<std::vector<uint32_t>>& fvn,
-            const uint32_t                            num_vertices,
-            const uint32_t                            num_edges,
-            const bool                                is_multi_component = true,
-            const bool                                quite = true);
-
-    void execute(std::function<uint32_t(uint32_t, uint32_t)> get_edge_id,
-                 const std::vector<std::vector<uint32_t>>&   ef);
-
-    template <class T_d>
-    void export_patches(const std::vector<std::vector<T_d>>& Verts);
-
-    template <class T_d>
-    void export_components(
-        const std::vector<std::vector<T_d>>&      Verts,
-        const std::vector<std::vector<uint32_t>>& components);
-
-    template <class T_d>
-    void export_ext_ribbon(const std::vector<std::vector<T_d>>& Verts,
-                           int                                  patch_id);
-
-    template <class T_d>
-    void export_single_patch(const std::vector<std::vector<T_d>>& Verts,
-                             int                                  patch_id);
-
-    template <class T_d, typename EdgeIDFunc>
-    void export_single_patch_edges(const std::vector<std::vector<T_d>>& Verts,
-                                   int        patch_id,
-                                   EdgeIDFunc get_edge_id);
-    void print_statistics();
+            const std::vector<uint32_t>&              ff_offset,
+            const std::vector<uint32_t>&              ff_values,
+            const std::vector<std::vector<uint32_t>>& fv,
+            const std::unordered_map<std::pair<uint32_t, uint32_t>,
+                                     uint32_t,
+                                     ::rxmesh::detail::edge_key_hash> edges_map,
+            const uint32_t num_vertices,
+            const uint32_t num_edges,
+            const bool     quite);
+
+    virtual ~Patcher();
 
+    void print_statistics();
 
-    //********************** Getter
     uint32_t get_num_patches() const
     {
         return m_num_patches;
@@ -58,6 +48,21 @@ class Patcher
         return m_face_patch;
     }
 
+    uint32_t* get_device_face_patch()
+    {
+        return m_d_face_patch;
+    }
+
+    uint32_t* get_device_vertex_patch()
+    {
+        return m_d_vertex_patch;
+    }
+
+    uint32_t* get_device_edge_patch()
+    {
+        return m_d_edge_patch;
+    }
+
     std::vector<uint32_t>& get_vertex_patch()
     {
         return m_vertex_patch;
@@ -78,16 +83,6 @@ class Patcher
         return m_patches_offset.data();
     }
 
-    uint32_t* get_neighbour_patches()
-    {
-        return m_neighbour_patches.data();
-    }
-
-    uint32_t* get_neighbour_patches_offset()
-    {
-        return m_neighbour_patches_offset.data();
-    }
-
     std::vector<uint32_t>& get_external_ribbon_val()
     {
         return m_ribbon_ext_val;
@@ -154,75 +149,88 @@ class Patcher
     {
         return m_num_lloyd_run;
     }
-    //**************************************************************************
-
-
-    ~Patcher();
 
    private:
-    void mem_alloc();
-
-    void assign_patch(std::function<uint32_t(uint32_t, uint32_t)> get_edge_id);
-
-    void initialize_cluster_seeds();
-    void initialize_random_seeds();
-    void get_multi_components(std::vector<std::vector<uint32_t>>& components);
+    /**
+     * @brief Allocate the auxiliary memory needed to store patch info on
+     * the host
+     */
+    void allocate_memory();
+
+    /**
+     * @brief Allocate the temporary device memory needed to compute the
+     * patches
+     * @param ff_offset offsets indicating where face-incident faces for each
+     * face start (and end) in ff_values
+     * @param ff_values stores face-incident faces in compressed format
+     */
+    void allocate_device_memory(const std::vector<uint32_t>& ff_offset,
+                                const std::vector<uint32_t>& ff_values);
+
+    void assign_patch(
+        const std::vector<std::vector<uint32_t>>&                 fv,
+        const std::unordered_map<std::pair<uint32_t, uint32_t>,
+                                 uint32_t,
+                                 ::rxmesh::detail::edge_key_hash> edges_map);
+
+    void initialize_random_seeds(const std::vector<uint32_t>& ff_offset,
+                                 const std::vector<uint32_t>& ff_values);
+
+    void get_multi_components(std::vector<std::vector<uint32_t>>& components,
+                              const std::vector<uint32_t>&        ff_offset,
+                              const std::vector<uint32_t>&        ff_values);
 
     void initialize_random_seeds_single_component();
     void generate_random_seed_from_component(std::vector<uint32_t>& component,
                                              uint32_t               num_seeds);
 
-    void postprocess();
-    void get_adjacent_faces(uint32_t face_id, std::vector<uint32_t>& ff) const;
-    void get_incident_vertices(uint32_t face_id, std::vector<uint32_t>& fv);
-
-    void     populate_ff(const std::vector<std::vector<uint32_t>>& ef,
-                         std::vector<uint32_t>&                    h_ff_values,
-                         std::vector<uint32_t>&                    h_ff_offset);
-    uint32_t construct_patches_compressed_parallel(
-        void*     d_cub_temp_storage_max,
-        size_t    cub_temp_storage_bytes_max,
-        uint32_t* d_patches_size,
-        uint32_t* d_max_patch_size,
-        void*     d_cub_temp_storage_scan,
-        size_t    cub_temp_storage_bytes_scan,
-        uint32_t* d_patches_offset,
-        uint32_t* d_face_patch,
-        uint32_t* d_patches_val);
-    void parallel_execute(const std::vector<std::vector<uint32_t>>& ef);
-    //********
-
-    const std::vector<std::vector<uint32_t>>& m_fvn;
-
-    uint32_t m_patch_size;
-    uint32_t m_num_patches, m_num_vertices, m_num_edges, m_num_faces,
-        m_num_seeds, m_max_num_patches;
+    void postprocess(const std::vector<std::vector<uint32_t>>& fv,
+                     const std::vector<uint32_t>&              ff_offset,
+                     const std::vector<uint32_t>&              ff_values);
+
+    uint32_t construct_patches_compressed_format();
+
+    void run_lloyd();
+
+
+    uint32_t m_patch_size, m_num_patches, m_num_vertices, m_num_edges,
+        m_num_faces, m_num_seeds, m_max_num_patches, m_num_components,
+        m_num_lloyd_run;
 
     // store the face, vertex, edge patch
     std::vector<uint32_t> m_face_patch, m_vertex_patch, m_edge_patch;
+    uint32_t *            m_d_face_patch, *m_d_vertex_patch, *m_d_edge_patch;
 
-    bool m_is_multi_component;
-    bool m_quite;
-
-    uint32_t m_num_components;
 
     // Stores the patches in compressed format
     std::vector<uint32_t> m_patches_val, m_patches_offset;
 
-    //Stores ribbon in compressed format 
-    std::vector<uint32_t> m_ribbon_ext_val, m_ribbon_ext_offset;
+    // deallocated immediately after computing patches
+    uint32_t *m_d_patches_offset, *m_d_patches_size, *m_d_patches_val;
 
-    //Stores neighbour patches in compressed format 
-    std::vector<uint32_t> m_neighbour_patches, m_neighbour_patches_offset;
+    // Stores ribbon in compressed format
+    std::vector<uint32_t> m_ribbon_ext_val, m_ribbon_ext_offset;
 
     // caching the time taken to construct the patches
     float m_patching_time_ms;
 
-    // utility vectors
-    std::vector<uint32_t> m_frontier, m_tf, m_seeds;
-    uint32_t              m_num_lloyd_run = 0;
-    //********
+    std::vector<uint32_t> m_seeds;
+
+    // (deallocated immediately after computing patches)
+    uint32_t* m_d_seeds;
+
+    // stores face-face (ff) adjacency on the device (deallocated immediately
+    // after computing patches)
+    uint32_t *m_d_ff_values, *m_d_ff_offset;
+
+    // utilities used while creating patches (deallocated immediately after
+    // computing patches)
+    uint32_t *m_d_queue, *m_d_queue_ptr, *m_d_new_num_patches,
+        *m_d_max_patch_size;
+
+    // CUB temp memory (deallocated immediately after computing patches)
+    void * m_d_cub_temp_storage_scan, *m_d_cub_temp_storage_max;
+    size_t m_cub_scan_bytes, m_cub_max_bytes;
 };
 
-}  // namespace PATCHER
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace patcher
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/patcher/patcher_kernel.cuh b/include/rxmesh/patcher/patcher_kernel.cuh
index 479e7882..13aa8fca 100644
--- a/include/rxmesh/patcher/patcher_kernel.cuh
+++ b/include/rxmesh/patcher/patcher_kernel.cuh
@@ -1,10 +1,21 @@
 #pragma once
 
 #include "rxmesh/kernels/collective.cuh"
-namespace RXMESH {
+namespace rxmesh {
 
-namespace PATCHER {
+namespace patcher {
 
+__global__ static void shift(const uint32_t num_faces,
+                             uint32_t*      face_patch,                             
+                             uint32_t*      patches_val)
+{
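+    // The least significant bit of face_patch and patches_val encodes the
+    // "is boundary face" flag; shift it out so they hold plain patch and face
+    // ids, respectively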
+    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+    while (tid < num_faces) {
+        face_patch[tid] = face_patch[tid] >> 1;
+        patches_val[tid] = patches_val[tid] >> 1;
+        tid += blockDim.x * gridDim.x;
+    }
+}
 
 __device__ __forceinline__ const uint32_t* get_face_faces(
     const uint32_t* d_ff_offset,
@@ -29,7 +40,7 @@ __global__ static void write_initial_face_patch(const uint32_t num_seeds,
 {
     uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
     while (tid < num_seeds) {
-        uint32_t seed = d_seeds[tid];
+        uint32_t seed      = d_seeds[tid];
         d_face_patch[seed] = tid << 1;
         assert(d_patches_size[tid] == 0);
         d_patches_size[tid] = 1;
@@ -57,12 +68,12 @@ __global__ static void cluster_seed_propagation(const uint32_t  num_faces,
     // first bit in d_face_patch is reserved for 'is boundary face'
     uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
 
-    uint32_t current_queue_end = d_queue_ptr[1];
+    uint32_t current_queue_end   = d_queue_ptr[1];
     uint32_t current_queue_start = d_queue_ptr[0];
     while (tid >= current_queue_start && tid < current_queue_end) {
-        uint32_t        face_id = d_queue[tid];
+        uint32_t        face_id    = d_queue[tid];
         uint32_t        face_patch = d_face_patch[face_id] >> 1;
-        uint32_t        ff_len = 0;
+        uint32_t        ff_len     = 0;
         const uint32_t* ff_ptr =
             get_face_faces(d_ff_offset, d_ff_values, face_id, ff_len);
 
@@ -71,8 +82,8 @@ __global__ static void cluster_seed_propagation(const uint32_t  num_faces,
         for (uint32_t i = 0; i < ff_len; i++) {
             uint32_t n_face = ff_ptr[i];
 
-            uint32_t assumed = ::atomicCAS(&d_face_patch[n_face], INVALID32,
-                                           (face_patch << 1));
+            uint32_t assumed = ::atomicCAS(
+                &d_face_patch[n_face], INVALID32, (face_patch << 1));
             assert((assumed >> 1) < num_patches || assumed == INVALID32);
 
             if (assumed == INVALID32) {
@@ -90,8 +101,8 @@ __global__ static void cluster_seed_propagation(const uint32_t  num_faces,
             }
         }
 
-        face_patch = face_patch << 1;
-        face_patch = face_patch | is_boundary;
+        face_patch            = face_patch << 1;
+        face_patch            = face_patch | is_boundary;
         d_face_patch[face_id] = face_patch;
 
         tid += blockDim.x * gridDim.x;
@@ -108,15 +119,15 @@ __global__ static void construct_patches_compressed(
 {
     uint32_t face = threadIdx.x + blockIdx.x * blockDim.x;
     while (face < num_faces) {
-        uint32_t patch_id = d_face_patch[face];
+        uint32_t patch_id    = d_face_patch[face];
         uint32_t is_boundary = patch_id & 1;
-        patch_id = patch_id >> 1;
+        patch_id             = patch_id >> 1;
         uint32_t pos = ::atomicAdd(&d_patches_size[patch_id], uint32_t(1));
         if (patch_id != 0) {
             pos += d_patches_offset[patch_id - 1];
         }
         uint32_t res = face << 1;
-        res = res | is_boundary;
+        res          = res | is_boundary;
         assert(pos < num_faces);
         assert(face < ((num_faces << 1) | 1));
         d_patches_val[pos] = res;
@@ -148,9 +159,9 @@ __global__ static void interior(const uint32_t  num_patches,
         const uint32_t patch_id = blockIdx.x;
         const uint32_t p_start =
             (patch_id == 0) ? 0 : d_patches_offset[patch_id - 1];
-        const uint32_t p_end = d_patches_offset[patch_id];
+        const uint32_t p_end  = d_patches_offset[patch_id];
         const uint32_t p_size = p_end - p_start;
-        uint32_t       tid = threadIdx.x;
+        uint32_t       tid    = threadIdx.x;
 
         extern __shared__ uint32_t s_queue[];
 
@@ -167,7 +178,7 @@ __global__ static void interior(const uint32_t  num_patches,
                 /*if (blockIdx.x == 1) {
                     d_second_queue[face >> 1] = 1;
                 }*/
-                s_queue[pos] = face;
+                s_queue[pos]  = face;
                 d_queue[face] = 0;
             }
             tid += blockDim.x;
@@ -177,14 +188,14 @@ __global__ static void interior(const uint32_t  num_patches,
         // if there is no boundary, it means that the patch is a single
         // component. Pick any face as a seed, nobody cares!
         if (s_queue_size > 0) {
-            uint32_t queue_end = 0;
+            uint32_t queue_end   = 0;
             uint32_t queue_start = 0;
 
             while (true) {
                 // loop++;
 
                 queue_start = queue_end;
-                queue_end = s_queue_size;
+                queue_end   = s_queue_size;
 
                 /*if (threadIdx.x == 0  && patch_id == 0) {
                     printf(
@@ -202,15 +213,15 @@ __global__ static void interior(const uint32_t  num_patches,
 
                 tid = threadIdx.x;
                 while (tid < queue_end - queue_start) {
-                    uint32_t        face = s_queue[tid + queue_start];
+                    uint32_t        face   = s_queue[tid + queue_start];
                     uint32_t        ff_len = 0;
                     const uint32_t* ff_ptr =
                         get_face_faces(d_ff_offset, d_ff_values, face, ff_len);
                     for (uint32_t i = 0; i < ff_len; ++i) {
                         uint32_t n_face = ff_ptr[i];
                         if (d_face_patch[n_face] >> 1 == patch_id) {
-                            uint32_t assumed = ::atomicCAS(d_queue + n_face,
-                                                           INVALID32, patch_id);
+                            uint32_t assumed = ::atomicCAS(
+                                d_queue + n_face, INVALID32, patch_id);
                             if (assumed == INVALID32) {
                                 uint32_t pos =
                                     ::atomicAdd(&s_queue_size, uint32_t(1));
@@ -253,7 +264,7 @@ __global__ static void add_more_seeds(const uint32_t  num_patches,
         uint32_t       patch_id = blockIdx.x;
         const uint32_t p_start =
             (patch_id == 0) ? 0 : d_patches_offset[patch_id - 1];
-        const uint32_t p_end = d_patches_offset[patch_id];
+        const uint32_t p_end  = d_patches_offset[patch_id];
         const uint32_t p_size = p_end - p_start;
 
         if (p_size > threshold) {
@@ -275,5 +286,5 @@ __global__ static void add_more_seeds(const uint32_t  num_patches,
         }
     }
 }
-}  // namespace PATCHER
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace patcher
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/reduce_handle.h b/include/rxmesh/reduce_handle.h
new file mode 100644
index 00000000..a44bb9ce
--- /dev/null
+++ b/include/rxmesh/reduce_handle.h
@@ -0,0 +1,140 @@
+#pragma once
+
+#include "rxmesh/attribute.h"
+#include "rxmesh/kernels/attribute.cuh"
+
+namespace rxmesh {
+
+/**
+ * @brief Computes reduction operations (e.g., dot product and L2 norm) on an
+ * Attribute. To create a new ReduceHandle, use create_reduce_handle()
+ * from Attribute
+ * @tparam T The type of the attribute
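+ *
+ * A minimal usage sketch (assumes a device-allocated Attribute<float> attr
+ * and a second attribute attr2 with the same layout):
+ *   ReduceHandle<float> rh(attr);
+ *   float d = rh.dot(attr, attr2);  // dot product, returned on the host
+ *   float n = rh.norm2(attr);       // L2 norm, returned on the host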
+ */
+template <typename T>
+class ReduceHandle
+{
+
+   public:
+    ReduceHandle()                    = default;
+    ReduceHandle(const ReduceHandle&) = default;
+
+    /**
+     * @brief Constructor which allocates internal memory used in all reduce
+     * operations
+     * @param attr an Attribute used for the subsequent reduction
+     * operations
+     */
+    ReduceHandle(const Attribute<T>& attr) : m_num_patches(attr.m_num_patches)
+    {
+        CUDA_ERROR(
+            cudaMalloc(&m_d_reduce_1st_stage, m_num_patches * sizeof(T)));
+
+        CUDA_ERROR(cudaMalloc(&m_d_reduce_2nd_stage, sizeof(T)));
+
+        m_d_reduce_temp_storage = NULL;
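+        // Calling CUB with a null temp-storage pointer only queries the
+        // required temp-storage size; the actual allocation follows below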
+        cub::DeviceReduce::Sum(m_d_reduce_temp_storage,
+                               m_reduce_temp_storage_bytes,
+                               m_d_reduce_1st_stage,
+                               m_d_reduce_2nd_stage,
+                               m_num_patches);
+
+        CUDA_ERROR(
+            cudaMalloc(&m_d_reduce_temp_storage, m_reduce_temp_storage_bytes));
+    }
+
+    ~ReduceHandle()
+    {
+        GPU_FREE(m_d_reduce_1st_stage);
+        GPU_FREE(m_d_reduce_2nd_stage);
+        GPU_FREE(m_d_reduce_temp_storage);
+        m_reduce_temp_storage_bytes = 0;
+    }
+
+    /**
+     * @brief compute dot product between two input attributes and return the
+     * output on the host
+     * @param attr1 first input attribute
+     * @param attr2 second input attribute
+     * @param stream stream to run the computation on
+     * @return the output of dot product on the host
+     */
+    T dot(const Attribute<T>& attr1,
+          const Attribute<T>& attr2,
+          cudaStream_t        stream = NULL)
+    {
+        if ((attr1.get_allocated() & DEVICE) != DEVICE ||
+            (attr2.get_allocated() & DEVICE) != DEVICE) {
+            RXMESH_ERROR(
+                "ReduceHandle::dot() input attributes to should be "
+                "allocated on the device");
+        }
+
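+        // First reduction stage: one thread block per patch computes a
+        // partial dot product and writes it to m_d_reduce_1st_stage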
+        detail::dot_kernel<T, attr1.m_block_size>
+            <<<m_num_patches, attr1.m_block_size, 0, stream>>>(
+                attr1,
+                attr2,
+                attr1.m_d_element_per_patch,
+                m_num_patches,
+                attr1.get_num_attributes(),
+                m_d_reduce_1st_stage);
+
+        return reduce_2nd_stage(stream);
+    }
+
+    /**
+     * @brief compute the L2 norm of an input attribute and return the output
+     * on the host
+     * @param attr input attribute
+     * @param stream stream to run the computation on
+     * @return the output of L2 norm on the host
+     */
+    T norm2(const Attribute<T>& attr, cudaStream_t stream = NULL)
+    {
+        if ((attr.get_allocated() & DEVICE) != DEVICE) {
+            RXMESH_ERROR(
+                "ReduceHandle::norm2() input attribute to should be "
+                "allocated on the device");
+        }
+
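+        // First reduction stage: one thread block per patch computes a
+        // partial sum of squares and writes it to m_d_reduce_1st_stage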
+        detail::norm2_kernel<T, attr.m_block_size>
+            <<<m_num_patches, attr.m_block_size, 0, stream>>>(
+                attr,
+                attr.m_d_element_per_patch,
+                m_num_patches,
+                attr.get_num_attributes(),
+                m_d_reduce_1st_stage);
+
+        return std::sqrt(reduce_2nd_stage(stream));
+    }
+
+
+   private:
+    T reduce_2nd_stage(cudaStream_t stream)
+    {
+        T h_output = 0;
+
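+        // Second reduction stage: sum the per-patch partial results on the
+        // device, then copy the single value back to the host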
+        cub::DeviceReduce::Sum(m_d_reduce_temp_storage,
+                               m_reduce_temp_storage_bytes,
+                               m_d_reduce_1st_stage,
+                               m_d_reduce_2nd_stage,
+                               m_num_patches,
+                               stream);
+
+        CUDA_ERROR(cudaMemcpyAsync(&h_output,
+                                   m_d_reduce_2nd_stage,
+                                   sizeof(T),
+                                   cudaMemcpyDeviceToHost,
+                                   stream));
+        CUDA_ERROR(cudaStreamSynchronize(stream));
+
+        return h_output;
+    }
+
+    size_t   m_reduce_temp_storage_bytes;
+    T*       m_d_reduce_1st_stage;
+    T*       m_d_reduce_2nd_stage;
+    void*    m_d_reduce_temp_storage;
+    uint32_t m_num_patches;
+};
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/rxmesh.cpp b/include/rxmesh/rxmesh.cpp
index a9176497..a9c28b07 100644
--- a/include/rxmesh/rxmesh.cpp
+++ b/include/rxmesh/rxmesh.cpp
@@ -1,1212 +1,709 @@
-#include "rxmesh.h"
-
 #include <assert.h>
-#include <exception>
+#include <omp.h>
+#include <iostream>
 #include <memory>
+#include <numeric>
 #include <queue>
+
 #include "patcher/patcher.h"
-#include "rxmesh/rxmesh_context.h"
-#include "rxmesh/util/export_tools.h"
-#include "rxmesh/util/math.h"
-
-namespace RXMESH {
-// extern std::vector<std::vector<RXMESH::float>> Verts; // TODO remove this
-
-//********************** Constructors/Destructors
-template <uint32_t patchSize>
-RXMesh<patchSize>::RXMesh(std::vector<std::vector<uint32_t>>& fv,
-                          std::vector<std::vector<coordT>>&   coordinates,
-                          const bool                          sort /*= false*/,
-                          const bool                          quite /*= true*/)
-    : m_num_edges(0), m_num_faces(0), m_num_vertices(0), m_max_ele_count(0),
-      m_max_valence(0), m_max_valence_vertex_id(INVALID32),
-      m_max_edge_incident_faces(0), m_max_face_adjacent_faces(0),
-      m_face_degree(3), m_num_patches(0), m_is_input_edge_manifold(true),
-      m_is_input_closed(true), m_is_sort(sort), m_quite(quite),
-      m_max_vertices_per_patch(0), m_max_edges_per_patch(0),
-      m_max_faces_per_patch(0), m_d_patches_ltog_v(nullptr),
-      m_d_patches_ltog_e(nullptr), m_d_patches_ltog_f(nullptr),
-      m_d_ad_size_ltog_v(nullptr), m_d_ad_size_ltog_e(nullptr),
-      m_d_ad_size_ltog_f(nullptr), m_d_patches_edges(nullptr),
-      m_d_patches_faces(nullptr), m_d_patch_distribution_v(nullptr),
-      m_d_patch_distribution_e(nullptr), m_d_patch_distribution_f(nullptr),
-      m_d_ad_size(nullptr), m_d_neighbour_patches(nullptr),
-      m_d_neighbour_patches_offset(nullptr)
+#include "rxmesh/context.h"
+#include "rxmesh/rxmesh.h"
+#include "rxmesh/util/util.h"
+
+namespace rxmesh {
+RXMesh::RXMesh(const std::vector<std::vector<uint32_t>>& fv, const bool quite)
+    : m_num_edges(0),
+      m_num_faces(0),
+      m_num_vertices(0),
+      m_max_valence(0),
+      m_max_edge_incident_faces(0),
+      m_max_face_adjacent_faces(0),
+      m_max_vertices_per_patch(0),
+      m_max_edges_per_patch(0),
+      m_max_faces_per_patch(0),
+      m_max_not_owned_vertices(0),
+      m_max_not_owned_edges(0),
+      m_max_not_owned_faces(0),
+      m_num_patches(0),
+      m_patch_size(512),
+      m_is_input_edge_manifold(true),
+      m_is_input_closed(true),
+      m_quite(quite),
+      m_d_patches_info(nullptr),
+      m_h_patches_info(nullptr)
 {
     // Build everything from scratch including patches
-    build_local(fv, coordinates);
-    device_alloc_local();
-}
-
-template <uint32_t patchSize>
-RXMesh<patchSize>::~RXMesh()
-{
-    GPU_FREE(m_d_patches_ltog_v);
-    GPU_FREE(m_d_patches_ltog_e);
-    GPU_FREE(m_d_patches_ltog_f);
-    GPU_FREE(m_d_patches_edges);
-    GPU_FREE(m_d_patches_faces);
-    GPU_FREE(m_d_ad_size_ltog_v);
-    GPU_FREE(m_d_ad_size_ltog_e);
-    GPU_FREE(m_d_ad_size_ltog_f);
-    GPU_FREE(m_d_ad_size);
-    GPU_FREE(m_d_patch_distribution_v);
-    GPU_FREE(m_d_patch_distribution_e);
-    GPU_FREE(m_d_patch_distribution_f);
-    GPU_FREE(m_d_vertex_patch);
-    GPU_FREE(m_d_edge_patch);
-    GPU_FREE(m_d_face_patch);
-    GPU_FREE(m_d_neighbour_patches);
-    GPU_FREE(m_d_neighbour_patches_offset);
-};
-//**************************************************************************
-
-
-//********************** Builders
-template <uint32_t patchSize>
-void RXMesh<patchSize>::build_local(
-    std::vector<std::vector<uint32_t>>& fv,
-    std::vector<std::vector<coordT>>&   coordinates)
-{
-    // we build everything here from scratch
-    // 1) set num vertices
-    // 2) populate edge_map
-    // 3) for each edge, store a list of faces that are incident to that edge
-    // 4) copy fv to m_fvn and append the adjacent faces for each face using
-    // info from 3)
-    // 5) patch the mesh
-    // 6) populate the local mesh
-
-    //=========== 1)
-    m_num_faces = static_cast<uint32_t>(fv.size());
-    set_num_vertices(fv);
-    //===============================
-
-
-    //=========== 2)
-    populate_edge_map(fv);
-    m_num_edges = static_cast<uint32_t>(m_edges_map.size());
-    //===============================
-
-
-    //=========== 3)
-    std::vector<std::vector<uint32_t>> ef;
-    edge_incident_faces(fv, ef);
-    // caching mesh type; edge manifold, closed
-    for (uint32_t e = 0; e < ef.size(); ++e) {
-        if (ef[e].size() < 2) {
-            m_is_input_closed = false;
-        }
-        if (ef[e].size() > 2) {
-            m_is_input_edge_manifold = false;
-        }
-    }
-    //===============================
-
-
-    //=========== 4)
-    // copy fv
-    std::vector<std::vector<uint32_t>> rep(fv);
-    rep.swap(m_fvn);
-    // extend m_fvn by adding the face neighbors
-    for (uint32_t e = 0; e < ef.size(); ++e) {
-        assert(ef[e].size() != 0);  // we don't handle dangling edges
-
-        for (uint32_t f = 0; f < ef[e].size(); ++f) {
-            uint32_t f0 = ef[e][f];
-            for (uint32_t s = f + 1; s < ef[e].size(); ++s) {
-                uint32_t f1 = ef[e][s];
-                m_fvn[f0].push_back(f1);
-                m_fvn[f1].push_back(f0);
-            }
-        }
-    }
-    //===============================
-
-
-    //=========== 5)
-    // create an instance of Patcher and execute it and then move the
-    // ownership to m_patcher
-    std::unique_ptr<PATCHER::Patcher> pp = std::make_unique<PATCHER::Patcher>(
-        patchSize, m_fvn, m_num_vertices, m_num_edges, true, m_quite);
-    pp->execute(
-        [this](uint32_t v0, uint32_t v1) { return this->get_edge_id(v0, v1); },
-        ef);
-
-    m_patcher = std::move(pp);
-    m_num_patches = m_patcher->get_num_patches();
-    // m_patcher->export_patches(Verts);
-    //===============================
-
-    //=========== 5.5)
-    // sort indices based on patches
-    if (m_is_sort) {
-        sort(fv, coordinates);
-    }
-    //===============================
-
-    //=========== 6)
-    m_max_size.x = m_max_size.y = 0;
-    m_h_owned_size.resize(m_num_patches);
-    for (uint32_t p = 0; p < m_num_patches; ++p) {
-        build_patch_locally(p);
-        m_max_size.x = static_cast<unsigned int>(
-            std::max(size_t(m_max_size.x), m_h_patches_edges[p].size()));
-        m_max_size.y = static_cast<unsigned int>(
-            std::max(size_t(m_max_size.y), m_h_patches_faces[p].size()));
-    }
-
-    m_max_size.x = round_up_multiple(m_max_size.x, 32u);
-    m_max_size.y = round_up_multiple(m_max_size.y, 32u);
-
-    m_max_vertices_per_patch = 0;
-    m_max_edges_per_patch = 0;
-    m_max_faces_per_patch = 0;
-    m_max_owned_vertices_per_patch = 0;
-    m_max_owned_edges_per_patch = 0;
-    m_max_owned_faces_per_patch = 0;
-    for (uint32_t p = 0; p < m_num_patches; ++p) {
-        m_max_vertices_per_patch = std::max(
-            m_max_vertices_per_patch, uint32_t(m_h_patches_ltog_v[p].size()));
-        m_max_edges_per_patch = std::max(
-            m_max_edges_per_patch, uint32_t(m_h_patches_ltog_e[p].size()));
-        m_max_faces_per_patch = std::max(
-            m_max_faces_per_patch, uint32_t(m_h_patches_ltog_f[p].size()));
-
-        m_max_owned_faces_per_patch =
-            std::max(m_max_owned_faces_per_patch, m_h_owned_size[p].x);
-        m_max_owned_edges_per_patch =
-            std::max(m_max_owned_edges_per_patch, m_h_owned_size[p].y);
-        m_max_owned_vertices_per_patch =
-            std::max(m_max_owned_vertices_per_patch, m_h_owned_size[p].z);
-    }
-
-    // scanned histogram of element count in patches
-    m_h_patch_distribution_v.resize(m_num_patches + 1, 0);
-    m_h_patch_distribution_e.resize(m_num_patches + 1, 0);
-    m_h_patch_distribution_f.resize(m_num_patches + 1, 0);
-
-    for (uint32_t v = 0; v < m_num_vertices; ++v) {
-        uint32_t patch = m_patcher->get_vertex_patch_id(v);
-        if (patch != INVALID32) {
-            m_h_patch_distribution_v[patch]++;
-        }
-    }
-    for (uint32_t f = 0; f < m_num_faces; ++f) {
-        uint32_t patch = m_patcher->get_face_patch_id(f);
-        if (patch != INVALID32) {
-            m_h_patch_distribution_f[patch]++;
-        }
-    }
-    for (uint32_t e = 0; e < m_num_edges; ++e) {
-        uint32_t patch = m_patcher->get_edge_patch_id(e);
-        if (patch != INVALID32) {
-            m_h_patch_distribution_e[patch]++;
-        }
+    if (fv.empty()) {
+        RXMESH_ERROR(
+            "RXMesh::RXMesh input fv is empty. Can not be build RXMesh "
+            "properly");
     }
-    auto ex_scan = [](std::vector<uint32_t>& vv) {
-        uint32_t dd = 0;
-        for (uint32_t i = 1; i < vv.size(); ++i) {
-            uint32_t temp = vv[i];
-            vv[i] = dd + vv[i - 1];
-            dd = temp;
-        }
-        vv[0] = 0;
-    };
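+    // Build the patched mesh on the host, set up the per-patch info on the
+    // device, then compute the maximum number of not-owned elements per patch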
+    build(fv);
+    build_device();
+    calc_max_not_owned_elements();
 
-    ex_scan(m_h_patch_distribution_v);
-    ex_scan(m_h_patch_distribution_e);
-    ex_scan(m_h_patch_distribution_f);
+    // Allocate and copy the context to the gpu
+    m_rxmesh_context.init(m_num_edges,
+                          m_num_faces,
+                          m_num_vertices,
+                          m_num_patches,
+                          m_d_patches_info);
 
     if (!m_quite) {
-        RXMESH_TRACE("#Vertices = {}, #Faces= {}, #Edges= {}", m_num_vertices,
-                     m_num_faces, m_num_edges);
+        RXMESH_TRACE("#Vertices = {}, #Faces= {}, #Edges= {}",
+                     m_num_vertices,
+                     m_num_faces,
+                     m_num_edges);
         RXMESH_TRACE("Input is {} edge manifold",
                      ((m_is_input_edge_manifold) ? "" : " Not"));
         RXMESH_TRACE("Input is {} closed", ((m_is_input_closed) ? "" : " Not"));
         RXMESH_TRACE("max valence = {}", m_max_valence);
         RXMESH_TRACE("max edge incident faces = {}", m_max_edge_incident_faces);
         RXMESH_TRACE("max face adjacent faces = {}", m_max_face_adjacent_faces);
-        RXMESH_TRACE("per-patch maximum edges references= {}", m_max_size.x);
-        RXMESH_TRACE("per-patch maximum  faces references= {}", m_max_size.y);
-        RXMESH_TRACE("per-patch maximum face count (owned)= {} ({})",
-                     m_max_faces_per_patch, m_max_owned_faces_per_patch);
-        RXMESH_TRACE("per-patch maximum edge count (owned) = {} ({})",
-                     m_max_edges_per_patch, m_max_owned_edges_per_patch);
-        RXMESH_TRACE("per-patch maximum vertex count (owned)= {} ({})",
-                     m_max_vertices_per_patch, m_max_owned_vertices_per_patch);
+        RXMESH_TRACE("per-patch maximum face count = {}",
+                     m_max_faces_per_patch);
+        RXMESH_TRACE("per-patch maximum edge count = {}",
+                     m_max_edges_per_patch);
+        RXMESH_TRACE("per-patch maximum vertex count = {}",
+                     m_max_vertices_per_patch);
+        RXMESH_TRACE("per-patch maximum not-owned face count = {}",
+                     m_max_not_owned_faces);
+        RXMESH_TRACE("per-patch maximum not-owned edge count = {}",
+                     m_max_not_owned_edges);
+        RXMESH_TRACE("per-patch maximum not-owned vertex count = {}",
+                     m_max_not_owned_vertices);
     }
-    //===============================
-
-    m_max_ele_count = std::max(m_num_edges, m_num_faces);
-    m_max_ele_count = std::max(m_num_vertices, m_max_ele_count);
 }
 
-template <uint32_t patchSize>
-void RXMesh<patchSize>::build_patch_locally(const uint32_t patch_id)
+RXMesh::~RXMesh()
 {
-    // Build the patch in local index space
-    // This is the two small matrices defining incident relation between
-    // edge-vertices and faces-edges along with the mapping from local to
-    // global space for vertices, edge, and faces
-
-    // We we create a new patch, we make sure that the elements owned by the
-    // patch will have local indices lower than any elements (of the same type)
-    // that is not owned by the patch
-    const uint32_t *p_val(m_patcher->get_patches_val()),
-        *p_off(m_patcher->get_patches_offset());
-
-
-    // patch start and end
-    const uint32_t p_start = (patch_id == 0) ? 0 : p_off[patch_id - 1];
-    const uint32_t p_end = p_off[patch_id];
-    const uint32_t r_start =
-        (patch_id == 0) ? 0 :
-                          m_patcher->get_external_ribbon_offset()[patch_id - 1];
-    const uint32_t r_end = m_patcher->get_external_ribbon_offset()[patch_id];
-
-    const uint32_t total_patch_num_faces =
-        (p_end - p_start) + (r_end - r_start);
-    uint16_t total_patch_num_edges(0), total_patch_num_vertices(0);
-
-    assert(total_patch_num_faces <= m_num_faces);
-
-    //** faces
-    // container for this patch local faces i.e., face incident edges
-    std::vector<uint16_t> fp(m_face_degree * total_patch_num_faces);
-
-    // the mapping from this patch local space (uint16_t) to global one
-    std::vector<uint32_t> f_ltog(total_patch_num_faces);
-
-    //** edges
-    // container for this patch local edges i.e., edge incident vertices
-    std::vector<uint16_t> ep;
-
-    // the mapping from this patch local space to global one
-    std::vector<uint32_t> e_ltog;
-
-    //** vertices
-    // the mapping from this patch local space to global one
-    std::vector<uint32_t> v_ltog;
-
-    // count the number of elements owned and not owned by the patch
-    uint16_t              num_edges_owned(0), num_vertices_owned(0);
-    std::vector<uint32_t> tmp_e, tmp_v;
-    tmp_e.reserve(patchSize * 3);
-    tmp_v.reserve(patchSize);
-    auto insert_if_not_found = [](uint32_t               index,
-                                  std::vector<uint32_t>& tmp) -> uint32_t {
-        for (uint32_t i = 0; i < tmp.size(); ++i) {
-            if (tmp[i] == index) {
-                return INVALID32;
-            }
-        }
-        tmp.push_back(index);
-        return static_cast<uint32_t>(tmp.size() - 1);
-    };
-    auto count_num_elements = [&](uint32_t global_f) {
-        for (uint32_t j = 0; j < 3; j++) {
-            // find the edge global id
-            uint32_t global_v0 = m_fvn[global_f][j];
-            uint32_t global_v1 = m_fvn[global_f][(j + 1) % 3];
-
-            // find the edge in m_edge_map with v0,v1
-            std::pair<uint32_t, uint32_t> my_edge =
-                edge_key(global_v0, global_v1);
-            uint32_t global_e = get_edge_id(my_edge);
-
-            uint32_t v_index = insert_if_not_found(global_v0, tmp_v);
-            if (v_index != INVALID32) {
-                total_patch_num_vertices++;
-                if (m_patcher->get_vertex_patch_id(global_v0) == patch_id) {
-                    num_vertices_owned++;
-                }
-            }
-
-            uint32_t e_index = insert_if_not_found(global_e, tmp_e);
-            if (e_index != INVALID32) {
-                total_patch_num_edges++;
-                if (m_patcher->get_edge_patch_id(global_e) == patch_id) {
-                    num_edges_owned++;
-                }
-            }
-        }
-    };
-    for (uint32_t s = p_start; s < p_end; ++s) {
-        uint32_t global_f = p_val[s];
-        count_num_elements(global_f);
-    }
-    for (uint32_t s = r_start; s < r_end; ++s) {
-        uint32_t global_f = m_patcher->get_external_ribbon_val()[s];
-        count_num_elements(global_f);
-    }
-
-    // 1) loop over patch faces
-    e_ltog.resize(total_patch_num_edges);
-    v_ltog.resize(total_patch_num_vertices);
-    ep.resize(total_patch_num_edges * 2);
-
-    // to track how many faces/edges/vertices we have locally created so far
-    uint16_t faces_count(0), edges_owned_count(0), edges_not_owned_count(0),
-        vertices_owned_count(0), vertices_not_owned_count(0);
-    for (uint32_t s = p_start; s < p_end; ++s) {
-        uint32_t global_f = p_val[s];
-        create_new_local_face(patch_id, global_f, m_fvn[global_f], faces_count,
-                              edges_owned_count, edges_not_owned_count,
-                              vertices_owned_count, vertices_not_owned_count,
-                              num_edges_owned, num_vertices_owned, f_ltog,
-                              e_ltog, v_ltog, fp, ep);
-    }
-
-
-    // 2) loop over ribbon faces
-    for (uint32_t s = r_start; s < r_end; ++s) {
-        uint32_t global_f = m_patcher->get_external_ribbon_val()[s];
-        create_new_local_face(patch_id, global_f, m_fvn[global_f], faces_count,
-                              edges_owned_count, edges_not_owned_count,
-                              vertices_owned_count, vertices_not_owned_count,
-                              num_edges_owned, num_vertices_owned, f_ltog,
-                              e_ltog, v_ltog, fp, ep);
-    }
-
-    if (vertices_owned_count != num_vertices_owned ||
-        edges_owned_count != num_edges_owned ||
-        edges_owned_count + edges_not_owned_count != total_patch_num_edges ||
-        vertices_owned_count + vertices_not_owned_count !=
-            total_patch_num_vertices) {
-        RXMESH_ERROR("RXMesh::build_patch_locally() patch is " +
-                     std::to_string(patch_id) + " not built correctly!!");
+    for (uint32_t p = 0; p < m_num_patches; ++p) {
+        free(m_h_patches_info[p].not_owned_patch_v);
+        free(m_h_patches_info[p].not_owned_patch_e);
+        free(m_h_patches_info[p].not_owned_patch_f);
+        free(m_h_patches_info[p].not_owned_id_v);
+        free(m_h_patches_info[p].not_owned_id_e);
+        free(m_h_patches_info[p].not_owned_id_f);
     }
 
+    // m_d_patches_info is a device pointer to structs that themselves hold
+    // device pointers which we cannot dereference on the host. So we copy
+    // these structs back to the host by re-using m_h_patches_info, free the
+    // memory their pointers point to, and finally free the parent allocation.
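+    // (The host-side not-owned arrays were already freed in the loop above;
+    // the host ev/fe pointers alias m_h_patches_ev/m_h_patches_fe and are
+    // released with those containers.)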
 
-    m_h_owned_size[patch_id].x = (p_end - p_start);
-    m_h_owned_size[patch_id].y = num_edges_owned;
-    m_h_owned_size[patch_id].z = num_vertices_owned;
-
-    // faces
-    m_h_patches_faces.push_back(fp);
-    m_h_patches_ltog_f.push_back(f_ltog);
-
-
-    // edges
-    m_h_patches_edges.push_back(ep);
-    m_h_patches_ltog_e.push_back(e_ltog);
+    CUDA_ERROR(cudaMemcpy(m_h_patches_info,
+                          m_d_patches_info,
+                          m_num_patches * sizeof(PatchInfo),
+                          cudaMemcpyDeviceToHost));
 
-    // vertices
-    m_h_patches_ltog_v.push_back(v_ltog);
+    for (uint32_t p = 0; p < m_num_patches; ++p) {
+        GPU_FREE(m_h_patches_info[p].not_owned_patch_v);
+        GPU_FREE(m_h_patches_info[p].not_owned_patch_e);
+        GPU_FREE(m_h_patches_info[p].not_owned_patch_f);
+        GPU_FREE(m_h_patches_info[p].not_owned_id_v);
+        GPU_FREE(m_h_patches_info[p].not_owned_id_e);
+        GPU_FREE(m_h_patches_info[p].not_owned_id_f);
+        GPU_FREE(m_h_patches_info[p].ev);
+        GPU_FREE(m_h_patches_info[p].fe);
+    }
+    GPU_FREE(m_d_patches_info);
+    free(m_h_patches_info);
 }
 
-template <uint32_t patchSize>
-uint16_t RXMesh<patchSize>::create_new_local_face(
-    const uint32_t               patch_id,
-    const uint32_t               global_f,
-    const std::vector<uint32_t>& fv,
-    uint16_t&                    faces_count,
-    uint16_t&                    edges_owned_count,
-    uint16_t&                    edges_not_owned_count,
-    uint16_t&                    vertices_owned_count,
-    uint16_t&                    vertices_not_owned_count,
-    const uint16_t               num_edges_owned,
-    const uint16_t               num_vertices_owned,
-    std::vector<uint32_t>&       f_ltog,
-    std::vector<uint32_t>&       e_ltog,
-    std::vector<uint32_t>&       v_ltog,
-    std::vector<uint16_t>&       fp,
-    std::vector<uint16_t>&       ep)
+void RXMesh::build(const std::vector<std::vector<uint32_t>>& fv)
 {
+    std::vector<uint32_t>              ff_values;
+    std::vector<uint32_t>              ff_offset;
+    std::vector<std::vector<uint32_t>> ef;
+    build_supporting_structures(fv, ef, ff_offset, ff_values);
 
-    uint16_t local_f = faces_count++;
-    f_ltog[local_f] = global_f;
-
-    // shift to left and set first bit to 1 if global_f's patch is this patch
-    f_ltog[local_f] = f_ltog[local_f] << 1;
-    f_ltog[local_f] =
-        f_ltog[local_f] | (m_patcher->get_face_patch_id(global_f) == patch_id);
-
-    auto find_increment_index =
-        [&patch_id](const uint32_t& global, std::vector<uint32_t>& vect,
-                    uint16_t& owned_count, uint16_t& not_owned_count,
-                    const uint16_t num_owned, bool& incremented,
-                    const uint32_t ele_patch) -> uint16_t {
-        incremented = true;
-
-        for (uint16_t id = 0; id < owned_count; ++id) {
-            if (global == (vect[id] >> 1)) {
-                incremented = false;
-                return id;
-            }
-        }
-
-        for (uint16_t id = num_owned; id < num_owned + not_owned_count; ++id) {
-            if (global == (vect[id] >> 1)) {
-                incremented = false;
-                return id;
-            }
-        }
-        uint32_t to_store = (global << 1);
-        uint16_t ret_id;
-        if (ele_patch == patch_id) {
-            to_store = to_store | 1;
-            ret_id = owned_count++;
-        } else {
-            ret_id = num_owned + (not_owned_count++);
-        }
-        vect[ret_id] = to_store;
-        return ret_id;
-    };
-
-    for (uint32_t j = 0; j < m_face_degree; j++) {
-
-        // find the edge global id
-        uint32_t global_v0 = fv[j];
-        uint32_t global_v1 = fv[(j + 1) % m_face_degree];
-
-        // find the edge in m_edge_map with v0,v1
-        std::pair<uint32_t, uint32_t> my_edge = edge_key(global_v0, global_v1);
+    m_patcher = std::make_unique<patcher::Patcher>(m_patch_size,
+                                                   ff_offset,
+                                                   ff_values,
+                                                   fv,
+                                                   m_edges_map,
+                                                   m_num_vertices,
+                                                   m_num_edges,
+                                                   m_quite);
 
-        assert(my_edge.first == global_v0 || my_edge.first == global_v1);
-        assert(my_edge.second == global_v0 || my_edge.second == global_v1);
+    m_num_patches = m_patcher->get_num_patches();
 
-        int dir = 1;
-        if (my_edge.first == global_v0 && my_edge.second == global_v1) {
-            dir = 0;
-        }
+    m_h_patches_ltog_f.resize(m_num_patches);
+    m_h_patches_ltog_e.resize(m_num_patches);
+    m_h_patches_ltog_v.resize(m_num_patches);
+    m_h_num_owned_f.resize(m_num_patches);
+    m_h_num_owned_v.resize(m_num_patches);
+    m_h_num_owned_e.resize(m_num_patches);
+    m_h_patches_fe.resize(m_num_patches);
+    m_h_patches_ev.resize(m_num_patches);
 
-        uint32_t global_e = get_edge_id(my_edge);
-
-        // convert edge to local index by searching for it. if not
-        // found, then increment the number of local edges
-        bool     new_e(false);
-        uint16_t local_e = find_increment_index(
-            global_e, e_ltog, edges_owned_count, edges_not_owned_count,
-            num_edges_owned, new_e, m_patcher->get_edge_patch_id(global_e));
-
-        if (new_e) {
-            // if it is new edges, then we need to either look for
-            // its vertices. if there were inserted before in the
-            // patch, then retrieve their local id. otherwise, we
-            // new vertices to the patch
-            assert(my_edge.first != my_edge.second);
-
-            bool     new_v(false);
-            uint16_t local_v0 = find_increment_index(
-                my_edge.first, v_ltog, vertices_owned_count,
-                vertices_not_owned_count, num_vertices_owned, new_v,
-                m_patcher->get_vertex_patch_id(my_edge.first));
-
-            uint16_t local_v1 = find_increment_index(
-                my_edge.second, v_ltog, vertices_owned_count,
-                vertices_not_owned_count, num_vertices_owned, new_v,
-                m_patcher->get_vertex_patch_id(my_edge.second));
-
-            assert(local_v0 != local_v1);
-
-            // new edges are appended in the end of e_ltog
-            // and so as their vertices in ep
-            ep[2 * local_e] = local_v0;
-            ep[2 * local_e + 1] = local_v1;
-        }
-        // shift local_e to left
-        // set the first bit to 1 if (dir ==1)
-        local_e = local_e << 1;
-        local_e = local_e | (dir & 1);
-        fp[local_f * m_face_degree + j] = local_e;
+#pragma omp parallel for
+    for (int p = 0; p < static_cast<int>(m_num_patches); ++p) {
+        build_single_patch(fv, p);
     }
 
-    return local_f;
+    calc_statistics(fv, ef);
 }
 
-template <uint32_t patchSize>
-void RXMesh<patchSize>::set_num_vertices(
-    const std::vector<std::vector<uint32_t>>& fv)
+void RXMesh::build_supporting_structures(
+    const std::vector<std::vector<uint32_t>>& fv,
+    std::vector<std::vector<uint32_t>>&       ef,
+    std::vector<uint32_t>&                    ff_offset,
+    std::vector<uint32_t>&                    ff_values)
 {
+    m_num_faces    = static_cast<uint32_t>(fv.size());
     m_num_vertices = 0;
-    for (uint32_t i = 0; i < fv.size(); ++i) {
-        if (fv[i].size() != 3) {
-            RXMESH_ERROR("RXMesh::count_vertices() Face" + std::to_string(i) +
-                         " is not triangles. Non-triangular faces are not "
-                         "supported yet");
-        }
-        for (uint32_t j = 0; j < fv[i].size(); ++j) {
-            m_num_vertices = std::max(m_num_vertices, fv[i][j]);
-        }
-    }
-    ++m_num_vertices;
-}
-
-
-template <uint32_t patchSize>
-void RXMesh<patchSize>::populate_edge_map(
-    const std::vector<std::vector<uint32_t>>& fv)
-{
-
-    // create edges and populate edge_map
-    // and also compute max valence
+    m_num_edges    = 0;
+    m_edges_map.clear();
 
-    m_edges_map.reserve(m_num_faces * 3);  // upper bound
-
-    std::vector<uint32_t> vv_count(m_num_vertices, 0);
-    m_max_valence = 0;
+    // Reserve assuming a closed manifold triangle mesh where each face has 3
+    // edges and each edge is shared by 2 faces, i.e., #E = 1.5 #F
+    ef.clear();
+    uint32_t reserve_size =
+        static_cast<uint32_t>(1.5f * static_cast<float>(m_num_faces));
+    ef.reserve(reserve_size);
+    m_edges_map.reserve(reserve_size);
 
-    for (uint32_t f = 0; f < m_num_faces; ++f) {
+    std::vector<uint32_t> ff_size(m_num_faces, 0);
 
-        if (fv[f].size() < 3) {
+    for (uint32_t f = 0; f < fv.size(); ++f) {
+        if (fv[f].size() != 3) {
             RXMESH_ERROR(
-                "RXMesh::populate_edge_map() Face {} has less than three "
-                "vertices",
+                "rxmesh::build_supporting_structures() Face {} is not "
+                "triangle. Non-triangular faces are not supported",
                 f);
+            exit(EXIT_FAILURE);
         }
-        for (uint32_t j = 0; j < fv[f].size(); ++j) {
-
-            uint32_t v0 = fv[f][j];
-            uint32_t v1 = (j != fv[f].size() - 1) ? fv[f][j + 1] : fv[f][0];
-
-            std::pair<uint32_t, uint32_t> my_edge = edge_key(v0, v1);
 
-            typename std::unordered_map<std::pair<uint32_t, uint32_t>, uint32_t,
-                                        edge_key_hash>::const_iterator e_it =
-                m_edges_map.find(my_edge);
+        for (uint32_t v = 0; v < fv[f].size(); ++v) {
+            uint32_t v0 = fv[f][v];
+            uint32_t v1 = fv[f][(v + 1) % 3];
 
-            if (e_it == m_edges_map.end()) {
-                m_edges_map.insert(std::make_pair(my_edge, m_num_edges++));
+            m_num_vertices = std::max(m_num_vertices, v0);
 
-                vv_count[v0]++;
-                vv_count[v1]++;
+            std::pair<uint32_t, uint32_t> edge   = detail::edge_key(v0, v1);
+            auto                          e_iter = m_edges_map.find(edge);
+            if (e_iter == m_edges_map.end()) {
+                uint32_t edge_id = m_num_edges++;
+                m_edges_map.insert(std::make_pair(edge, edge_id));
+                std::vector<uint32_t> tmp(1, f);
+                ef.push_back(tmp);
+            } else {
+                uint32_t edge_id = (*e_iter).second;
 
-                // also set max valence
-                if (m_max_valence < vv_count[v0]) {
-                    m_max_valence = vv_count[v0];
-                    m_max_valence_vertex_id = v0;
-                }
-                if (m_max_valence < vv_count[v1]) {
-                    m_max_valence = vv_count[v1];
-                    m_max_valence_vertex_id = v1;
+                for (uint32_t f0 = 0; f0 < ef[edge_id].size(); ++f0) {
+                    uint32_t other_face = ef[edge_id][f0];
+                    ++ff_size[other_face];
                 }
+                ff_size[f] += ef[edge_id].size();
+
+                ef[edge_id].push_back(f);
             }
         }
     }
-}
+    ++m_num_vertices;
 
-template <uint32_t patchSize>
-void RXMesh<patchSize>::edge_incident_faces(
-    const std::vector<std::vector<uint32_t>>& fv,
-    std::vector<std::vector<uint32_t>>&       ef)
-{
-    // populate ef by the faces incident to each edge
-    // must call populate_edge_map before call it
+    if (m_num_edges != static_cast<uint32_t>(m_edges_map.size())) {
+        RXMESH_ERROR(
+            "rxmesh::build_supporting_structures() m_num_edges ({}) should "
+            "match the size of edge_map ({})",
+            m_num_edges,
+            m_edges_map.size());
+        exit(EXIT_FAILURE);
+    }
 
-    assert(m_edges_map.size() > 0);
+    ff_offset.resize(m_num_faces);
+    std::inclusive_scan(ff_size.begin(), ff_size.end(), ff_offset.begin());
+    ff_values.clear();
+    ff_values.resize(ff_offset.back());
+    std::fill(ff_size.begin(), ff_size.end(), 0);
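+    // Second pass: ff_size is re-used as a per-face write cursor while the
+    // loop below scatters adjacent faces into the CSR-style ff arrays.
+    // Illustrative example (not from the input): two triangles sharing one
+    // edge give ff_size = {1, 1}, hence ff_offset = {1, 2}, and the loop
+    // writes ff_values = {1, 0}, i.e., face 0 is adjacent to face 1 and
+    // face 1 is adjacent to face 0.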
 
-    uint32_t num_edges = static_cast<uint32_t>(m_edges_map.size());
+    for (uint32_t e = 0; e < m_num_edges; ++e) {
+        for (uint32_t i = 0; i < ef[e].size(); ++i) {
+            uint32_t f0 = ef[e][i];
+            for (uint32_t j = i + 1; j < ef[e].size(); ++j) {
+                uint32_t f1 = ef[e][j];
+
+                uint32_t f0_offset = ff_size[f0]++;
+                uint32_t f1_offset = ff_size[f1]++;
+                f0_offset += (f0 == 0) ? 0 : ff_offset[f0 - 1];
+                f1_offset += (f1 == 0) ? 0 : ff_offset[f1 - 1];
+
+                ff_values[f0_offset] = f1;
+                ff_values[f1_offset] = f0;
+            }
+        }
+    }
+}
 
-    // reserve space assuming mesh is mostly manifold (edge is shared by
-    // two faces)
-    ef.clear();
-    ef.resize(num_edges, std::vector<uint32_t>(0));
-    for (uint32_t e = 0; e < num_edges; ++e) {
-        ef[e].reserve(2);
+void RXMesh::calc_statistics(const std::vector<std::vector<uint32_t>>& fv,
+                             const std::vector<std::vector<uint32_t>>& ef)
+{
+    if (m_num_vertices == 0 || m_num_faces == 0 || m_num_edges == 0 ||
+        fv.size() == 0 || ef.size() == 0) {
+        RXMESH_ERROR(
+            "RXMesh::calc_statistics() input mesh has not been initialized");
+        exit(EXIT_FAILURE);
     }
 
+    // compute max valence, max edge-incident faces, and whether the input is
+    // closed and edge-manifold
     m_max_edge_incident_faces = 0;
-    for (uint32_t f = 0; f < m_num_faces; ++f) {
+    m_max_valence             = 0;
+    std::vector<uint32_t> vv_count(m_num_vertices, 0);
+    m_is_input_closed        = true;
+    m_is_input_edge_manifold = true;
+    for (auto& e_iter : m_edges_map) {
+        uint32_t v0 = e_iter.first.first;
+        uint32_t v1 = e_iter.first.second;
 
-        for (uint32_t j = 0; j < fv[f].size(); ++j) {
+        vv_count[v0]++;
+        vv_count[v1]++;
 
-            uint32_t v0 = fv[f][j];
-            uint32_t v1 = (j != fv[f].size() - 1) ? fv[f][j + 1] : fv[f][0];
+        m_max_valence = std::max(m_max_valence, vv_count[v0]);
+        m_max_valence = std::max(m_max_valence, vv_count[v1]);
 
-            uint32_t edge_num = get_edge_id(v0, v1);
-            ef[edge_num].push_back(f);
-            m_max_edge_incident_faces = std::max(m_max_edge_incident_faces,
-                                                 uint32_t(ef[edge_num].size()));
+        uint32_t edge_id = e_iter.second;
+        m_max_edge_incident_faces =
+            std::max(m_max_edge_incident_faces, uint32_t(ef[edge_id].size()));
+
+        if (ef[edge_id].size() < 2) {
+            m_is_input_closed = false;
+        }
+        if (ef[edge_id].size() > 2) {
+            m_is_input_edge_manifold = false;
         }
     }
 
-    // calc m_max_face_adjacent_faces
+    // calc max ff
     m_max_face_adjacent_faces = 0;
-    for (uint32_t f = 0; f < m_num_faces; ++f) {
+    for (uint32_t f = 0; f < fv.size(); ++f) {
         uint32_t ff_count = 0;
-        for (uint32_t j = 0; j < fv[f].size(); ++j) {
-            uint32_t v0 = fv[f][j];
-            uint32_t v1 = (j != fv[f].size() - 1) ? fv[f][j + 1] : fv[f][0];
+        for (uint32_t v = 0; v < fv[f].size(); ++v) {
+            uint32_t v0       = fv[f][v];
+            uint32_t v1       = fv[f][(v + 1) % 3];
             uint32_t edge_num = get_edge_id(v0, v1);
             ff_count += ef[edge_num].size() - 1;
         }
         m_max_face_adjacent_faces =
             std::max(ff_count, m_max_face_adjacent_faces);
     }
+
+    // max number of vertices/edges/faces per patch
+    m_max_vertices_per_patch = 0;
+    m_max_edges_per_patch    = 0;
+    m_max_faces_per_patch    = 0;
+    for (uint32_t p = 0; p < m_num_patches; ++p) {
+        m_max_vertices_per_patch = std::max(
+            m_max_vertices_per_patch, uint32_t(m_h_patches_ltog_v[p].size()));
+        m_max_edges_per_patch = std::max(
+            m_max_edges_per_patch, uint32_t(m_h_patches_ltog_e[p].size()));
+        m_max_faces_per_patch = std::max(
+            m_max_faces_per_patch, uint32_t(m_h_patches_ltog_f[p].size()));
+    }
 }
 
-template <uint32_t patchSize>
-uint32_t RXMesh<patchSize>::get_edge_id(const uint32_t v0,
-                                        const uint32_t v1) const
+void RXMesh::calc_max_not_owned_elements()
 {
-    // v0 and v1 are two vertices in global space. we return the edge
-    // id in global space also (by querying m_edges_map)
-    assert(m_edges_map.size() != 0);
-
-    std::pair<uint32_t, uint32_t> edge = edge_key(v0, v1);
-
-    assert(edge.first == v0 || edge.first == v1);
-    assert(edge.second == v0 || edge.second == v1);
-
-    return get_edge_id(edge);
+    m_max_not_owned_vertices = 0;
+    m_max_not_owned_edges    = 0;
+    m_max_not_owned_faces    = 0;
+
+    for (int p = 0; p < static_cast<int>(m_num_patches); ++p) {
+        m_max_not_owned_vertices =
+            std::max(m_max_not_owned_vertices,
+                     uint32_t(m_h_patches_info[p].num_vertices -
+                              m_h_patches_info[p].num_owned_vertices));
+
+        m_max_not_owned_edges =
+            std::max(m_max_not_owned_edges,
+                     uint32_t(m_h_patches_info[p].num_edges -
+                              m_h_patches_info[p].num_owned_edges));
+
+        m_max_not_owned_faces =
+            std::max(m_max_not_owned_faces,
+                     uint32_t(m_h_patches_info[p].num_faces -
+                              m_h_patches_info[p].num_owned_faces));
+    }
 }
 
-template <uint32_t patchSize>
-uint32_t RXMesh<patchSize>::get_edge_id(
-    const std::pair<uint32_t, uint32_t>& edge) const
+void RXMesh::build_single_patch(const std::vector<std::vector<uint32_t>>& fv,
+                                const uint32_t patch_id)
 {
-    uint32_t edge_id = -1;
-    try {
-        edge_id = m_edges_map.at(edge);
-    } catch (const std::out_of_range&) {
-        RXMESH_ERROR(
-            "RXMesh::get_edge_id() mapping edges went wrong."
-            " Can not find an edge connecting vertices {} and {}",
-            edge.first, edge.second);
-    }
+    // Build the patch local index space
+    // These are the two small matrices defining the incidence relation between
+    // edge-vertices and faces-edges (i.e., the topology) along with the
+    // mapping from local to global space for vertices, edges, and faces
 
-    return edge_id;
+    // When we create a new patch, we make sure that the elements owned by the
+    // patch will have local indices lower than any other elements (of the same
+    // type) that are not owned by the patch.
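+    // For example (illustrative numbers): in a patch with 120 faces of which
+    // 100 are owned, the owned faces take local indices 0..99 and the ribbon
+    // (not-owned) faces take local indices 100..119.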
+
+    build_single_patch_ltog(fv, patch_id);
+
+    build_single_patch_topology(fv, patch_id);
 }
-//**************************************************************************
 
-//********************** sort
-template <uint32_t patchSize>
-void RXMesh<patchSize>::sort(std::vector<std::vector<uint32_t>>& fv,
-                             std::vector<std::vector<coordT>>&   coordinates)
+void RXMesh::build_single_patch_ltog(
+    const std::vector<std::vector<uint32_t>>& fv,
+    const uint32_t                            patch_id)
 {
-    if (m_num_patches == 1) {
-        return;
-    }
-    std::vector<uint32_t> new_face_id(m_num_faces, INVALID32);
-    std::vector<uint32_t> new_vertex_id(m_num_vertices, INVALID32);
-    std::vector<uint32_t> new_edge_id(m_num_edges, INVALID32);
-
-    const uint32_t* patches_offset = m_patcher->get_patches_offset();
-    const uint32_t* patches_val = m_patcher->get_patches_val();
-
-    // patch status:
-    // 1) 0: has not been processed/seen before
-    // 2) 1: currently in the queue
-    // 3) 2: has been processed (assigned new id)
-    std::vector<uint32_t> patch_status(m_num_patches, 0);
-
-    std::queue<uint32_t> patch_queue;
-    patch_queue.push(0);
-    uint32_t face_counter = 0;
-    uint32_t vertex_counter = 0;
-    uint32_t edge_counter = 0;
-
-    //*****Compute new ID for faces, edges, and vertices
-    while (true) {
-
-        std::queue<uint32_t> patch_queue;
-
-        for (uint32_t p = 0; p < m_num_patches; ++p) {
-            if (patch_status[p] == 0) {
-                patch_queue.push(p);
-                patch_status[p] = 1;
-                break;
-            }
-        }
+    // patch start and end
+    const uint32_t p_start =
+        (patch_id == 0) ? 0 : m_patcher->get_patches_offset()[patch_id - 1];
+    const uint32_t p_end = m_patcher->get_patches_offset()[patch_id];
 
-        // this happens when all patches has been processed
-        if (patch_queue.empty()) {
-            break;
-        }
+    // ribbon start and end
+    const uint32_t r_start =
+        (patch_id == 0) ? 0 :
+                          m_patcher->get_external_ribbon_offset()[patch_id - 1];
+    const uint32_t r_end = m_patcher->get_external_ribbon_offset()[patch_id];
 
 
-        while (patch_queue.size() > 0) {
-            uint32_t p = patch_queue.front();
-            patch_queue.pop();
-            patch_status[p] = 2;
-
-            uint32_t p_start = (p == 0) ? 0 : patches_offset[p - 1];
-            uint32_t p_end = patches_offset[p];
-            // first loop over p's faces and assigned its faces new id
-            for (uint32_t f = p_start; f < p_end; ++f) {
-                uint32_t face = patches_val[f];
-                new_face_id[face] = face_counter++;
-
-                // assign face's vertices new id
-                for (uint32_t v = 0; v < 3; ++v) {
-                    uint32_t vertex = m_fvn[face][v];
-                    // if the vertex is owned by this patch
-                    if (m_patcher->get_vertex_patch_id(vertex) == p &&
-                        new_vertex_id[vertex] == INVALID32) {
-                        new_vertex_id[vertex] = vertex_counter++;
-                    }
-                }
+    const uint32_t total_patch_num_faces =
+        (p_end - p_start) + (r_end - r_start);
+    m_h_patches_ltog_f[patch_id].resize(total_patch_num_faces);
+    m_h_patches_ltog_v[patch_id].resize(3 * total_patch_num_faces);
+    m_h_patches_ltog_e[patch_id].resize(3 * total_patch_num_faces);
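+    // Each face contributes its 3 vertices and 3 edges, so ltog_v/ltog_e are
+    // over-allocated with duplicates here; create_unique_mapping below
+    // shrinks them to the unique elements.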
 
+    auto add_new_face = [&](uint32_t global_face_id, uint16_t local_face_id) {
+        m_h_patches_ltog_f[patch_id][local_face_id] = global_face_id;
 
-                // assign face's edge new id
-                uint32_t v1 = 2;
-                for (uint32_t v0 = 0; v0 < 3; ++v0) {
-                    uint32_t vertex0 = m_fvn[face][v0];
-                    uint32_t vertex1 = m_fvn[face][v1];
-                    uint32_t edge = get_edge_id(vertex0, vertex1);
+        for (uint32_t v = 0; v < 3; ++v) {
+            uint32_t v0 = fv[global_face_id][v];
+            uint32_t v1 = fv[global_face_id][(v + 1) % 3];
 
-                    // if the edge is owned by this patch
-                    if (m_patcher->get_edge_patch_id(edge) == p &&
-                        new_edge_id[edge] == INVALID32) {
-                        new_edge_id[edge] = edge_counter++;
-                    }
-                    v1 = v0;
-                }
-            }
+            uint32_t edge_id = get_edge_id(v0, v1);
 
-            // second loop over p's ribbon and push new patches into the queue
-            // only if there are not in the queue and the have not been
-            // processed yet.
-            uint32_t ribbon_start =
-                (p == 0) ? 0 : m_patcher->get_external_ribbon_offset()[p - 1];
-            uint32_t ribbon_end = m_patcher->get_external_ribbon_offset()[p];
-            for (uint32_t f = ribbon_start; f < ribbon_end; ++f) {
-                // this is a face in the ribbon
-                uint32_t face = m_patcher->get_external_ribbon_val()[f];
-                // get the face actual patch
-                uint32_t face_patch = m_patcher->get_face_patch_id(face);
-                assert(face_patch != p);
-                if (patch_status[face_patch] == 0) {
-                    patch_queue.push(face_patch);
-                    patch_status[face_patch] = 1;
-                }
-            }
-        }
-    }
-    if (edge_counter != m_num_edges || vertex_counter != m_num_vertices ||
-        face_counter != m_num_faces) {
-        RXMESH_ERROR("RXMesh::sort Error in assigning new IDs");
-    }
-    //**** Apply changes
-    m_max_valence_vertex_id = new_vertex_id[m_max_valence_vertex_id];
-    // coordinates
-    {
-        std::vector<std::vector<coordT>> coord_ordered(coordinates);
-        for (uint32_t v = 0; v < m_num_vertices; ++v) {
-            uint32_t new_v_id = new_vertex_id[v];
-            coord_ordered[new_v_id][0] = coordinates[v][0];
-            coord_ordered[new_v_id][1] = coordinates[v][1];
-            coord_ordered[new_v_id][2] = coordinates[v][2];
-        }
-        coordinates.swap(coord_ordered);
-    }
+            m_h_patches_ltog_v[patch_id][local_face_id * 3 + v] = v0;
 
-    // edge map
-    {
-        std::unordered_map<std::pair<uint32_t, uint32_t>, uint32_t,
-                           edge_key_hash>
-            edges_map;
-        edges_map.reserve(m_num_faces * 3);
-        for (auto& it : m_edges_map) {
-            uint32_t v0 = new_vertex_id[it.first.first];
-            uint32_t v1 = new_vertex_id[it.first.second];
-            uint32_t edge_id = new_edge_id[it.second];
-
-            std::pair<uint32_t, uint32_t> my_edge = edge_key(v0, v1);
-
-            typename std::unordered_map<std::pair<uint32_t, uint32_t>, uint32_t,
-                                        edge_key_hash>::const_iterator e_it =
-                edges_map.find(my_edge);
-
-            if (e_it == edges_map.end()) {
-                edges_map.insert(std::make_pair(my_edge, edge_id));
-            } else {
-                RXMESH_ERROR("RXMesh::sort Unknown error");
-            }
+            m_h_patches_ltog_e[patch_id][local_face_id * 3 + v] = edge_id;
         }
-        m_edges_map.swap(edges_map);
-    }
+    };
 
-    // m_fvn
-    {
-        std::vector<std::vector<uint32_t>> fvn(m_fvn);
-        for (uint32_t f = 0; f < m_fvn.size(); ++f) {
-            uint32_t new_f_id = new_face_id[f];
-            fvn[new_f_id].resize(3);
-            // v
-            fvn[new_f_id][0] = new_vertex_id[m_fvn[f][0]];
-            fvn[new_f_id][1] = new_vertex_id[m_fvn[f][1]];
-            fvn[new_f_id][2] = new_vertex_id[m_fvn[f][2]];
-
-            fv[new_f_id][0] = fvn[new_f_id][0];
-            fv[new_f_id][1] = fvn[new_f_id][1];
-            fv[new_f_id][2] = fvn[new_f_id][2];
-
-            // n
-            for (uint32_t n = 3; n < m_fvn[f].size(); ++n) {
-                fvn[new_f_id].push_back(new_face_id[m_fvn[f][n]]);
-            }
-        }
-        m_fvn.swap(fvn);
+    uint16_t local_face_id = 0;
+    for (uint32_t f = p_start; f < p_end; ++f) {
+        uint32_t face_id = m_patcher->get_patches_val()[f];
+        add_new_face(face_id, local_face_id++);
     }
 
-    // patcher
-    {
-        uint32_t* patch_val = m_patcher->get_patches_val();
-        for (uint32_t i = 0; i < m_num_faces; ++i) {
-            patch_val[i] = new_face_id[patch_val[i]];
-        }
-
-        uint32_t num_ext_ribbon_faces =
-            m_patcher->get_external_ribbon_offset()[m_num_patches - 1];
-        for (uint32_t i = 0; i < num_ext_ribbon_faces; ++i) {
-            m_patcher->get_external_ribbon_val()[i] =
-                new_face_id[m_patcher->get_external_ribbon_val()[i]];
-        }
+    for (uint32_t f = r_start; f < r_end; ++f) {
+        uint32_t face_id = m_patcher->get_external_ribbon_val()[f];
+        add_new_face(face_id, local_face_id++);
+    }
 
-        {
-            std::vector<uint32_t> face_patch(m_num_faces);
-            for (uint32_t f = 0; f < m_num_faces; ++f) {
-                uint32_t new_f_id = new_face_id[f];
-                face_patch[new_f_id] = m_patcher->get_face_patch_id(f);
-            }
-            std::memcpy(m_patcher->get_face_patch().data(), face_patch.data(),
-                        m_num_faces * sizeof(uint32_t));
-        }
 
-        {
+    auto create_unique_mapping = [&](std::vector<uint32_t>&       ltog_map,
+                                     const std::vector<uint32_t>& patch) {
+        std::sort(ltog_map.begin(), ltog_map.end());
+        auto unique_end = std::unique(ltog_map.begin(), ltog_map.end());
+        ltog_map.resize(unique_end - ltog_map.begin());
 
-            std::vector<uint32_t> vertex_patch(m_num_vertices);
-            for (uint32_t v = 0; v < m_num_vertices; ++v) {
-                uint32_t new_v_id = new_vertex_id[v];
-                vertex_patch[new_v_id] = m_patcher->get_vertex_patch_id(v);
-            }
-            std::memcpy(m_patcher->get_vertex_patch().data(),
-                        vertex_patch.data(), m_num_vertices * sizeof(uint32_t));
-        }
+        // we use stable_partition so that the owned and not-owned halves of
+        // ltog each remain sorted, which lets us binary search them when we
+        // populate the topology
+        auto part_end = std::stable_partition(
+            ltog_map.begin(), ltog_map.end(), [&patch, patch_id](uint32_t i) {
+                return patch[i] == patch_id;
+            });
+        return static_cast<uint16_t>(part_end - ltog_map.begin());
+    };
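+    // Illustrative example (made-up ids): if the patch references global
+    // faces {9, 3, 9, 7, 3, 12} and owns faces 3 and 9, then after
+    // sort+unique ltog = {3, 7, 9, 12} and after the stable partition
+    // ltog = {3, 9, 7, 12} with create_unique_mapping returning 2 (the owned
+    // count). Both the owned prefix and the not-owned suffix remain sorted.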
 
-        {
-            std::vector<uint32_t> edge_patch(m_num_edges);
-            for (uint32_t e = 0; e < m_num_edges; ++e) {
-                uint32_t new_e_id = new_edge_id[e];
-                edge_patch[new_e_id] = m_patcher->get_edge_patch_id(e);
-            }
-            std::memcpy(m_patcher->get_edge_patch().data(), edge_patch.data(),
-                        m_num_edges * sizeof(uint32_t));
-        }
-    }
+    m_h_num_owned_f[patch_id] = create_unique_mapping(
+        m_h_patches_ltog_f[patch_id], m_patcher->get_face_patch());
 
-    /*m_patcher->export_patches(coordinates);
+    m_h_num_owned_e[patch_id] = create_unique_mapping(
+        m_h_patches_ltog_e[patch_id], m_patcher->get_edge_patch());
 
-    std::vector<uint32_t> vert_id(m_num_vertices);
-    std::vector<uint32_t> face_id(m_num_faces);
-    fill_with_sequential_numbers(vert_id.data(), vert_id.size());
-    fill_with_sequential_numbers(face_id.data(), face_id.size());
-    export_attribute_VTK("sort_faces.vtk", m_fvn, coordinates,
-                         true, face_id.data(), vert_id.data(), false);
-    export_attribute_VTK("sort_vertices.vtk", m_fvn, coordinates,
-                         false, face_id.data(), vert_id.data(), false);*/
-}
-//**************************************************************************
-
-//********************** Move to Device
-template <uint32_t patchSize>
-template <typename Tin, typename Tst>
-void RXMesh<patchSize>::get_starting_ids(
-    const std::vector<std::vector<Tin>>& input,
-    std::vector<Tst>&                    starting_id)
-{
-    // get the starting ids for the mesh elements in input and store it
-    // in the first (x) component of starting_id
-
-    // uint32_t prv = 0;
-    assert(starting_id.size() > 0);
-    assert(starting_id.size() > input.size());
-    starting_id[0].x = 0;
-    for (uint32_t p = 1; p <= input.size(); ++p) {
-        starting_id[p].x = starting_id[p - 1].x + input[p - 1].size();
-        // starting_id[p].x = input[p].size() + prv;
-        // prv = starting_id[p].x;
-    }
+    m_h_num_owned_v[patch_id] = create_unique_mapping(
+        m_h_patches_ltog_v[patch_id], m_patcher->get_vertex_patch());
 }
 
-template <uint32_t patchSize>
-template <typename Tin, typename Tad>
-void RXMesh<patchSize>::get_size(const std::vector<std::vector<Tin>>& input,
-                                 std::vector<Tad>&                    ad)
+void RXMesh::build_single_patch_topology(
+    const std::vector<std::vector<uint32_t>>& fv,
+    const uint32_t                            patch_id)
 {
-    // get the size of each element of input and store it as the second(y)
-    // component in ad
-    assert(ad.size() >= input.size());
+    // patch start and end
+    const uint32_t p_start =
+        (patch_id == 0) ? 0 : m_patcher->get_patches_offset()[patch_id - 1];
+    const uint32_t p_end = m_patcher->get_patches_offset()[patch_id];
 
-    for (uint32_t p = 0; p < input.size(); ++p) {
-        ad[p].y = input[p].size();
-    }
-}
+    // ribbon start and end
+    const uint32_t r_start =
+        (patch_id == 0) ? 0 :
+                          m_patcher->get_external_ribbon_offset()[patch_id - 1];
+    const uint32_t r_end = m_patcher->get_external_ribbon_offset()[patch_id];
 
-template <uint32_t patchSize>
-template <typename T>
-void RXMesh<patchSize>::padding_to_multiple(std::vector<std::vector<T>>& input,
-                                            const uint32_t multiple,
-                                            const T        init_val)
-{
-    // resize each element on input to be mulitple of multiple by add
-    // init_val to the end
-
-    for (uint32_t p = 0; p < input.size(); ++p) {
-        const uint32_t new_size =
-            round_up_multiple(uint32_t(input[p].size()), multiple);
-        assert(new_size >= input[p].size());
-        input[p].resize(new_size, static_cast<T>(init_val));
-    }
-}
+    const uint16_t patch_num_edges = m_h_patches_ltog_e[patch_id].size();
+    const uint16_t patch_num_faces = m_h_patches_ltog_f[patch_id].size();
+
+    m_h_patches_ev[patch_id].resize(patch_num_edges * 2);
+    m_h_patches_fe[patch_id].resize(patch_num_faces * 3);
+
+    std::vector<bool> is_added_edge(patch_num_edges, false);
+
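+    // Binary-search for global_id in either the owned prefix or the not-owned
+    // suffix of the patch's ltog array (each half is sorted, see
+    // build_single_patch_ltog) and return its local index; INVALID16 signals
+    // that the search ran past the end of the searched range.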
+    auto find_local_index = [&patch_id](
+                                const uint32_t               global_id,
+                                const uint32_t               element_patch,
+                                const uint16_t               num_owned_elements,
+                                const std::vector<uint32_t>& ltog) -> uint16_t {
+        uint32_t start = 0;
+        uint32_t end   = num_owned_elements;
+        if (element_patch != patch_id) {
+            start = num_owned_elements;
+            end   = ltog.size();
+        }
+        auto it = std::lower_bound(
+            ltog.begin() + start, ltog.begin() + end, global_id);
+        if (it == ltog.begin() + end) {
+            return INVALID16;
+        } else {
+            return static_cast<uint16_t>(it - ltog.begin());
+        }
+    };
 
-template <uint32_t patchSize>
-void RXMesh<patchSize>::device_alloc_local()
-{
 
-    // allocate and transfer patch information to device
-    // make sure to build_local first before calling this
+    auto add_new_face = [&](const uint32_t global_face_id) {
+        const uint16_t local_face_id =
+            find_local_index(global_face_id,
+                             m_patcher->get_face_patch_id(global_face_id),
+                             m_h_num_owned_f[patch_id],
+                             m_h_patches_ltog_f[patch_id]);
 
-    // storing the start id(x) and element count(y)
-    m_h_ad_size_ltog_v.resize(m_num_patches + 1);
-    m_h_ad_size_ltog_e.resize(m_num_patches + 1);
-    m_h_ad_size_ltog_f.resize(m_num_patches + 1);
-    m_h_ad_size.resize(m_num_patches + 1);
+        for (uint32_t v = 0; v < 3; ++v) {
 
-    // get mesh element count per patch
-    get_size(m_h_patches_ltog_v, m_h_ad_size_ltog_v);
-    get_size(m_h_patches_ltog_e, m_h_ad_size_ltog_e);
-    get_size(m_h_patches_ltog_f, m_h_ad_size_ltog_f);
 
-    // how many edges and faces we have in each patch
-    for (uint32_t p = 0; p < m_num_patches; ++p) {
-        m_h_ad_size[p].y = m_h_ad_size_ltog_e[p].y * 2;  // edges size
-        m_h_ad_size[p].w =
-            m_h_ad_size_ltog_f[p].y * m_face_degree;  // faces size
-    }
+            const uint32_t global_v0 = fv[global_face_id][v];
+            const uint32_t global_v1 = fv[global_face_id][(v + 1) % 3];
 
+            std::pair<uint32_t, uint32_t> edge_key =
+                detail::edge_key(global_v0, global_v1);
 
-    // increase to multiple so that each vector size is multiple of 32
-    // so that when we copy it to the device, read will be coalesced
-    padding_to_multiple(m_h_patches_edges, WARPSIZE,
-                        static_cast<uint16_t>(INVALID16));
-    padding_to_multiple(m_h_patches_faces, WARPSIZE,
-                        static_cast<uint16_t>(INVALID16));
-    padding_to_multiple(m_h_patches_ltog_v, WARPSIZE,
-                        static_cast<uint32_t>(INVALID32));
-    padding_to_multiple(m_h_patches_ltog_e, WARPSIZE,
-                        static_cast<uint32_t>(INVALID32));
-    padding_to_multiple(m_h_patches_ltog_f, WARPSIZE,
-                        static_cast<uint32_t>(INVALID32));
-
-    // get the starting id of each patch
-    std::vector<uint1> h_edges_ad(m_num_patches + 1),
-        h_faces_ad(m_num_patches + 1);
-
-    get_starting_ids(m_h_patches_ltog_v, m_h_ad_size_ltog_v);
-    get_starting_ids(m_h_patches_ltog_e, m_h_ad_size_ltog_e);
-    get_starting_ids(m_h_patches_ltog_f, m_h_ad_size_ltog_f);
-    get_starting_ids(m_h_patches_edges, h_edges_ad);
-    get_starting_ids(m_h_patches_faces, h_faces_ad);
-
-    // m_h_ad_size[0].x = m_h_ad_size[0].z = 0;
-    for (uint32_t p = 0; p <= m_num_patches; ++p) {
-        m_h_ad_size[p].x = h_edges_ad[p].x;  // edges address
-        m_h_ad_size[p].z = h_faces_ad[p].x;  // faces address
-    }
+            assert(edge_key.first == global_v0 || edge_key.first == global_v1);
+            assert(edge_key.second == global_v0 ||
+                   edge_key.second == global_v1);
 
+            int dir = 1;
+            if (edge_key.first == global_v0 && edge_key.second == global_v1) {
+                dir = 0;
+            }
 
-    // alloc mesh data
-    CUDA_ERROR(cudaMalloc((void**)&m_d_patches_ltog_v,
-                          sizeof(uint32_t) * m_h_ad_size_ltog_v.back().x));
-    CUDA_ERROR(cudaMalloc((void**)&m_d_patches_ltog_e,
-                          sizeof(uint32_t) * m_h_ad_size_ltog_e.back().x));
-    CUDA_ERROR(cudaMalloc((void**)&m_d_patches_ltog_f,
-                          sizeof(uint32_t) * m_h_ad_size_ltog_f.back().x));
-    CUDA_ERROR(cudaMalloc((void**)&m_d_patches_edges,
-                          sizeof(uint16_t) * m_h_ad_size.back().x));
-    CUDA_ERROR(cudaMalloc((void**)&m_d_patches_faces,
-                          sizeof(uint16_t) * m_h_ad_size.back().z));
-    if (!m_quite) {
-        uint32_t patch_local_storage =
-            sizeof(uint16_t) * (m_h_ad_size.back().x + m_h_ad_size.back().z) +
-            sizeof(uint32_t) *
-                (m_h_ad_size_ltog_v.back().x + m_h_ad_size_ltog_e.back().x +
-                 m_h_ad_size_ltog_f.back().x);
-        uint32_t patch_membership_storage =
-            (m_num_faces + m_num_edges + m_num_vertices) * sizeof(uint32_t);
-        m_total_gpu_storage_mb =
-            double(patch_local_storage + patch_membership_storage) /
-            double(1024 * 1024);
-        RXMESH_TRACE("Total storage = {0:f} Mb", m_total_gpu_storage_mb);
-    }
+            const uint32_t global_edge_id = get_edge_id(edge_key);
 
-    // alloc ad_size_ltog and edges_/faces_ad
-    CUDA_ERROR(cudaMalloc((void**)&m_d_ad_size_ltog_v,
-                          sizeof(uint2) * (m_num_patches + 1)));
-    CUDA_ERROR(cudaMalloc((void**)&m_d_ad_size_ltog_e,
-                          sizeof(uint2) * (m_num_patches + 1)));
-    CUDA_ERROR(cudaMalloc((void**)&m_d_ad_size_ltog_f,
-                          sizeof(uint2) * (m_num_patches + 1)));
-    CUDA_ERROR(
-        cudaMalloc((void**)&m_d_ad_size, sizeof(uint4) * (m_num_patches + 1)));
+            uint16_t local_edge_id =
+                find_local_index(global_edge_id,
+                                 m_patcher->get_edge_patch_id(global_edge_id),
+                                 m_h_num_owned_e[patch_id],
+                                 m_h_patches_ltog_e[patch_id]);
 
-    CUDA_ERROR(cudaMalloc((void**)&m_d_owned_size,
-                          sizeof(uint4) * (m_num_patches + 1)));
+            assert(local_edge_id != INVALID16);
+            if (!is_added_edge[local_edge_id]) {
 
+                is_added_edge[local_edge_id] = true;
 
-    // copy the mesh data for each patch
-    for (uint32_t p = 0; p < m_num_patches; ++p) {
-        // m_d_ pointer are linear. The host containers are not but we can
-        // take advantage of pointer arthematic (w/ word offsetting) to get
-        // things work without copyt the host containers in a linear array
-
-        uint32_t start_v = m_h_ad_size_ltog_v[p].x;
-        uint32_t start_e = m_h_ad_size_ltog_e[p].x;
-        uint32_t start_f = m_h_ad_size_ltog_f[p].x;
-        uint32_t start_edges = m_h_ad_size[p].x;
-        uint32_t start_faces = m_h_ad_size[p].z;
-
-        // ltog
-        CUDA_ERROR(cudaMemcpy(m_d_patches_ltog_v + start_v,
-                              m_h_patches_ltog_v[p].data(),
-                              m_h_ad_size_ltog_v[p].y * sizeof(uint32_t),
-                              cudaMemcpyHostToDevice));
+                const uint16_t local_v0 = find_local_index(
+                    edge_key.first,
+                    m_patcher->get_vertex_patch_id(edge_key.first),
+                    m_h_num_owned_v[patch_id],
+                    m_h_patches_ltog_v[patch_id]);
 
-        CUDA_ERROR(cudaMemcpy(m_d_patches_ltog_e + start_e,
-                              m_h_patches_ltog_e[p].data(),
-                              m_h_ad_size_ltog_e[p].y * sizeof(uint32_t),
-                              cudaMemcpyHostToDevice));
+                const uint16_t local_v1 = find_local_index(
+                    edge_key.second,
+                    m_patcher->get_vertex_patch_id(edge_key.second),
+                    m_h_num_owned_v[patch_id],
+                    m_h_patches_ltog_v[patch_id]);
 
-        CUDA_ERROR(cudaMemcpy(m_d_patches_ltog_f + start_f,
-                              m_h_patches_ltog_f[p].data(),
-                              m_h_ad_size_ltog_f[p].y * sizeof(uint32_t),
-                              cudaMemcpyHostToDevice));
+                assert(local_v0 != INVALID16 && local_v1 != INVALID16);
 
-        // patches
-        CUDA_ERROR(cudaMemcpy(m_d_patches_edges + start_edges,
-                              m_h_patches_edges[p].data(),
-                              m_h_ad_size_ltog_e[p].y * 2 * sizeof(uint16_t),
-                              cudaMemcpyHostToDevice));
+                m_h_patches_ev[patch_id][local_edge_id * 2]     = local_v0;
+                m_h_patches_ev[patch_id][local_edge_id * 2 + 1] = local_v1;
+            }
 
-        CUDA_ERROR(cudaMemcpy(
-            m_d_patches_faces + start_faces, m_h_patches_faces[p].data(),
-            m_h_ad_size_ltog_f[p].y * m_face_degree * sizeof(uint16_t),
-            cudaMemcpyHostToDevice));
-    }
+            // shift local_edge_id one bit to the left and store the edge
+            // direction (dir) in the least significant bit
+            local_edge_id = local_edge_id << 1;
+            local_edge_id = local_edge_id | (dir & 1);
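+            // e.g. (illustrative): local edge 5 traversed against its
+            // canonical orientation (dir == 1) is stored as (5 << 1) | 1 = 11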
+            m_h_patches_fe[patch_id][local_face_id * 3 + v] = local_edge_id;
+        }
+    };
 
 
-    // copy ad_size
-    CUDA_ERROR(cudaMemcpy(m_d_ad_size_ltog_v, m_h_ad_size_ltog_v.data(),
-                          sizeof(uint2) * (m_num_patches + 1),
-                          cudaMemcpyHostToDevice));
-    CUDA_ERROR(cudaMemcpy(m_d_ad_size_ltog_e, m_h_ad_size_ltog_e.data(),
-                          sizeof(uint2) * (m_num_patches + 1),
-                          cudaMemcpyHostToDevice));
-    CUDA_ERROR(cudaMemcpy(m_d_ad_size_ltog_f, m_h_ad_size_ltog_f.data(),
-                          sizeof(uint2) * (m_num_patches + 1),
-                          cudaMemcpyHostToDevice));
-    CUDA_ERROR(cudaMemcpy(m_d_ad_size, m_h_ad_size.data(),
-                          sizeof(uint4) * (m_num_patches + 1),
-                          cudaMemcpyHostToDevice));
-    CUDA_ERROR(cudaMemcpy(m_d_owned_size, m_h_owned_size.data(),
-                          sizeof(uint4) * (m_num_patches),
-                          cudaMemcpyHostToDevice));
-
-
-    // allocate and copy face/vertex/edge patch
-    CUDA_ERROR(
-        cudaMalloc((void**)&m_d_face_patch, sizeof(uint32_t) * (m_num_faces)));
-    CUDA_ERROR(
-        cudaMalloc((void**)&m_d_edge_patch, sizeof(uint32_t) * (m_num_edges)));
-    CUDA_ERROR(cudaMalloc((void**)&m_d_vertex_patch,
-                          sizeof(uint32_t) * (m_num_vertices)));
-
-    CUDA_ERROR(
-        cudaMemcpy(m_d_face_patch, this->m_patcher->get_face_patch().data(),
-                   sizeof(uint32_t) * (m_num_faces), cudaMemcpyHostToDevice));
-    CUDA_ERROR(
-        cudaMemcpy(m_d_edge_patch, this->m_patcher->get_edge_patch().data(),
-                   sizeof(uint32_t) * (m_num_edges), cudaMemcpyHostToDevice));
-    CUDA_ERROR(cudaMemcpy(
-        m_d_vertex_patch, this->m_patcher->get_vertex_patch().data(),
-        sizeof(uint32_t) * (m_num_vertices), cudaMemcpyHostToDevice));
-
-    CUDA_ERROR(cudaMalloc((void**)&m_d_patch_distribution_v,
-                          (m_num_patches + 1) * sizeof(uint32_t)));
-    CUDA_ERROR(cudaMalloc((void**)&m_d_patch_distribution_e,
-                          (m_num_patches + 1) * sizeof(uint32_t)));
-    CUDA_ERROR(cudaMalloc((void**)&m_d_patch_distribution_f,
-                          (m_num_patches + 1) * sizeof(uint32_t)));
-    CUDA_ERROR(cudaMemcpy(
-        m_d_patch_distribution_v, m_h_patch_distribution_v.data(),
-        (m_num_patches + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice));
-    CUDA_ERROR(cudaMemcpy(
-        m_d_patch_distribution_e, m_h_patch_distribution_e.data(),
-        (m_num_patches + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice));
-    CUDA_ERROR(cudaMemcpy(
-        m_d_patch_distribution_f, m_h_patch_distribution_f.data(),
-        (m_num_patches + 1) * sizeof(uint32_t), cudaMemcpyHostToDevice));
-
-
-    uint32_t* n_patches = m_patcher->get_neighbour_patches();
-    uint32_t* n_patches_offset = m_patcher->get_neighbour_patches_offset();
-
-    CUDA_ERROR(cudaMalloc((void**)&m_d_neighbour_patches_offset,
-                          m_num_patches * sizeof(uint32_t)));
-    CUDA_ERROR(cudaMemcpy(m_d_neighbour_patches_offset, n_patches_offset,
-                          m_num_patches * sizeof(uint32_t),
-                          cudaMemcpyHostToDevice));
-    if (n_patches) {
-        CUDA_ERROR(
-            cudaMalloc((void**)&m_d_neighbour_patches,
-                       n_patches_offset[m_num_patches - 1] * sizeof(uint32_t)));
-        CUDA_ERROR(
-            cudaMemcpy(m_d_neighbour_patches, n_patches,
-                       n_patches_offset[m_num_patches - 1] * sizeof(uint32_t),
-                       cudaMemcpyHostToDevice));
+    for (uint32_t f = p_start; f < p_end; ++f) {
+        uint32_t face_id = m_patcher->get_patches_val()[f];
+        add_new_face(face_id);
     }
 
-
-    // Allocate and copy the context to the gpu
-    m_rxmesh_context.init(
-        m_num_edges, m_num_faces, m_num_vertices, m_face_degree, m_max_valence,
-        m_max_edge_incident_faces, m_max_face_adjacent_faces, m_num_patches,
-        m_d_face_patch, m_d_edge_patch, m_d_vertex_patch, m_d_patches_ltog_v,
-        m_d_patches_ltog_e, m_d_patches_ltog_f, m_d_ad_size_ltog_v,
-        m_d_ad_size_ltog_e, m_d_ad_size_ltog_f, m_d_patches_edges,
-        m_d_patches_faces, m_d_ad_size, m_d_owned_size, m_max_size,
-        m_d_patch_distribution_v, m_d_patch_distribution_e,
-        m_d_patch_distribution_f, m_d_neighbour_patches,
-        m_d_neighbour_patches_offset);
+    for (uint32_t f = r_start; f < r_end; ++f) {
+        uint32_t face_id = m_patcher->get_external_ribbon_val()[f];
+        add_new_face(face_id);
+    }
 }
 
+uint32_t RXMesh::get_edge_id(const uint32_t v0, const uint32_t v1) const
+{
+    // v0 and v1 are two vertices in global space. We return the edge
+    // id in global space as well (by querying m_edges_map)
+    assert(m_edges_map.size() != 0);
 
-//**************************************************************************
+    std::pair<uint32_t, uint32_t> edge = detail::edge_key(v0, v1);
 
+    assert(edge.first == v0 || edge.first == v1);
+    assert(edge.second == v0 || edge.second == v1);
 
-//********************** Export
-template <uint32_t patchSize>
-void RXMesh<patchSize>::write_connectivity(std::fstream& file) const
-{
-    for (uint32_t p = 0; p < m_num_patches; ++p) {  // for every patch
-        assert(m_h_ad_size[p].w % 3 == 0);
-        uint16_t patch_num_faces = m_h_ad_size[p].w / 3;
-        for (uint32_t f = 0; f < patch_num_faces; ++f) {
-            uint32_t f_global = m_h_patches_ltog_f[p][f] >> 1;
-            if (m_patcher->get_face_patch_id(f_global) != p) {
-                // if it is a ribbon
-                continue;
-            }
+    return get_edge_id(edge);
+}
 
-            file << "f ";
-            for (uint32_t e = 0; e < 3; ++e) {
-                uint16_t edge = m_h_patches_faces[p][3 * f + e];
-                flag_t   dir(0);
-                RXMeshContext::unpack_edge_dir(edge, edge, dir);
-                uint16_t e_id = (2 * edge) + dir;
-                uint16_t v = m_h_patches_edges[p][e_id];
-                file << (m_h_patches_ltog_v[p][v] >> 1) + 1 << " ";
-            }
-            file << std::endl;
-        }
+uint32_t RXMesh::get_edge_id(const std::pair<uint32_t, uint32_t>& edge) const
+{
+    uint32_t edge_id = INVALID32;
+    try {
+        edge_id = m_edges_map.at(edge);
+    } catch (const std::out_of_range&) {
+        RXMESH_ERROR(
+            "rxmesh::get_edge_id() mapping edges went wrong."
+            " Can not find an edge connecting vertices {} and {}",
+            edge.first,
+            edge.second);
+        exit(EXIT_FAILURE);
     }
+
+    return edge_id;
 }
 
-//**************************************************************************
+void RXMesh::build_device()
+{
+    CUDA_ERROR(cudaMalloc((void**)&m_d_patches_info,
+                          m_num_patches * sizeof(PatchInfo)));
+
+    m_h_patches_info = (PatchInfo*)malloc(m_num_patches * sizeof(PatchInfo));
+
+#pragma omp parallel for
+    for (int p = 0; p < static_cast<int>(m_num_patches); ++p) {
+        PatchInfo d_patch;
+        d_patch.num_faces          = m_h_patches_ltog_f[p].size();
+        d_patch.num_edges          = m_h_patches_ltog_e[p].size();
+        d_patch.num_vertices       = m_h_patches_ltog_v[p].size();
+        d_patch.num_owned_faces    = m_h_num_owned_f[p];
+        d_patch.num_owned_edges    = m_h_num_owned_e[p];
+        d_patch.num_owned_vertices = m_h_num_owned_v[p];
+        d_patch.patch_id           = p;
+
+        m_h_patches_info[p].num_faces          = m_h_patches_ltog_f[p].size();
+        m_h_patches_info[p].num_edges          = m_h_patches_ltog_e[p].size();
+        m_h_patches_info[p].num_vertices       = m_h_patches_ltog_v[p].size();
+        m_h_patches_info[p].num_owned_faces    = m_h_num_owned_f[p];
+        m_h_patches_info[p].num_owned_edges    = m_h_num_owned_e[p];
+        m_h_patches_info[p].num_owned_vertices = m_h_num_owned_v[p];
+        m_h_patches_info[p].patch_id           = p;
+
+
+        // allocate and copy patch topology to the device
+        CUDA_ERROR(cudaMalloc((void**)&d_patch.ev,
+                              d_patch.num_edges * 2 * sizeof(LocalVertexT)));
+        CUDA_ERROR(cudaMemcpy(d_patch.ev,
+                              m_h_patches_ev[p].data(),
+                              d_patch.num_edges * 2 * sizeof(LocalVertexT),
+                              cudaMemcpyHostToDevice));
+        m_h_patches_info[p].ev =
+            reinterpret_cast<LocalVertexT*>(m_h_patches_ev[p].data());
+
+        CUDA_ERROR(cudaMalloc((void**)&d_patch.fe,
+                              d_patch.num_faces * 3 * sizeof(LocalEdgeT)));
+        CUDA_ERROR(cudaMemcpy(d_patch.fe,
+                              m_h_patches_fe[p].data(),
+                              d_patch.num_faces * 3 * sizeof(LocalEdgeT),
+                              cudaMemcpyHostToDevice));
+        m_h_patches_info[p].fe =
+            reinterpret_cast<LocalEdgeT*>(m_h_patches_fe[p].data());
+
+        // copy not-owned mesh elements to device
+
+        auto populate_not_owned =
+            [p](const std::vector<std::vector<uint32_t>>& ltog,
+                const std::vector<uint32_t>&              element_patch,
+                const std::vector<uint16_t>&              num_owned,
+                auto*&                                    d_not_owned_id,
+                uint32_t*&                                d_not_owned_patch,
+                auto*&                                    h_not_owned_id,
+                uint32_t*&                                h_not_owned_patch) {
+                using LocalT = typename std::remove_reference<decltype(
+                    *d_not_owned_id)>::type;
+
+                const uint16_t num_not_owned = ltog[p].size() - num_owned[p];
+
+                h_not_owned_id =
+                    (LocalT*)malloc(num_not_owned * sizeof(LocalT));
+                h_not_owned_patch =
+                    (uint32_t*)malloc(num_not_owned * sizeof(uint32_t));
+
+                for (uint16_t i = 0; i < num_not_owned; ++i) {
+                    uint16_t local_id     = i + num_owned[p];
+                    uint32_t global_id    = ltog[p][local_id];
+                    uint32_t owning_patch = element_patch[global_id];
+                    h_not_owned_patch[i]  = owning_patch;
+
+                    auto it = std::lower_bound(
+                        ltog[owning_patch].begin(),
+                        ltog[owning_patch].begin() + num_owned[owning_patch],
+                        global_id);
+
+                    if (it ==
+                        ltog[owning_patch].begin() + num_owned[owning_patch]) {
+                        RXMESH_ERROR(
+                            "rxmesh::build_device can not find the local id of "
+                            "{} in patch {}. Maybe this patch does not own "
+                            "this mesh element.",
+                            global_id,
+                            owning_patch);
+                    } else {
+                        h_not_owned_id[i].id = static_cast<uint16_t>(
+                            it - ltog[owning_patch].begin());
+                    }
+                }
+
+                // Copy to device
+                CUDA_ERROR(cudaMalloc((void**)&d_not_owned_id,
+                                      sizeof(LocalT) * num_not_owned));
+                CUDA_ERROR(cudaMemcpy(d_not_owned_id,
+                                      h_not_owned_id,
+                                      sizeof(LocalT) * num_not_owned,
+                                      cudaMemcpyHostToDevice));
+
+                CUDA_ERROR(cudaMalloc((void**)&d_not_owned_patch,
+                                      sizeof(uint32_t) * num_not_owned));
+                CUDA_ERROR(cudaMemcpy(d_not_owned_patch,
+                                      h_not_owned_patch,
+                                      sizeof(uint32_t) * num_not_owned,
+                                      cudaMemcpyHostToDevice));
+            };
+
+
+        populate_not_owned(m_h_patches_ltog_f,
+                           m_patcher->get_face_patch(),
+                           m_h_num_owned_f,
+                           d_patch.not_owned_id_f,
+                           d_patch.not_owned_patch_f,
+                           m_h_patches_info[p].not_owned_id_f,
+                           m_h_patches_info[p].not_owned_patch_f);
+
+        populate_not_owned(m_h_patches_ltog_e,
+                           m_patcher->get_edge_patch(),
+                           m_h_num_owned_e,
+                           d_patch.not_owned_id_e,
+                           d_patch.not_owned_patch_e,
+                           m_h_patches_info[p].not_owned_id_e,
+                           m_h_patches_info[p].not_owned_patch_e);
+
+        populate_not_owned(m_h_patches_ltog_v,
+                           m_patcher->get_vertex_patch(),
+                           m_h_num_owned_v,
+                           d_patch.not_owned_id_v,
+                           d_patch.not_owned_patch_v,
+                           m_h_patches_info[p].not_owned_id_v,
+                           m_h_patches_info[p].not_owned_patch_v);
+
+        CUDA_ERROR(cudaMemcpy(m_d_patches_info + p,
+                              &d_patch,
+                              sizeof(PatchInfo),
+                              cudaMemcpyHostToDevice));
+    }
+}
 
-template class RXMesh<PATCH_SIZE>;
-}  // namespace RXMESH
+}  // namespace rxmesh
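// [Editor's sketch, not part of this patch] The populate_not_owned lambda in
// build_device() maps a not-owned element's global id to its local id inside
// the owning patch by binary-searching the owned prefix of that patch's
// local-to-global table. A minimal standalone version of that step, assuming
// (as the lambda does) that the owned prefix of ltog is sorted by global id:
#include <algorithm>
#include <cstdint>
#include <vector>

// Returns the local id of global_id inside the owning patch, or -1 if the
// owned prefix does not contain it.
inline int local_id_in_owner(const std::vector<uint32_t>& ltog_owner,
                             uint16_t                     num_owned,
                             uint32_t                     global_id)
{
    auto first = ltog_owner.begin();
    auto last  = ltog_owner.begin() + num_owned;
    auto it    = std::lower_bound(first, last, global_id);
    if (it == last || *it != global_id) {
        return -1;  // the candidate patch does not own this element
    }
    return static_cast<int>(it - first);
}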
diff --git a/include/rxmesh/rxmesh.h b/include/rxmesh/rxmesh.h
index 8d1d42b5..cbf10331 100644
--- a/include/rxmesh/rxmesh.h
+++ b/include/rxmesh/rxmesh.h
@@ -1,166 +1,129 @@
 #pragma once
-
-#include <fstream>
 #include <memory>
 #include <unordered_map>
 #include <vector>
+#include "rxmesh/context.h"
+#include "rxmesh/patch_info.h"
 #include "rxmesh/patcher/patcher.h"
-#include "rxmesh/rxmesh_context.h"
+#include "rxmesh/types.h"
 #include "rxmesh/util/log.h"
 #include "rxmesh/util/macros.h"
+#include "rxmesh/util/util.h"
 
 class RXMeshTest;
 
-namespace RXMESH {
-using coordT = float;
-
-// This class is responsible for building the data structure of representing
-// the mesh a matrix (small sub-matrices). It should/can not be instantiated.
-// In order to use it, use RXMeshStatic
-
-enum class Op
-{
-    // Vertex
-    VV = 0,
-    VE = 1,
-    VF = 2,
-
-    // Face
-    FV = 3,
-    FE = 4,
-    FF = 5,
-
-    // Edge
-    EV = 6,
-    EE = 7,
-    EF = 8,
-};
-
-inline std::string op_to_string(const Op& op)
-{
-    switch (op) {
-        case RXMESH::Op::VV:
-            return "VV";
-        case RXMESH::Op::VE:
-            return "VE";
-        case RXMESH::Op::VF:
-            return "VF";
-        case RXMESH::Op::FV:
-            return "FV";
-        case RXMESH::Op::FE:
-            return "FE";
-        case RXMESH::Op::FF:
-            return "FF";
-        case RXMESH::Op::EV:
-            return "EV";
-        case RXMESH::Op::EF:
-            return "EF";
-        case RXMESH::Op::EE:
-            return "EE";
-        default:
-            return "";
-    }
-}
-
-enum class ELEMENT
-{
-    VERTEX = 0,
-    EDGE = 1,
-    FACE = 2
-};
+namespace rxmesh {
 
-template <uint32_t patchSize = PATCH_SIZE>
+/**
+ * @brief The main class for creating the RXMesh data structure. It takes an
+ * input mesh on the host, computes the patches, and creates the data
+ * structure on the GPU. It is not meant to be used directly by the user; use
+ * RXMeshStatic instead.
+ */
 class RXMesh
 {
    public:
-    // Exporter
-    template <typename VertT>
-    void exportOBJ(const std::string& filename, VertT getCoords)
-    {
-        std::string  fn = STRINGIFY(OUTPUT_DIR) + filename;
-        std::fstream file(fn, std::ios::out);
-        file.precision(30);
-
-        // write vertices
-        for (uint32_t v = 0; v < m_num_vertices; ++v) {
-            uint32_t v_id = v;
-
-            file << "v  ";
-            for (uint32_t i = 0; i < 3; ++i) {
-                file << getCoords(v_id, i) << "  ";
-            }
-            file << std::endl;
-        }
-        // write connectivity
-        write_connectivity(file);
-        file.close();
-    }
-
-
-    // getter
+    /**
+     * @brief Total number of vertices in the mesh
+     */
     uint32_t get_num_vertices() const
     {
         return m_num_vertices;
     }
+
+    /**
+     * @brief Total number of edges in the mesh
+     */
     uint32_t get_num_edges() const
     {
         return m_num_edges;
     }
+
+    /**
+     * @brief Total number of faces in the mesh
+     */
     uint32_t get_num_faces() const
     {
         return m_num_faces;
     }
 
+    /**
+     * @brief Maximum valence in the input mesh
+     */
     uint32_t get_max_valence() const
     {
         return m_max_valence;
     }
 
+    /**
+     * @brief Maximum number of incident faces to an edge in the input mesh
+     */
     uint32_t get_max_edge_incident_faces() const
     {
         return m_max_edge_incident_faces;
     }
 
-    uint32_t get_max_edge_adjacent_faces() const
+    /**
+     * @brief Maximum number of adjacent faces to a face in the input mesh
+     */
+    uint32_t get_max_face_adjacent_faces() const
     {
         return m_max_face_adjacent_faces;
     }
-    uint32_t get_face_degree() const
-    {
-        return m_face_degree;
-    }
 
-    const RXMeshContext& get_context() const
+    /**
+     * @brief Return a context that stores various information about the mesh on
+     * the GPU
+     */
+    const Context& get_context() const
     {
         return m_rxmesh_context;
     }
 
+    /**
+     * @brief returns true if the input mesh is edge manifold
+     */
     bool is_edge_manifold() const
     {
         return m_is_input_edge_manifold;
     }
 
+    /**
+     * @brief returns true if the input mesh is closed
+     */
     bool is_closed() const
     {
         return m_is_input_closed;
     }
 
+    /**
+     * @brief returns the patch size used when partitioning the input mesh
+     */
     uint32_t get_patch_size() const
     {
-        return patchSize;
+        return m_patch_size;
     }
 
+    /**
+     * @brief Total number of patches of the input mesh
+     */
     uint32_t get_num_patches() const
     {
         return m_num_patches;
     }
 
+    /**
+     * @brief Returns the number of disconnected components the input mesh is
+     * composed of
+     */
     uint32_t get_num_components() const
     {
         return m_patcher->get_num_components();
     }
 
-
+    /**
+     * @brief Return the max, min, and average patch size of the input mesh
+     */
     void get_max_min_avg_patch_size(uint32_t& min_p,
                                     uint32_t& max_p,
                                     uint32_t& avg_p) const
@@ -168,267 +131,165 @@ class RXMesh
         return m_patcher->get_max_min_avg_patch_size(min_p, max_p, avg_p);
     }
 
+    /**
+     * @brief Return (approximate) overhead due to ribbons
+     */
     double get_ribbon_overhead() const
     {
         return m_patcher->get_ribbon_overhead();
     }
 
+    /**
+     * @brief Maximum number of vertices in a patch
+     */
     uint32_t get_per_patch_max_vertices() const
     {
         return m_max_vertices_per_patch;
     }
 
+    /**
+     * @brief Maximum number of edges in a patch
+     */
     uint32_t get_per_patch_max_edges() const
     {
         return m_max_edges_per_patch;
     }
 
+    /**
+     * @brief Maximum number of faces in a patch
+     */
     uint32_t get_per_patch_max_faces() const
     {
         return m_max_faces_per_patch;
     }
 
-    uint32_t get_per_patch_max_owned_vertices() const
-    {
-        return m_max_owned_vertices_per_patch;
-    }
-
-    uint32_t get_per_patch_max_owned_edges() const
-    {
-        return m_max_owned_edges_per_patch;
-    }
-
-    uint32_t get_per_patch_max_owned_faces() const
-    {
-        return m_max_owned_faces_per_patch;
-    }
-
+    /**
+     * @brief The time used to construct the patches on the GPU
+     */
     float get_patching_time() const
     {
         return m_patcher->get_patching_time();
     }
 
+    /**
+     * @brief The number of Lloyd iterations run to partition the mesh into
+     * patches
+     */
     uint32_t get_num_lloyd_run() const
     {
         return m_patcher->get_num_lloyd_run();
     }
 
+    /**
+     * @brief Return the edge id given two vertices. Edges are undirected.
+     * @param v0 first input vertex
+     * @param v1 second input vertex
+     * @return the id of the edge connecting v0-v1 (same as the edge id of v1-v0)
+     */
     uint32_t get_edge_id(const uint32_t v0, const uint32_t v1) const;
 
-    double get_gpu_storage_mb() const
-    {
-        return m_total_gpu_storage_mb;
-    }
-
-    const std::unique_ptr<PATCHER::Patcher>& get_patcher() const
-    {
-        return m_patcher;
-    };
-
    protected:
     virtual ~RXMesh();
 
-    RXMeshContext m_rxmesh_context;
-
     RXMesh(const RXMesh&) = delete;
 
-    virtual void write_connectivity(std::fstream& file) const;
-
-    // build everything from scratch including patches (use this)
-    RXMesh(std::vector<std::vector<uint32_t>>& fv,
-           std::vector<std::vector<coordT>>&   coordinates,
-           const bool                          sort = false,
-           const bool                          quite = true);
+    RXMesh(const std::vector<std::vector<uint32_t>>& fv,
+           const bool                                quite = false);
+
+    /**
+     * @brief build different supporting data structures used to build RXMesh
+     *
+     * Set the number of vertices, edges, and faces, populate the edge map
+     * (which takes two connected vertices and returns their edge id), and
+     * build the face-adjacent-faces structure (used in creating patches).
+     * This is done using a single pass over FV
+     *
+     * @param fv input face incident vertices
+     * @param ef output edge incident faces
+     * @param ff_offset, ff_values output face adjacent faces (CSR layout)
+     */
+    void build_supporting_structures(
+        const std::vector<std::vector<uint32_t>>& fv,
+        std::vector<std::vector<uint32_t>>&       ef,
+        std::vector<uint32_t>&                    ff_offset,
+        std::vector<uint32_t>&                    ff_values);
+
+    /**
+     * @brief Calculate various statistics for the input mesh
+     *
+     * Calculate max valence, max edge incident faces, max face adjacent faces,
+     * whether the input is closed, whether it is edge manifold, and the max
+     * number of vertices/edges/faces per patch
+     *
+     * @param fv input face incident vertices
+     * @param ef input edge incident faces
+     */
+    void calc_statistics(const std::vector<std::vector<uint32_t>>& fv,
+                         const std::vector<std::vector<uint32_t>>& ef);
+
+    void calc_max_not_owned_elements();
+
+    void build(const std::vector<std::vector<uint32_t>>& fv);
+    void build_single_patch(const std::vector<std::vector<uint32_t>>& fv,
+                            const uint32_t                            patch_id);
+
+    void build_single_patch_ltog(const std::vector<std::vector<uint32_t>>& fv,
+                                 const uint32_t patch_id);
+
+    void build_single_patch_topology(
+        const std::vector<std::vector<uint32_t>>& fv,
+        const uint32_t                            patch_id);
+
+
+    void build_device();
 
     uint32_t get_edge_id(const std::pair<uint32_t, uint32_t>& edge) const;
 
-    void     build_local(std::vector<std::vector<uint32_t>>& fv,
-                         std::vector<std::vector<coordT>>&   coordinates);
-    void     build_patch_locally(const uint32_t patch_id);
-    void     populate_edge_map(const std::vector<std::vector<uint32_t>>& fv);
-    uint16_t create_new_local_face(const uint32_t               patch_id,
-                                   const uint32_t               global_f,
-                                   const std::vector<uint32_t>& fv,
-                                   uint16_t&                    faces_count,
-                                   uint16_t&      edges_owned_count,
-                                   uint16_t&      edges_not_owned_count,
-                                   uint16_t&      vertices_owned_count,
-                                   uint16_t&      vertices_not_owned_count,
-                                   const uint16_t num_edges_owned,
-                                   const uint16_t num_vertices_owned,
-                                   std::vector<uint32_t>& f_ltog,
-                                   std::vector<uint32_t>& e_ltog,
-                                   std::vector<uint32_t>& v_ltog,
-                                   std::vector<uint16_t>& fp,
-                                   std::vector<uint16_t>& ep);
-    void     set_num_vertices(const std::vector<std::vector<uint32_t>>& fv);
-    void     edge_incident_faces(const std::vector<std::vector<uint32_t>>& fv,
-                                 std::vector<std::vector<uint32_t>>&       ef);
-
-    inline std::pair<uint32_t, uint32_t> edge_key(const uint32_t v0,
-                                                  const uint32_t v1) const
-    {
-        uint32_t i = std::max(v0, v1);
-        uint32_t j = std::min(v0, v1);
-        return std::make_pair(i, j);
-    }
-
-    template <typename pt_T>
-    void host_malloc(pt_T*& arr, uint32_t count)
-    {
-        arr = (pt_T*)malloc(count * sizeof(pt_T));
-        if (arr == NULL) {
-            RXMESH_ERROR(
-                "RXMesh::host_malloc() malloc failed with count = {} and total "
-                "size = {}",
-                count, count * sizeof(pt_T));
-        }
-    }
-
-    void device_alloc_local();
-
-    template <typename Tin, typename Tst>
-    void get_starting_ids(const std::vector<std::vector<Tin>>& input,
-                          std::vector<Tst>&                    starting_id);
-
-    template <typename T>
-    void padding_to_multiple(std::vector<std::vector<T>>& input,
-                             const uint32_t               multiple,
-                             const T                      init_val);
-
-    template <typename Tin, typename Tad>
-    void get_size(const std::vector<std::vector<Tin>>& input,
-                  std::vector<Tad>&                    ad);
-
-    void sort(std::vector<std::vector<uint32_t>>& fv,
-              std::vector<std::vector<coordT>>&   coordinates);
-
-
-    // www.techiedelight.com/use-std-pair-key-std-unordered_map-cpp/
-    struct edge_key_hash
-    {
-        template <class T>
-        inline std::size_t operator()(const std::pair<T, T>& e_key) const
-        {
-            return std::hash<T>()(e_key.first * 8191 + e_key.second * 11003);
-        }
-    };
-
-    // variables
 
     // our friend tester class
     friend class ::RXMeshTest;
 
-    // var
-    uint32_t m_num_edges, m_num_faces, m_num_vertices, m_max_ele_count,
-        m_max_valence, m_max_valence_vertex_id, m_max_edge_incident_faces,
-        m_max_face_adjacent_faces;
-    const uint32_t m_face_degree;
+    Context m_rxmesh_context;
 
-    // patches
-    uint32_t m_num_patches;
+    uint32_t m_num_edges, m_num_faces, m_num_vertices, m_max_valence,
+        m_max_edge_incident_faces, m_max_face_adjacent_faces;
 
-    bool m_is_input_edge_manifold;
-    bool m_is_input_closed;
-    bool m_is_sort;
-    bool m_quite;
+    uint32_t m_max_vertices_per_patch, m_max_edges_per_patch,
+        m_max_faces_per_patch;
 
-    std::unordered_map<std::pair<uint32_t, uint32_t>, uint32_t, edge_key_hash>
-        m_edges_map;
+    uint32_t m_max_not_owned_vertices, m_max_not_owned_edges,
+        m_max_not_owned_faces;
 
-    // store a copy of face incident vertices along with the neighbor
-    // faces of that face
-    std::vector<std::vector<uint32_t>> m_fvn;
+    uint32_t       m_num_patches;
+    const uint32_t m_patch_size;
+    bool           m_is_input_edge_manifold;
+    bool           m_is_input_closed;
+    bool           m_quite;
+
+    // Edge hash map that takes two vertices and returns their edge id
+    std::unordered_map<std::pair<uint32_t, uint32_t>,
+                       uint32_t,
+                       detail::edge_key_hash>
+        m_edges_map;
 
     // pointer to the patcher class responsible for everything related to
     // patching the mesh into small pieces
-    std::unique_ptr<PATCHER::Patcher> m_patcher;
+    std::unique_ptr<patcher::Patcher> m_patcher;
 
+    //** main incident relations
+    std::vector<std::vector<uint16_t>> m_h_patches_ev;
+    std::vector<std::vector<uint16_t>> m_h_patches_fe;
 
-    //*************** Patch sub-matrices
+    // the number of owned mesh elements per patch
+    std::vector<uint16_t> m_h_num_owned_f, m_h_num_owned_e, m_h_num_owned_v;
 
-    //****** Host
-    uint32_t m_max_vertices_per_patch, m_max_edges_per_patch,
-        m_max_faces_per_patch;
-    uint32_t m_max_owned_vertices_per_patch, m_max_owned_edges_per_patch,
-        m_max_owned_faces_per_patch;
-    //** main incident relations
-    std::vector<std::vector<uint16_t>> m_h_patches_edges;
-    std::vector<std::vector<uint16_t>> m_h_patches_faces;
-    //.x edge address
-    //.y edge size
-    //.z face address
-    //.w face size
-    std::vector<uint4> m_h_ad_size;
-
-    // the size of owned mesh elements per patch
-    //.x faces
-    //.y edges
-    //.z vertex
-    std::vector<uint4> m_h_owned_size;
-
-    uint2 m_max_size;  // max number of edges(*2) and faces(*face_degree)
-                       // in a patch
-                       // this counts the size of edges and faces arrays
-                       // rounded up to multiple of 32
-
-    //** mappings
+    // mappings
     // local to global map for (v)ertices (e)dges and (f)aces
     std::vector<std::vector<uint32_t>> m_h_patches_ltog_v;
     std::vector<std::vector<uint32_t>> m_h_patches_ltog_e;
     std::vector<std::vector<uint32_t>> m_h_patches_ltog_f;
 
-    // storing the start id(x) and element count(y)
-    std::vector<uint2> m_h_ad_size_ltog_v, m_h_ad_size_ltog_e,
-        m_h_ad_size_ltog_f;
-
-
-    //****** Device
-    // Each device pointer points to a long array that holds specific data
-    // separated by patch id
-    //       ____________ _____________ ____________
-    //      |____________|_____________|____________|
-    //           ^^            ^^            ^^
-    //      patch 1 data  patch 2 data   patch 3 data
-
-    // We store the starting id and the size of mesh elements for each patch
-    // in m_d_ad_size_ltog_MESHELE (ad for address) where MESHELE could be
-    // v,e, or f. This is for the mapping pointers
-    // For incidence pointers, we only need store the starting id
 
-    //** face/vertex/edge patch (indexed by in global space)
-    uint32_t *m_d_face_patch, *m_d_vertex_patch, *m_d_edge_patch;
-
-    //** mapping
-    uint32_t *m_d_patches_ltog_v, *m_d_patches_ltog_e, *m_d_patches_ltog_f;
-    uint2 *   m_d_ad_size_ltog_v, *m_d_ad_size_ltog_e, *m_d_ad_size_ltog_f;
-
-    //** incidence
-    uint16_t *m_d_patches_edges, *m_d_patches_faces;
-
-    //*** Scanned histogram of the number of mesh elements per patch
-    std::vector<uint32_t> m_h_patch_distribution_v, m_h_patch_distribution_e,
-        m_h_patch_distribution_f;
-    uint32_t *m_d_patch_distribution_v, *m_d_patch_distribution_e,
-        *m_d_patch_distribution_f;
-
-    //.x edge address
-    //.y edge size
-    //.z face address
-    //.w face size
-    uint4* m_d_ad_size;
-
-    // the size of owned mesh elements per patch
-    //.x faces
-    //.y edges
-    //.z vertex
-    uint4* m_d_owned_size;
-
-    // neighbour patches
-    uint32_t *m_d_neighbour_patches, *m_d_neighbour_patches_offset;
-
-    double m_total_gpu_storage_mb;
+    PatchInfo *m_d_patches_info, *m_h_patches_info;
 };
-
-extern template class RXMesh<PATCH_SIZE>;
-}  // namespace RXMESH
+}  // namespace rxmesh
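// [Editor's sketch, not part of this patch] m_edges_map now relies on
// detail::edge_key_hash, which lives outside this header. Its shape can be
// inferred from the edge_key_hash struct removed above; a self-contained
// equivalent (pair_edge_hash is a hypothetical name) would be:
#include <cstddef>
#include <cstdint>
#include <functional>
#include <utility>

struct pair_edge_hash
{
    std::size_t operator()(const std::pair<uint32_t, uint32_t>& e) const
    {
        // combine the two vertex ids before hashing, matching the removed
        // edge_key_hash (8191 / 11003 multipliers)
        return std::hash<uint32_t>()(e.first * 8191u + e.second * 11003u);
    }
};
// usage sketch:
// std::unordered_map<std::pair<uint32_t, uint32_t>, uint32_t, pair_edge_hash> map;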
diff --git a/include/rxmesh/rxmesh_attribute.h b/include/rxmesh/rxmesh_attribute.h
deleted file mode 100644
index ebfff216..00000000
--- a/include/rxmesh/rxmesh_attribute.h
+++ /dev/null
@@ -1,866 +0,0 @@
-#pragma once
-
-#include <assert.h>
-#include "rxmesh/kernels/collective.cuh"
-#include "rxmesh/kernels/rxmesh_attribute.cuh"
-#include "rxmesh/kernels/util.cuh"
-#include "rxmesh/util/util.h"
-#include "rxmesh/util/vector.h"
-
-namespace RXMESH {
-
-// Flags for where the attributes array resides
-using locationT = uint32_t;
-enum : locationT
-{
-    LOCATION_NONE = 0x00,
-    HOST = 0x01,
-    DEVICE = 0x02,
-    LOCATION_ALL = 0x0F,
-};
-
-// The memory layout
-using layoutT = uint32_t;
-enum : layoutT
-{
-    AoS = 0x00,
-    SoA = 0x01,
-};
-
-// Reduce ops
-using reduceOpT = uint32_t;
-enum : reduceOpT
-{
-    SUM = 0x00,
-    MAX = 0x01,
-    MIN = 0X02,
-    NORM2 = 0X04,  // L2 norm squared
-    DOT = 0x08,    // dot product
-
-};
-
-static std::string location_to_string(locationT target)
-{
-    std::string str = "";
-    if ((target & HOST) == HOST) {
-        str = (str == "" ? "" : " ") + std::string("HOST");
-    }
-    if ((target & DEVICE) == DEVICE) {
-        str = (str == "" ? "" : " ") + std::string("DEVICE");
-    }
-    if (str == "") {
-        str = "NONE";
-    }
-    return str;
-}
-
-template <class T>
-class RXMeshAttribute
-{
-    // Here we manage the attributes on top of the mesh. An attributes is
-    // attached to mesh element (e.g., vertices, edges, or faces). The user
-    // is expected to declare as many attributes as expected to be used
-    // during the lifetime of RXMesh
-
-    // largely inspired by
-    // https://github.com/gunrock/gunrock/blob/master/gunrock/util/array_utils.cuh
-
-
-   public:
-    //********************** Constructors/Destructor
-    RXMeshAttribute()
-        : m_name(nullptr), m_num_mesh_elements(0),
-          m_num_attribute_per_element(0), m_allocated(LOCATION_NONE),
-          m_h_attr(nullptr), m_d_attr(nullptr), m_layout(AoS),
-          d_axpy_alpha(nullptr), d_axpy_beta(nullptr),
-          m_is_axpy_allocated(false), m_is_reduce_allocated(false),
-          m_reduce_temp_storage_bytes(0), m_d_reduce_temp_storage(nullptr),
-          m_d_reduce_output(nullptr), m_reduce_streams(nullptr),
-          m_norm2_temp_buffer(nullptr)
-    {
-
-        this->m_name = (char*)malloc(sizeof(char) * 1);
-        this->m_name[0] = '\0';
-        allocate(0, LOCATION_NONE);
-        m_pitch.x = 0;
-        m_pitch.y = 0;
-    }
-
-    RXMeshAttribute(const char* const name)
-        : m_name(nullptr), m_num_mesh_elements(0),
-          m_num_attribute_per_element(0), m_allocated(LOCATION_NONE),
-          m_h_attr(nullptr), m_d_attr(nullptr), m_layout(AoS),
-          d_axpy_alpha(nullptr), d_axpy_beta(nullptr),
-          m_is_axpy_allocated(false), m_is_reduce_allocated(false),
-          m_reduce_temp_storage_bytes(0)
-    {
-
-        if (name != nullptr) {
-            this->m_name = (char*)malloc(sizeof(char) * (strlen(name) + 1));
-            strcpy(this->m_name, name);
-        }
-        allocate(0, LOCATION_NONE);
-        m_pitch.x = 0;
-        m_pitch.y = 0;
-    }
-
-    //*********************************************************************
-
-
-    //********************** Setter/Getter
-    void set_name(std::string name)
-    {
-        free(this->m_name);
-        this->m_name = (char*)malloc(sizeof(char) * name.length() + 1);
-        strcpy(this->m_name, name.c_str());
-    }
-
-    __host__ __device__ __forceinline__ uint32_t get_num_mesh_elements() const
-    {
-        return this->m_num_mesh_elements;
-    }
-
-    __host__ __device__ __forceinline__ uint32_t
-    get_num_attribute_per_element() const
-    {
-        return this->m_num_attribute_per_element;
-    }
-
-    __host__ __device__ __forceinline__ locationT get_allocated() const
-    {
-        return this->m_allocated;
-    }
-
-    __host__ __device__ __forceinline__ bool is_device_allocated() const
-    {
-        return ((m_allocated & DEVICE) == DEVICE);
-    }
-
-    __host__ __device__ __forceinline__ bool is_host_allocated() const
-    {
-        return ((m_allocated & HOST) == HOST);
-    }
-
-    __host__ __device__ __forceinline__ T* get_pointer(locationT target) const
-    {
-
-        if (target == DEVICE) {
-            return m_d_attr;
-        }
-        if (target == HOST) {
-            return m_h_attr;
-        }
-        return nullptr;
-    }
-
-    void reset(const T value, locationT target, cudaStream_t stream = NULL)
-    {
-
-        if ((target & DEVICE) == DEVICE) {
-
-            assert((m_allocated & DEVICE) == DEVICE);
-
-            const int      threads = 256;
-            const uint32_t total =
-                m_num_attribute_per_element * m_num_mesh_elements;
-            memset<T><<<(total + threads - 1) / threads, threads, 0, stream>>>(
-                m_d_attr, value, total);
-            CUDA_ERROR(cudaDeviceSynchronize());
-            CUDA_ERROR(cudaGetLastError());
-        }
-
-
-        if ((target & HOST) == HOST) {
-            assert((m_allocated & HOST) == HOST);
-            for (uint32_t i = 0;
-                 i < m_num_mesh_elements * m_num_attribute_per_element; ++i) {
-                m_h_attr[i] = value;
-            }
-        }
-    }
-    //*********************************************************************
-
-
-    //********************** Memory Manipulation
-    void init(uint32_t   num_elements,
-              uint32_t   num_attributes_per_elements,
-              locationT  target = DEVICE,
-              layoutT    layout = AoS,
-              const bool with_axpy_alloc = true,
-              const bool with_reduce_alloc = true)
-    {
-        release();
-        m_allocated = LOCATION_NONE;
-        this->m_num_mesh_elements = num_elements;
-        this->m_num_attribute_per_element = num_attributes_per_elements;
-        if (num_elements == 0) {
-            return;
-        }
-        allocate(num_elements, target);
-        m_layout = layout;
-        set_pitch();
-
-        if (!m_is_axpy_allocated && with_axpy_alloc) {
-            CUDA_ERROR(cudaMalloc((void**)&d_axpy_alpha,
-                                  m_num_attribute_per_element * sizeof(T)));
-            CUDA_ERROR(cudaMalloc((void**)&d_axpy_beta,
-                                  m_num_attribute_per_element * sizeof(T)));
-            m_is_axpy_allocated = true;
-        }
-
-        if (!m_is_reduce_allocated && with_reduce_alloc) {
-            // Reduce operations are either SUM, MIN, MAX, or NORM2
-            // NORM2 produce is done in two passes, the first pass uses cub
-            // device API to multiply the input and then store in a temp buffer
-            // (every CUDA block outputs a single value) which then is used for
-            // the second pass using cub host API The other three operations
-            // uses only cub host API. cub host API requires temp buffer which
-            // is taken as the max of what NORM2 requires and the other three
-            // operations.
-
-            // NORM2 temp buffer (to store the per-block output)
-            uint32_t num_blocks = DIVIDE_UP(m_num_mesh_elements, m_block_size);
-            m_norm2_temp_buffer =
-                (T**)malloc(sizeof(T*) * m_num_attribute_per_element);
-            if (!m_norm2_temp_buffer) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::init() could not allocate "
-                    "m_norm2_temp_buffer.");
-            }
-            for (uint32_t i = 0; i < m_num_attribute_per_element; ++i) {
-                CUDA_ERROR(cudaMalloc(&m_norm2_temp_buffer[i],
-                                      sizeof(T) * num_blocks));
-            }
-
-            m_d_reduce_output =
-                (T**)malloc(sizeof(T*) * m_num_attribute_per_element);
-            if (!m_d_reduce_output) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::init() could not allocate "
-                    "m_d_reduce_output.");
-            }
-            m_d_reduce_temp_storage =
-                (void**)malloc(sizeof(void*) * m_num_attribute_per_element);
-            if (!m_d_reduce_temp_storage) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::init() could not allocate "
-                    "m_d_reduce_temp_storage.");
-            }
-            m_reduce_streams = (cudaStream_t*)malloc(
-                sizeof(cudaStream_t) * m_num_attribute_per_element);
-            if (!m_d_reduce_output) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::init() could not allocate "
-                    "m_reduce_streams.");
-            }
-            {  // get the num bytes for cub device-wide reduce
-                size_t norm2_temp_bytes(0), other_reduce_temp_bytes(0);
-                T*     d_out(NULL);
-                m_d_reduce_temp_storage[0] = NULL;
-                cub::DeviceReduce::Sum(m_d_reduce_temp_storage[0],
-                                       norm2_temp_bytes, m_d_attr, d_out,
-                                       num_blocks);
-                cub::DeviceReduce::Sum(m_d_reduce_temp_storage[0],
-                                       other_reduce_temp_bytes, m_d_attr, d_out,
-                                       m_num_mesh_elements);
-                m_reduce_temp_storage_bytes =
-                    std::max(norm2_temp_bytes, other_reduce_temp_bytes);
-            }
-
-            for (uint32_t i = 0; i < m_num_attribute_per_element; ++i) {
-                CUDA_ERROR(cudaMalloc(&m_d_reduce_temp_storage[i],
-                                      m_reduce_temp_storage_bytes));
-                CUDA_ERROR(cudaMalloc(&m_d_reduce_output[i], sizeof(T)));
-                CUDA_ERROR(cudaStreamCreate(&m_reduce_streams[i]));
-            }
-        }
-    }
-
-    void allocate(uint32_t num_mesh_elements, locationT target = DEVICE)
-    {
-
-        if ((target & HOST) == HOST) {
-            release(HOST);
-            if (num_mesh_elements != 0) {
-                m_h_attr = (T*)malloc(sizeof(T) * num_mesh_elements *
-                                      m_num_attribute_per_element);
-                if (!m_h_attr) {
-                    RXMESH_ERROR(
-                        " RXMeshAttribute::allocate() allocation on {} failed "
-                        "with #mesh_elemnts = {} and #attributes per element = "
-                        "{}" +
-                            location_to_string(HOST),
-                        num_mesh_elements, m_num_attribute_per_element);
-                }
-            }
-            m_allocated = m_allocated | HOST;
-        }
-
-
-        if ((target & DEVICE) == DEVICE) {
-            release(DEVICE);
-            if (num_mesh_elements != 0) {
-                CUDA_ERROR(cudaMalloc((void**)&(m_d_attr),
-                                      sizeof(T) * num_mesh_elements *
-                                          m_num_attribute_per_element));
-            }
-            m_allocated = m_allocated | DEVICE;
-        }
-        this->m_num_mesh_elements = num_mesh_elements;
-    }
-
-    void move(locationT source, locationT target)
-    {
-        if (source == target) {
-            return;
-        }
-
-        if ((source == HOST || source == DEVICE) &&
-            ((source & m_allocated) != source)) {
-            RXMESH_ERROR(
-                "RXMeshAttribute::move() moving source is not valid"
-                " because it was not allocated on source");
-        }
-
-        if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) &&
-            ((target & m_allocated) != target)) {
-            allocate(this->m_num_mesh_elements, target);
-        }
-
-        if (this->m_num_mesh_elements == 0) {
-            return;
-        }
-
-        if (source == HOST && target == DEVICE) {
-            CUDA_ERROR(cudaMemcpy(
-                m_d_attr, m_h_attr,
-                sizeof(T) * m_num_mesh_elements * m_num_attribute_per_element,
-                cudaMemcpyHostToDevice));
-
-        } else if (source == DEVICE && target == HOST) {
-            CUDA_ERROR(cudaMemcpy(
-                m_h_attr, m_d_attr,
-                sizeof(T) * m_num_mesh_elements * m_num_attribute_per_element,
-                cudaMemcpyDeviceToHost));
-        }
-    }
-
-    void release(locationT target = LOCATION_ALL)
-    {
-
-        if (((target & HOST) == HOST) && ((m_allocated & HOST) == HOST)) {
-            free(m_h_attr);
-            m_h_attr = nullptr;
-            m_allocated = m_allocated & (~HOST);
-        }
-
-        if (((target & DEVICE) == DEVICE) &&
-            ((m_allocated & DEVICE) == DEVICE)) {
-            GPU_FREE(m_d_attr);
-            m_allocated = m_allocated & (~DEVICE);
-        }
-
-        if (target == LOCATION_ALL || m_allocated == 0) {
-            m_num_mesh_elements = 0;
-            m_pitch.x = 0;
-            m_pitch.y = 0;
-
-            if (m_is_axpy_allocated) {
-                GPU_FREE(d_axpy_alpha);
-                GPU_FREE(d_axpy_beta);
-                m_is_axpy_allocated = false;
-            }
-            if (m_is_reduce_allocated) {
-                for (uint32_t i = 0; i < m_num_attribute_per_element; ++i) {
-                    GPU_FREE(m_d_reduce_temp_storage[i]);
-                    GPU_FREE(m_norm2_temp_buffer[i]);
-                    GPU_FREE(m_d_reduce_output[i]);
-                    CUDA_ERROR(cudaStreamDestroy(m_reduce_streams[i]));
-                }
-                m_is_reduce_allocated = false;
-                free(m_reduce_streams);
-                free(m_d_reduce_output);
-                free(m_norm2_temp_buffer);
-                free(m_d_reduce_temp_storage);
-            }
-        }
-    }
-
-    void copy(RXMeshAttribute<T>& source,
-              locationT           source_flag,
-              locationT           target_flag)
-    {
-        // Deep copy from source. The source_flag defines where we will copy
-        // from. The target_flag defines where we will copy to.
-
-        // if source_flag and target_flag are both set to LOCATION_ALL, then we
-        // copy what is on host to host, and what on target to target
-
-        // If sourc_flag is set to HOST (or DEVICE) and target_flag is set to
-        // LOCATION_ALL, then we copy source's HOST (or DEVICE) to both HOST
-        // and DEVICE in target
-
-        // Setting source_flag to LOCATION_ALL while target_flag is Not set to
-        // LOCATION_ALL is invalid because we don't know which source to copy
-        // from
-
-        if (source.m_layout != m_layout) {
-            RXMESH_ERROR(
-                "RXMeshAttribute::copy() does not support copy from source of "
-                "different layout!");
-        }
-
-        if ((source_flag & LOCATION_ALL) == LOCATION_ALL &&
-            (target_flag & LOCATION_ALL) != LOCATION_ALL) {
-            RXMESH_ERROR("RXMeshAttribute::copy() Invalid configuration!");
-        }
-
-        if (source.get_num_mesh_elements() != m_num_mesh_elements) {
-            RXMESH_ERROR(
-                "RXMeshAttribute::copy() source has different size than "
-                "target!");
-        }
-
-        // 1) copy from HOST to HOST
-        if ((source_flag & HOST) == HOST && (target_flag & HOST) == HOST) {
-            if ((source_flag & source.m_allocated) != source_flag) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::copy() copying source is not valid"
-                    " because it was not allocated on host");
-            }
-            if ((target_flag & m_allocated) != target_flag) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::copy() copying source is not valid"
-                    " because target (this) was not allocated on host");
-            }
-
-            std::memcpy(
-                (void*)m_h_attr, source.m_h_attr,
-                m_num_mesh_elements * m_num_attribute_per_element * sizeof(T));
-        }
-
-
-        // 2) copy from DEVICE to DEVICE
-        if ((source_flag & DEVICE) == DEVICE &&
-            (target_flag & DEVICE) == DEVICE) {
-            if ((source_flag & source.m_allocated) != source_flag) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::copy() copying source is not valid"
-                    " because it was not allocated on device");
-            }
-            if ((target_flag & m_allocated) != target_flag) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::copy() copying source is not valid"
-                    " because target (this) was not allocated on device");
-            }
-
-            CUDA_ERROR(cudaMemcpy(
-                m_d_attr, source.m_d_attr,
-                m_num_mesh_elements * m_num_attribute_per_element * sizeof(T),
-                cudaMemcpyDeviceToDevice));
-        }
-
-
-        // 3) copy from DEVICE to HOST
-        if ((source_flag & DEVICE) == DEVICE && (target_flag & HOST) == HOST) {
-            if ((source_flag & source.m_allocated) != source_flag) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::copy() copying source is not valid"
-                    " because it was not allocated on host");
-            }
-            if ((target_flag & m_allocated) != target_flag) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::copy() copying source is not valid"
-                    " because target (this) was not allocated on device");
-            }
-
-            CUDA_ERROR(cudaMemcpy(
-                m_h_attr, source.m_d_attr,
-                m_num_mesh_elements * m_num_attribute_per_element * sizeof(T),
-                cudaMemcpyDeviceToHost));
-        }
-
-
-        // 4) copy from HOST to DEVICE
-        if ((source_flag & HOST) == HOST && (target_flag & DEVICE) == DEVICE) {
-            if ((source_flag & source.m_allocated) != source_flag) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::copy() copying source is not valid"
-                    " because it was not allocated on device");
-            }
-            if ((target_flag & m_allocated) != target_flag) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::copy() copying source is not valid"
-                    " because target (this) was not allocated on host");
-            }
-
-            CUDA_ERROR(cudaMemcpy(
-                m_d_attr, source.m_h_attr,
-                m_num_mesh_elements * m_num_attribute_per_element * sizeof(T),
-                cudaMemcpyHostToDevice));
-        }
-    }
-
-    void change_layout(locationT target)
-    {
-        // Only supporting HOST target
-        // If target is HOST, then the layout change only for the HOST
-        // the user then can copy the data to the DEVICE.
-        // To change the layout of data in the DEVICE, it should be copied first
-        // to the HOST, change layout, and then copy back to the DEVICE
-
-        // Only make sense when number of attributes is >1
-        if (m_num_attribute_per_element > 1) {
-
-            if ((target & m_allocated) != target) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::change_layout() changing layout {} is "
-                    "not valid because it was not allocated",
-                    location_to_string(target));
-                return;
-            }
-
-            if ((target & HOST) != HOST) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::change_layout() changing layout {} is "
-                    "not valid because it is not supported",
-                    location_to_string(target));
-                return;
-            }
-
-            if ((target & HOST) == HOST) {
-                const uint32_t size =
-                    m_num_mesh_elements * m_num_attribute_per_element;
-                const uint32_t num_cols = (m_layout == AoS) ?
-                                              m_num_attribute_per_element :
-                                              m_num_mesh_elements;
-                in_place_matrix_transpose(m_h_attr, m_h_attr + size,
-                                          uint64_t(num_cols));
-
-                m_layout = (m_layout == SoA) ? AoS : SoA;
-                set_pitch();
-            }
-        }
-    }
-    //*********************************************************************
-
-    //********************** BLAS
-    template <uint32_t N>
-    void axpy(const RXMeshAttribute<T>& X,
-              const Vector<N, T>        alpha,
-              const Vector<N, T>        beta,
-              const locationT           location = DEVICE,
-              const uint32_t            attribute_id = INVALID32,
-              cudaStream_t              stream = NULL)
-    {
-        // Implements
-        // Y = alpha*X + beta*Y
-        // where Y is *this.
-        // alpha and beta is passed as vector so different values can be applied
-        // to each attribute.
-        // if attribute == INVALID32, then axpy is applied on all attributes
-        // and alpha (and beta) should be of size m_num_attribute_per_element.
-        // Otherwise axpy will be only applied on the given attribute number
-        //(should be less than m_num_attribute_per_element) and alpha (and
-        // beta) should be of size one
-        // location tells on which side (host to device) the operation
-        // will run
-
-        const uint32_t num_attribute =
-            (attribute_id == INVALID32) ? m_num_attribute_per_element : 1;
-        assert(N >= num_attribute);
-
-        if ((location & DEVICE) == DEVICE) {
-
-            const uint32_t blocks =
-                DIVIDE_UP(m_num_mesh_elements, m_block_size);
-
-            CUDA_ERROR(cudaMemcpyAsync(d_axpy_alpha, (void*)&alpha,
-                                       sizeof(Vector<N, T>),
-                                       cudaMemcpyHostToDevice, stream));
-            CUDA_ERROR(cudaMemcpyAsync(d_axpy_beta, (void*)&beta,
-                                       sizeof(Vector<N, T>),
-                                       cudaMemcpyHostToDevice, stream));
-
-            rxmesh_attribute_axpy<T><<<blocks, m_block_size, 0, stream>>>(
-                X, d_axpy_alpha, *this, d_axpy_beta, attribute_id);
-
-            cudaStreamSynchronize(stream);
-        }
-        if ((location & HOST) == HOST) {
-            for (uint32_t i = 0; i < m_num_mesh_elements; ++i) {
-                for (uint32_t j = 0; j < m_num_attribute_per_element; ++j) {
-                    (*this)(i, j) =
-                        alpha[j] * X(i, j) + beta[j] * (*this)(i, j);
-                }
-            }
-        }
-    }
-
-    template <uint32_t N>
-    void reduce(Vector<N, T>&             h_output,
-                const reduceOpT           op,
-                const RXMeshAttribute<T>* other = nullptr,
-                const locationT           location = DEVICE)
-    {
-        if (N < m_num_attribute_per_element) {
-            RXMESH_ERROR(
-                "RXMeshAttribute::reduce() the output Vector size should be "
-                ">= the number of attributes per mesh element. Output "
-                "Vector size = {}, number of attributes per mesh element = {}",
-                N, m_num_attribute_per_element);
-        }
-
-
-        if ((location & DEVICE) == DEVICE) {
-            if (m_layout != SoA) {
-                RXMESH_ERROR(
-                    "RXMeshAttribute::reduce is not supported for non SoA "
-                    "layouts on the device");
-            }
-            for (uint32_t i = 0; i < m_num_attribute_per_element; ++i) {
-                switch (op) {
-                    case SUM: {
-                        cub::DeviceReduce::Sum(
-                            m_d_reduce_temp_storage[i],
-                            m_reduce_temp_storage_bytes,
-                            m_d_attr + i * m_num_mesh_elements,
-                            m_d_reduce_output[i], m_num_mesh_elements,
-                            m_reduce_streams[i]);
-                        break;
-                    }
-                    case MAX: {
-                        cub::DeviceReduce::Max(
-                            m_d_reduce_temp_storage[i],
-                            m_reduce_temp_storage_bytes,
-                            m_d_attr + i * m_num_mesh_elements,
-                            m_d_reduce_output[i], m_num_mesh_elements,
-                            m_reduce_streams[i]);
-                        break;
-                    }
-                    case MIN: {
-                        cub::DeviceReduce::Min(
-                            m_d_reduce_temp_storage[i],
-                            m_reduce_temp_storage_bytes,
-                            m_d_attr + i * m_num_mesh_elements,
-                            m_d_reduce_output[i], m_num_mesh_elements,
-                            m_reduce_streams[i]);
-                        break;
-                    }
-                    case NORM2: {
-                        uint32_t num_blocks =
-                            DIVIDE_UP(m_num_mesh_elements, m_block_size);
-                        // 1st pass
-                        rxmesh_attribute_norm2<T, m_block_size>
-                            <<<num_blocks, m_block_size, 0,
-                               m_reduce_streams[i]>>>(*this, i,
-                                                      m_norm2_temp_buffer[i]);
-
-                        // 2nd pass
-                        cub::DeviceReduce::Sum(m_d_reduce_temp_storage[i],
-                                               m_reduce_temp_storage_bytes,
-                                               m_norm2_temp_buffer[i],
-                                               m_d_reduce_output[i], num_blocks,
-                                               m_reduce_streams[i]);
-                        break;
-                    }
-                    case DOT: {
-                        if (other == nullptr) {
-                            RXMESH_ERROR(
-                                "RXMeshAttribute::reduce other can not be "
-                                "nullptr for dot product");
-                        }
-                        uint32_t num_blocks =
-                            DIVIDE_UP(m_num_mesh_elements, m_block_size);
-                        // 1st pass
-                        rxmesh_attribute_dot<T, m_block_size>
-                            <<<num_blocks, m_block_size, 0,
-                               m_reduce_streams[i]>>>(*this, *other, i,
-                                                      m_norm2_temp_buffer[i]);
-
-                        // 2nd pass
-                        cub::DeviceReduce::Sum(m_d_reduce_temp_storage[i],
-                                               m_reduce_temp_storage_bytes,
-                                               m_norm2_temp_buffer[i],
-                                               m_d_reduce_output[i], num_blocks,
-                                               m_reduce_streams[i]);
-                        break;
-                    }
-                    default: {
-                        RXMESH_ERROR(
-                            "RXMeshAttribute::reduce is not supported for the "
-                            "given operation");
-                        break;
-                    }
-                }
-                CUDA_ERROR(cudaStreamSynchronize(m_reduce_streams[i]));
-                CUDA_ERROR(cudaMemcpy(&h_output[i], m_d_reduce_output[i],
-                                      sizeof(T), cudaMemcpyDeviceToHost));
-            }
-        }
-
-        if ((location & HOST) == HOST) {
-            for (uint32_t j = 0; j < m_num_attribute_per_element; ++j) {
-                for (uint32_t i = 0; i < m_num_mesh_elements; ++i) {
-                    h_output[i] = 0;
-                    if (op == MAX || op == MIN) {
-                        h_output[i] = (*this)(i, j);
-                    }
-
-                    switch (op) {
-                        case SUM: {
-                            h_output[i] += (*this)(i, j);
-                            break;
-                        }
-                        case MAX: {
-                            h_output[i] = std::max(h_output[i], (*this)(i, j));
-                            break;
-                        }
-                        case MIN: {
-                            h_output[i] = std::min(h_output[i], (*this)(i, j));
-                            break;
-                        }
-                        case NORM2: {
-                            h_output[i] += (*this)(i, j) * (*this)(i, j);
-                            break;
-                        }
-                        case DOT: {
-                            if (other == nullptr) {
-                                RXMESH_ERROR(
-                                    "RXMeshAttribute::reduce other can not be "
-                                    "nullptr for dot product");
-                            }
-                            h_output[i] += (*this)(i, j) * (*other)(i, j);
-                        }
-                        default:
-                            break;
-                    }
-                }
-            }
-        }
-    }
-
-
-    //*********************************************************************
-
-
-    //********************** Operators
-    __host__ __device__ __forceinline__ T& operator()(uint32_t idx,
-                                                      uint32_t attr)
-    {
-
-        assert(attr < m_num_attribute_per_element);
-        assert(idx < m_num_mesh_elements);
-        assert(m_pitch.x > 0 && m_pitch.y > 0);
-
-#ifdef __CUDA_ARCH__
-        return m_d_attr[idx * m_pitch.x + attr * m_pitch.y];
-#else
-        return m_h_attr[idx * m_pitch.x + attr * m_pitch.y];
-#endif
-    }
-
-    __host__ __device__ __forceinline__ T& operator()(uint32_t idx)
-    {
-        // for m_num_attribute_per_element =1
-
-        assert(m_num_attribute_per_element == 1);
-        assert(idx < m_num_mesh_elements);
-
-#ifdef __CUDA_ARCH__
-        return m_d_attr[idx];
-#else
-        return m_h_attr[idx];
-#endif
-    }
-
-    __host__ __device__ __forceinline__ T& operator()(uint32_t idx,
-                                                      uint32_t attr) const
-    {
-
-        assert(attr < m_num_attribute_per_element);
-        assert(idx < m_num_mesh_elements);
-
-#ifdef __CUDA_ARCH__
-        return m_d_attr[idx * m_pitch.x + attr * m_pitch.y];
-#else
-        return m_h_attr[idx * m_pitch.x + attr * m_pitch.y];
-#endif
-    }
-
-    __host__ __device__ __forceinline__ T& operator()(uint32_t idx) const
-    {
-        // for m_num_attribute_per_element =1
-
-        assert(m_num_attribute_per_element == 1);
-        assert(idx < m_num_mesh_elements);
-
-#ifdef __CUDA_ARCH__
-        return m_d_attr[idx];
-#else
-        return m_h_attr[idx];
-#endif
-    }
-
-    __host__ __device__ __forceinline__ T* operator->() const
-    {
-#ifdef __CUDA_ARCH__
-        return m_d_attr;
-#else
-        return m_h_attr;
-#endif
-    }
-
-    __host__ __device__ __forceinline__ bool is_empty() const
-    {
-#ifdef __CUDA_ARCH__
-
-        return (m_d_attr == nullptr) ? true : false;
-#else
-        return (m_h_attr == nullptr) ? true : false;
-
-#endif
-    }
-    //*********************************************************************
-
-
-   private:
-    void set_pitch()
-    {
-        if (m_layout == AoS) {
-            m_pitch.x = m_num_attribute_per_element;
-            m_pitch.y = 1;
-        } else if (m_layout == SoA) {
-            m_pitch.x = 1;
-            m_pitch.y = m_num_mesh_elements;
-        } else {
-            RXMESH_ERROR("RXMeshAttribute::set_pitch() unknown layout");
-        }
-    }
-    //********************** Member Variables
-    char*     m_name;
-    uint32_t  m_num_mesh_elements;
-    uint32_t  m_num_attribute_per_element;
-    locationT m_allocated;
-    T*        m_h_attr;
-    T*        m_d_attr;
-    layoutT   m_layout;
-    // to index: id*m_pitch.x + attr*m_pitch.y
-    uint2 m_pitch;
-
-    constexpr static uint32_t m_block_size = 256;
-
-    // temp array for alpha and beta parameters of axpy allocated on the device
-    T *  d_axpy_alpha, *d_axpy_beta;
-    bool m_is_axpy_allocated;
-
-    // temp array for reduce operations
-    bool          m_is_reduce_allocated;
-    size_t        m_reduce_temp_storage_bytes;
-    void**        m_d_reduce_temp_storage;
-    T**           m_d_reduce_output;
-    cudaStream_t* m_reduce_streams;
-    T**           m_norm2_temp_buffer;
-    //*********************************************************************
-};
-}  // namespace RXMESH
\ No newline at end of file
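// [Editor's sketch, not part of this patch] For reference, the removed
// RXMeshAttribute addressed element idx / attribute attr as
// m_attr[idx * pitch.x + attr * pitch.y], with the pitch chosen by the layout
// in set_pitch(): AoS -> (num_attributes, 1), SoA -> (1, num_elements).
// A tiny standalone illustration of that indexing scheme:
#include <cstdint>

struct Pitch
{
    uint32_t x;
    uint32_t y;
};

// AoS: the attributes of one element are contiguous in memory
inline Pitch aos_pitch(uint32_t num_attributes)
{
    return {num_attributes, 1u};
}

// SoA: all elements' values of one attribute are contiguous in memory
inline Pitch soa_pitch(uint32_t num_elements)
{
    return {1u, num_elements};
}

inline uint32_t flat_index(const Pitch& p, uint32_t idx, uint32_t attr)
{
    return idx * p.x + attr * p.y;
}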
diff --git a/include/rxmesh/rxmesh_context.h b/include/rxmesh/rxmesh_context.h
deleted file mode 100644
index 31987e79..00000000
--- a/include/rxmesh/rxmesh_context.h
+++ /dev/null
@@ -1,284 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-#include "rxmesh/util/macros.h"
-
-namespace RXMESH {
-
-// context for the mesh parameters and pointers. everything is allocated
-// on rxmesh. this class is meant to be a vehicle to copy various parameters
-// to the device kernels.
-// TODO make sure that __align__(16) is the right one
-class __align__(16) RXMeshContext
-{
-
-   public:
-    RXMeshContext()
-        : m_num_edges(0), m_num_faces(0), m_num_vertices(0), m_face_degree(0),
-          m_max_valence(0), m_max_edge_incident_faces(0),
-          m_max_face_adjacent_faces(0), m_num_patches(0),
-          m_d_face_patch(nullptr), m_d_edge_patch(nullptr),
-          m_d_vertex_patch(nullptr), m_d_patches_ltog_v(nullptr),
-          m_d_patches_ltog_e(nullptr), m_d_patches_ltog_f(nullptr),
-          m_d_ad_size_ltog_v(nullptr), m_d_ad_size_ltog_e(nullptr),
-          m_d_ad_size_ltog_f(nullptr), m_d_patches_edges(nullptr),
-          m_d_patches_faces(nullptr), m_d_patch_distribution_v(nullptr),
-          m_d_patch_distribution_e(nullptr), m_d_patch_distribution_f(nullptr),
-          m_d_ad_size(nullptr), m_d_owned_size(nullptr),
-          m_d_neighbour_patches(nullptr), m_d_neighbour_patches_offset(nullptr)
-
-    {
-        m_d_max_size.x = m_d_max_size.y = 0;
-    }
-
-    void init(
-        const uint32_t num_edges, const uint32_t num_faces,
-        const uint32_t num_vertices, const uint32_t face_degree,
-        const uint32_t max_valence, const uint32_t max_edge_incident_faces,
-        const uint32_t max_face_adjacent_faces, const uint32_t num_patches,
-        uint32_t* d_face_patch, uint32_t* d_edge_patch,
-        uint32_t* d_vertex_patch, uint32_t* d_patches_ltog_v,
-        uint32_t* d_patches_ltog_e, uint32_t* d_patches_ltog_f,
-        uint2* d_ad_size_ltog_v, uint2* d_ad_size_ltog_e,
-        uint2* d_ad_size_ltog_f, uint16_t* d_patches_edges,
-        uint16_t* d_patches_faces, uint4* d_ad_size, uint4* d_owned_size,
-        uint2 max_size, uint32_t* d_patch_distribution_v,
-        uint32_t* d_patch_distribution_e, uint32_t* d_patch_distribution_f,
-        uint32_t* d_neighbour_patches, uint32_t* d_neighbour_patches_offset)
-    {
-
-        m_num_edges = num_edges;
-        m_num_faces = num_faces;
-        m_num_vertices = num_vertices;
-        m_face_degree = face_degree;
-        m_max_valence = max_valence;
-        m_max_edge_incident_faces = max_edge_incident_faces;
-        m_max_face_adjacent_faces = max_face_adjacent_faces;
-        m_num_patches = num_patches;
-        m_d_face_patch = d_face_patch;
-        m_d_edge_patch = d_edge_patch;
-        m_d_vertex_patch = d_vertex_patch;
-        m_d_patches_ltog_v = d_patches_ltog_v;
-        m_d_patches_ltog_e = d_patches_ltog_e;
-        m_d_patches_ltog_f = d_patches_ltog_f;
-        m_d_ad_size_ltog_v = d_ad_size_ltog_v;
-        m_d_ad_size_ltog_e = d_ad_size_ltog_e;
-        m_d_ad_size_ltog_f = d_ad_size_ltog_f;
-        m_d_patches_edges = d_patches_edges;
-        m_d_patches_faces = d_patches_faces;
-        m_d_ad_size = d_ad_size;
-        m_d_owned_size = d_owned_size;
-        m_d_max_size = max_size;
-        m_d_patch_distribution_v = d_patch_distribution_v;
-        m_d_patch_distribution_e = d_patch_distribution_e;
-        m_d_patch_distribution_f = d_patch_distribution_f;
-        m_d_neighbour_patches = d_neighbour_patches;
-        m_d_neighbour_patches_offset = d_neighbour_patches_offset;
-    }
-
-
-    template <typename dataT>
-    __device__ void print_data(const dataT* arr, const uint32_t start_id,
-                               const uint32_t len, int shift = 0) const
-    {
-        printf(" start_id = %u, len = %u\n", start_id, len);
-
-        uint32_t end = len + start_id;
-        for (uint32_t i = start_id; i < end; ++i) {
-            printf(" [%u] ", arr[i] >> shift);
-            if (i % 20 == 0 && i != start_id) {
-                printf("\n");
-            }
-        }
-        printf("\n\n");
-    }
-
-    __device__ void print_patch(uint32_t p_id) const
-    {
-        // print all relevant data of a single patch
-
-        // if (threadIdx.x == 0){
-        printf("\n ********* p_id = %u *********\n", p_id);
-        printf(" global_num_vertices=%u \n", m_num_vertices);
-        printf(" global_num_edges=%u \n", m_num_edges);
-        printf(" global_num_faces=%u \n", m_num_faces);
-        printf(" global_num_patches=%u \n", m_num_patches);
-
-        printf(" patch #vertices = %u, start_id= %u \n",
-               m_d_ad_size_ltog_v[p_id].y, m_d_ad_size_ltog_v[p_id].x);
-        printf(" patch #edges = %u, start_id= %u\n", m_d_ad_size_ltog_e[p_id].y,
-               m_d_ad_size_ltog_e[p_id].x);
-        printf(" patch #faces = %u, start_id= %u\n", m_d_ad_size_ltog_f[p_id].y,
-               m_d_ad_size_ltog_f[p_id].x);
-
-        printf("\n ** d_ltog_v **\n");
-        print_data(m_d_patches_ltog_v, uint32_t(m_d_ad_size_ltog_v[p_id].x),
-                   uint32_t(m_d_ad_size_ltog_v[p_id].y), 1);
-
-        printf("\n ** d_ltog_e **\n");
-        print_data(m_d_patches_ltog_e, uint32_t(m_d_ad_size_ltog_e[p_id].x),
-                   uint32_t(m_d_ad_size_ltog_e[p_id].y), 1);
-
-        printf("\n ** d_ltog_f **\n");
-        print_data(m_d_patches_ltog_f, uint32_t(m_d_ad_size_ltog_f[p_id].x),
-                   uint32_t(m_d_ad_size_ltog_f[p_id].y), 1);
-
-
-        printf("\n ** d_edges **\n");
-        print_data(m_d_patches_edges, uint32_t(m_d_ad_size[p_id].x),
-                   uint32_t(m_d_ad_size[p_id].y));
-
-        printf("\n ** d_faces **\n");
-        print_data(m_d_patches_faces, uint32_t(m_d_ad_size[p_id].z),
-                   uint32_t(m_d_ad_size[p_id].w), 1);
-        //}
-    }
-
-
-    //********************** Getters
-    __device__ __forceinline__ uint32_t get_num_edges() const
-    {
-        return m_num_edges;
-    }
-    __device__ __forceinline__ uint32_t get_num_faces() const
-    {
-        return m_num_faces;
-    }
-    __device__ __forceinline__ uint32_t get_num_vertices() const
-    {
-        return m_num_vertices;
-    }
-    __device__ __forceinline__ uint32_t get_face_degree() const
-    {
-        return m_face_degree;
-    }
-    __device__ __forceinline__ uint32_t get_max_valence() const
-    {
-        return m_max_valence;
-    }
-    __device__ __forceinline__ uint32_t get_max_edge_incident_faces() const
-    {
-        return m_max_edge_incident_faces;
-    }
-
-    __device__ __forceinline__ uint32_t get_max_edge_adjacent_faces() const
-    {
-        return m_max_face_adjacent_faces;
-    }
-    __device__ __forceinline__ uint32_t get_num_patches() const
-    {
-        return m_num_patches;
-    }
-    __device__ __forceinline__ uint32_t* get_face_patch() const
-    {
-        return m_d_face_patch;
-    }
-    __device__ __forceinline__ uint32_t* get_edge_patch() const
-    {
-        return m_d_edge_patch;
-    }
-    __device__ __forceinline__ uint32_t* get_vertex_patch() const
-    {
-        return m_d_vertex_patch;
-    }
-    __device__ __forceinline__ uint32_t* get_patches_ltog_v() const
-    {
-        return m_d_patches_ltog_v;
-    }
-    __device__ __forceinline__ uint32_t* get_patches_ltog_e() const
-    {
-        return m_d_patches_ltog_e;
-    }
-    __device__ __forceinline__ uint32_t* get_patches_ltog_f() const
-    {
-        return m_d_patches_ltog_f;
-    }
-    __device__ __forceinline__ uint2* get_ad_size_ltog_v() const
-    {
-        return m_d_ad_size_ltog_v;
-    }
-    __device__ __forceinline__ uint2* get_ad_size_ltog_e() const
-    {
-        return m_d_ad_size_ltog_e;
-    }
-    __device__ __forceinline__ uint2* get_ad_size_ltog_f() const
-    {
-        return m_d_ad_size_ltog_f;
-    }
-    __device__ __forceinline__ uint16_t* get_patches_edges() const
-    {
-        return m_d_patches_edges;
-    }
-    __device__ __forceinline__ uint16_t* get_patches_faces() const
-    {
-        return m_d_patches_faces;
-    }
-    __device__ __forceinline__ uint4* get_ad_size() const
-    {
-        return m_d_ad_size;
-    }
-    __device__ __forceinline__ uint4* get_size_owned() const
-    {
-        return m_d_owned_size;
-    }
-    __device__ __forceinline__ uint2 get_max_size() const
-    {
-        return m_d_max_size;
-    }
-    __device__ __forceinline__ uint32_t* get_vertex_distribution() const
-    {
-        return m_d_patch_distribution_v;
-    }
-    __device__ __forceinline__ uint32_t* get_edge_distribution() const
-    {
-        return m_d_patch_distribution_e;
-    }
-    __device__ __forceinline__ uint32_t* get_face_distribution() const
-    {
-        return m_d_patch_distribution_f;
-    }
-    //**********************************************************************
-
-    static __device__ __host__ __forceinline__ void unpack_edge_dir(
-        const uint16_t edge_dir, uint16_t& edge, flag_t& dir)
-    {
-        dir = (edge_dir & 1) != 0;
-        edge = edge_dir >> 1;
-    }
-
-   private:
-    // mesh elements count
-    uint32_t m_num_edges, m_num_faces, m_num_vertices, m_face_degree,
-        m_max_valence, m_max_edge_incident_faces, m_max_face_adjacent_faces,
-        m_num_patches;
-
-
-    // max max_num_edges_per_patch*2 for all patches rounded to multiple of 32
-    // max max_num_faces_per_patch*m_face_degree for all patches rounded to
-    // multiple of 32
-    uint2 m_d_max_size;
-
-    //** face/vertex/edge patch (indexed by in global space)
-    uint32_t *m_d_face_patch, *m_d_edge_patch, *m_d_vertex_patch;
-
-    // mapping
-    uint32_t *m_d_patches_ltog_v, *m_d_patches_ltog_e, *m_d_patches_ltog_f;
-    uint2 *   m_d_ad_size_ltog_v, *m_d_ad_size_ltog_e, *m_d_ad_size_ltog_f;
-
-    // incidence
-    uint16_t *m_d_patches_edges, *m_d_patches_faces;
-
-    // scanned histogram of the mesh elements distribution per patch
-    uint32_t *m_d_patch_distribution_v, *m_d_patch_distribution_e,
-        *m_d_patch_distribution_f;
-
-    //.x edge address .y edge size  .z face address .w face size
-    uint4* m_d_ad_size;
-
-    //.x faces .y edges .z vertex
-    uint4* m_d_owned_size;
-
-    // patch neighbour
-    uint32_t *m_d_neighbour_patches, *m_d_neighbour_patches_offset;
-};
-}  // namespace RXMESH
\ No newline at end of file
diff --git a/include/rxmesh/rxmesh_static.h b/include/rxmesh/rxmesh_static.h
index 92f6d84c..af851b5c 100644
--- a/include/rxmesh/rxmesh_static.h
+++ b/include/rxmesh/rxmesh_static.h
@@ -1,46 +1,191 @@
 #pragma once
 #include <assert.h>
+#include <fstream>
+#include <memory>
+
 #include <cuda_profiler_api.h>
-#include "rxmesh/kernels/prototype.cuh"
+
+#include "rxmesh/attribute.h"
+#include "rxmesh/handle.h"
+#include "rxmesh/kernels/for_each.cuh"
 #include "rxmesh/launch_box.h"
 #include "rxmesh/rxmesh.h"
-#include "rxmesh/rxmesh_util.h"
+#include "rxmesh/types.h"
 #include "rxmesh/util/log.h"
 #include "rxmesh/util/timer.h"
 
-namespace RXMESH {
+namespace rxmesh {
 
-template <uint32_t patchSize = PATCH_SIZE>
-class RXMeshStatic : public RXMesh<patchSize>
+/**
+ * @brief This class is responsible for query operations of static meshes. It
+ * extends RXMesh with the methods needed to launch kernels and run computations
+ * on the mesh, as well as to manage mesh attributes
+ */
+class RXMeshStatic : public RXMesh
 {
-    // This class is responsible for query operation of static meshes. It
-    // inherits the constructor and build methods from the base class RXMesh
-    // and create new method(s) for queries
    public:
-    //********************** Constructors/Destructors
     RXMeshStatic(const RXMeshStatic&) = delete;
 
+    /**
+     * @brief Main constructor used to initialize internal member variables
+     * @param fv Face incident vertices as read from an obj file
+     * @param quite run in quiet mode (suppress log output)
+     */
     RXMeshStatic(std::vector<std::vector<uint32_t>>& fv,
-                 std::vector<std::vector<coordT>>&   coordinates,
-                 const bool                          sort = false,
-                 const bool                          quite = true)
-        : RXMesh<patchSize>(fv, coordinates, sort, quite){};
+                 const bool                          quite = false)
+        : RXMesh(fv, quite)
+    {
+        m_attr_container = std::make_shared<AttributeContainer>();
+    };
 
     virtual ~RXMeshStatic()
     {
     }
 
-    //*********************************************************************
 
     /**
-     * prepare_launch_box()
+     * @brief Apply a lambda function on all vertices in the mesh
+     * @tparam LambdaT type of the lambda function (inferred)
+     * @param location the execution location
+     * @param apply lambda function to be applied on all vertices. The lambda
+     * function signature takes a VertexHandle
+     * @param stream the stream used to run the kernel in case of DEVICE
+     * execution location
+     */
+    template <typename LambdaT>
+    void for_each_vertex(locationT    location,
+                         LambdaT      apply,
+                         cudaStream_t stream = NULL)
+    {
+        if ((location & HOST) == HOST) {
+            const int num_patches = this->get_num_patches();
+#pragma omp parallel for
+            for (int p = 0; p < num_patches; ++p) {
+                for (uint16_t v = 0;
+                     v < this->m_h_patches_info[p].num_owned_vertices;
+                     ++v) {
+                    const VertexHandle v_handle(static_cast<uint32_t>(p), v);
+                    apply(v_handle);
+                }
+            }
+        }
+
+        if ((location & DEVICE) == DEVICE) {
+            if constexpr (IS_HD_LAMBDA(LambdaT) || IS_D_LAMBDA(LambdaT)) {
+
+                const int num_patches = this->get_num_patches();
+                const int threads     = 256;
+                detail::for_each_vertex<<<num_patches, threads, 0, stream>>>(
+                    num_patches, this->m_d_patches_info, apply);
+            } else {
+                RXMESH_ERROR(
+                    "RXMeshStatic::for_each_vertex() Input lambda function "
+                    "should be annotated with  __device__ for execution on "
+                    "device");
+            }
+        }
+    }
+
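For illustration, a minimal host-side sketch (assuming an RXMeshStatic instance rx and a VertexAttribute<float> coords, neither of which is defined in this patch):

// hypothetical usage sketch: scale the x coordinate of every owned vertex on the CPU
rx.for_each_vertex(rxmesh::HOST, [&](const rxmesh::VertexHandle vh) {
    coords(vh, 0) *= 2.0f;
});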
+    /**
+     * @brief Apply a lambda function on all edges in the mesh
+     * @tparam LambdaT type of the lambda function (inferred)
+     * @param location the execution location
+     * @param apply lambda function to be applied on all edges. The lambda
+     * function signature takes an EdgeHandle
+     * @param stream the stream used to run the kernel in case of DEVICE
+     * execution location
+     */
+    template <typename LambdaT>
+    void for_each_edge(locationT    location,
+                       LambdaT      apply,
+                       cudaStream_t stream = NULL)
+    {
+        if ((location & HOST) == HOST) {
+            const int num_patches = this->get_num_patches();
+#pragma omp parallel for
+            for (int p = 0; p < num_patches; ++p) {
+                for (uint16_t e = 0;
+                     e < this->m_h_patches_info[p].num_owned_edges;
+                     ++e) {
+                    const EdgeHandle e_handle(static_cast<uint32_t>(p), e);
+                    apply(e_handle);
+                }
+            }
+        }
+
+        if ((location & DEVICE) == DEVICE) {
+            if constexpr (IS_HD_LAMBDA(LambdaT) || IS_D_LAMBDA(LambdaT)) {
+
+                const int num_patches = this->get_num_patches();
+                const int threads     = 256;
+                detail::for_each_edge<<<num_patches, threads, 0, stream>>>(
+                    num_patches, this->m_d_patches_info, apply);
+            } else {
+                RXMESH_ERROR(
+                    "RXMeshStatic::for_each_edge() Input lambda function "
+                    "should be annotated with  __device__ for execution on "
+                    "device");
+            }
+        }
+    }
+
+    /**
+     * @brief Apply a lambda function on all faces in the mesh
+     * @tparam LambdaT type of the lambda function (inferred)
+     * @param location the execution location
+     * @param apply lambda function to be applied on all faces. The lambda
+     * function signature takes a FaceHandle
+     * @param stream the stream used to run the kernel in case of DEVICE
+     * execution location
+     */
+    template <typename LambdaT>
+    void for_each_face(locationT    location,
+                       LambdaT      apply,
+                       cudaStream_t stream = NULL)
+    {
+        if ((location & HOST) == HOST) {
+            const int num_patches = this->get_num_patches();
+#pragma omp parallel for
+            for (int p = 0; p < num_patches; ++p) {
+                for (int f = 0; f < this->m_h_patches_info[p].num_owned_faces;
+                     ++f) {
+                    const FaceHandle f_handle(static_cast<uint32_t>(p), f);
+                    apply(f_handle);
+                }
+            }
+        }
+
+        if ((location & DEVICE) == DEVICE) {
+            if constexpr (IS_HD_LAMBDA(LambdaT) || IS_D_LAMBDA(LambdaT)) {
+
+                const int num_patches = this->get_num_patches();
+                const int threads     = 256;
+                detail::for_each_face<<<num_patches, threads, 0, stream>>>(
+                    num_patches, this->m_d_patches_info, apply);
+            } else {
+                RXMESH_ERROR(
+                    "RXMeshStatic::for_each_face() Input lambda function "
+                    "should be annotated with  __device__ for execution on "
+                    "device");
+            }
+        }
+    }
+
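The device path needs an extended __device__ lambda (compiled with --expt-extended-lambda). A sketch, assuming the same hypothetical rx instance, that counts owned faces into a device counter:

// hypothetical sketch: count all owned faces on the GPU
uint32_t* d_num_faces = nullptr;
cudaMalloc(&d_num_faces, sizeof(uint32_t));
cudaMemset(d_num_faces, 0, sizeof(uint32_t));

rx.for_each_face(rxmesh::DEVICE,
                 [d_num_faces] __device__(const rxmesh::FaceHandle fh) {
                     ::atomicAdd(d_num_faces, 1u);
                 });
cudaDeviceSynchronize();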
+    /**
+     * @brief populate the launch_box with grid size and dynamic shared memory
+     * needed for kernel launch
      * TODO provide variadic version of this function that can accept multiple
      * ops
+     * @param op Query operation performed inside the kernel
+     * @param launch_box input launch box to be populated
+     * @param kernel pointer to the kernel that will be launched, used to query
+     * its register and static shared memory usage
+     * @param oriented if the query is oriented. Valid only for Op::VV queries
      */
     template <uint32_t blockThreads>
     void prepare_launch_box(const Op                 op,
                             LaunchBox<blockThreads>& launch_box,
-                            const bool               is_higher_query = false,
+                            const void*              kernel,
                             const bool               oriented = false) const
     {
         static_assert(
@@ -51,18 +196,380 @@ class RXMeshStatic : public RXMesh<patchSize>
 
         launch_box.blocks = this->m_num_patches;
 
-        const uint32_t output_fixed_offset =
-            (op == Op::EV) ? 2 : ((op == Op::FV || op == Op::FE) ? 3 : 0);
-
         this->template calc_shared_memory<blockThreads>(
-            op, launch_box, is_higher_query, oriented);
+            op, launch_box, kernel, oriented);
+    }
+
+
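A usage sketch: vv_kernel is an illustrative user __global__ kernel (not part of this patch), rx is the hypothetical RXMeshStatic instance, and get_context() is assumed to be inherited from the RXMesh base class.

constexpr uint32_t blockThreads = 256;
rxmesh::LaunchBox<blockThreads> launch_box;

// size the launch for a VV query performed inside vv_kernel
rx.prepare_launch_box(
    rxmesh::Op::VV, launch_box, (void*)vv_kernel<blockThreads>);

vv_kernel<blockThreads><<<launch_box.blocks,
                          blockThreads,
                          launch_box.smem_bytes_dyn>>>(rx.get_context());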
+    /**
+     * @brief Adding a new face attribute
+     * @tparam T type of the attribute
+     * @param name of the attribute. Should not collide with other attribute
+     * names
+     * @param num_attributes number of attributes per face
+     * @param location where to allocate the attribute
+     * @param layout memory layout of the attribute (SoA or AoS)
+     * @return shared pointer to the created attribute
+     */
+    template <class T>
+    std::shared_ptr<FaceAttribute<T>> add_face_attribute(
+        const std::string& name,
+        uint32_t           num_attributes,
+        locationT          location = LOCATION_ALL,
+        layoutT            layout   = SoA)
+    {
+        return m_attr_container->template add<FaceAttribute<T>>(
+            name.c_str(),
+            this->m_h_num_owned_f,
+            num_attributes,
+            location,
+            layout);
+    }
+
+    /**
+     * @brief Adding a new face attribute by reading values from a host buffer
+     * f_attributes where the order of faces is the same as the order of
+     * faces given to the constructor. The attributes are populated on device
+     * and host
+     * @tparam T type of the attribute
+     * @param f_attributes attributes to read
+     * @param name of the attribute. Should not collide with other attribute
+     * names
+     * @param layout memory layout of the attribute (SoA or AoS)
+     * @return shared pointer to the created attribute
+     * TODO implement this
+     */
+    template <class T>
+    std::shared_ptr<FaceAttribute<T>> add_face_attribute(
+        const std::vector<std::vector<T>>& f_attributes,
+        const std::string&                 name,
+        layoutT                            layout = SoA)
+    {
+    }
+
+    /**
+     * @brief Adding a new face attribute by reading values from a host buffer
+     * f_attributes where the order of faces is the same as the order of
+     * faces given to the constructor. The attributes are populated on device
+     * and host
+     * @tparam T type of the attribute
+     * @param f_attributes attributes to read
+     * @param name of the attribute. Should not collide with other attribute
+     * names
+     * @param layout memory layout of the attribute (SoA or AoS)
+     * @return shared pointer to the created attribute
+     * TODO implement this
+     */
+    template <class T>
+    std::shared_ptr<FaceAttribute<T>> add_face_attribute(
+        const std::vector<T>& f_attributes,
+        const std::string&    name,
+        layoutT               layout = SoA)
+    {
+    }
+
+    /**
+     * @brief Adding a new edge attribute
+     * @tparam T type of the attribute
+     * @param name of the attribute. Should not collide with other attribute
+     * names
+     * @param num_attributes number of attributes per edge
+     * @param location where to allocate the attribute
+     * @param layout memory layout of the attribute (SoA or AoS)
+     * @return shared pointer to the created attribute
+     */
+    template <class T>
+    std::shared_ptr<EdgeAttribute<T>> add_edge_attribute(
+        const std::string& name,
+        uint32_t           num_attributes,
+        locationT          location = LOCATION_ALL,
+        layoutT            layout   = SoA)
+    {
+        return m_attr_container->template add<EdgeAttribute<T>>(
+            name.c_str(),
+            this->m_h_num_owned_e,
+            num_attributes,
+            location,
+            layout);
+    }
+
+    /**
+     * @brief Adding a new vertex attribute
+     * @tparam T type of the attribute
+     * @param name of the attribute. Should not collide with other attribute
+     * names
+     * @param num_attributes number of attributes per vertex
+     * @param location where to allocate the attribute
+     * @param layout memory layout of the attribute (SoA or AoS)
+     * @return shared pointer to the created attribute
+     */
+    template <class T>
+    std::shared_ptr<VertexAttribute<T>> add_vertex_attribute(
+        const std::string& name,
+        uint32_t           num_attributes,
+        locationT          location = LOCATION_ALL,
+        layoutT            layout   = SoA)
+    {
+        return m_attr_container->template add<VertexAttribute<T>>(
+            name.c_str(),
+            this->m_h_num_owned_v,
+            num_attributes,
+            location,
+            layout);
+    }
+
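A short sketch of creating attributes on the hypothetical rx instance used above:

// 3 floats per vertex (e.g., a color), allocated on host and device, SoA layout
auto v_color = rx.add_vertex_attribute<float>(
    "v_color", 3, rxmesh::LOCATION_ALL, rxmesh::SoA);

// one float per face, allocated on the device only
auto f_area = rx.add_face_attribute<float>("f_area", 1, rxmesh::DEVICE);

// one float per edge with the default location and layout
auto e_len = rx.add_edge_attribute<float>("e_len", 1);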
+    /**
+     * @brief Adding a new vertex attribute by reading values from a host buffer
+     * v_attributes where the order of vertices is the same as the order of
+     * vertices given to the constructor. The attributes are populated on device
+     * and host
+     * @tparam T type of the attribute
+     * @param v_attributes attributes to read
+     * @param name of the attribute. Should not collide with other attribute
+     * names
+     * @param layout memory layout of the attribute (SoA or AoS)
+     * @return shared pointer to the created attribute
+     */
+    template <class T>
+    std::shared_ptr<VertexAttribute<T>> add_vertex_attribute(
+        const std::vector<std::vector<T>>& v_attributes,
+        const std::string&                 name,
+        layoutT                            layout = SoA)
+    {
+        if (v_attributes.empty()) {
+            RXMESH_ERROR(
+                "RXMeshStatic::add_vertex_attribute() input attribute is "
+                "empty");
+        }
+
+        if (v_attributes.size() != get_num_vertices()) {
+            RXMESH_ERROR(
+                "RXMeshStatic::add_vertex_attribute() input attribute size "
+                "({}) is not the same as number of vertices in the input mesh "
+                "({})",
+                v_attributes.size(),
+                get_num_vertices());
+        }
+
+        uint32_t num_attributes = v_attributes[0].size();
+
+        auto ret = m_attr_container->template add<VertexAttribute<T>>(
+            name.c_str(),
+            this->m_h_num_owned_v,
+            num_attributes,
+            LOCATION_ALL,
+            layout);
+
+        // populate the attribute before returning it
+        const int num_patches = this->get_num_patches();
+#pragma omp parallel for
+        for (int p = 0; p < num_patches; ++p) {
+            for (uint16_t v = 0; v < this->m_h_num_owned_v[p]; ++v) {
+
+                const VertexHandle v_handle(static_cast<uint32_t>(p), v);
+
+                uint32_t global_v = m_h_patches_ltog_v[p][v];
+
+                for (uint32_t a = 0; a < num_attributes; ++a) {
+                    (*ret)(v_handle, a) = v_attributes[global_v][a];
+                }
+            }
+        }
+
+        // move to device
+        ret->move(rxmesh::HOST, rxmesh::DEVICE);
+        return ret;
+    }
+
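A sketch of this buffer-based overload, assuming Verts holds one xyz triplet per vertex in the same order that was given to the RXMeshStatic constructor:

std::vector<std::vector<float>> Verts;  // filled by the obj reader (assumed)
auto coords = rx.add_vertex_attribute<float>(Verts, "v_coords");
// coords is now populated on both host and device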
+    /**
+     * @brief Adding a new vertex attribute by reading values from a host buffer
+     * v_attributes where the order of vertices is the same as the order of
+     * vertices given to the constructor. The attributes are populated on device
+     * and host
+     * @tparam T type of the attribute
+     * @param v_attributes attributes to read
+     * @param name of the attribute. Should not collide with other attribute
+     * names
+     * @param layout memory layout of the attribute (SoA or AoS)
+     * @return shared pointer to the created attribute
+     */
+    template <class T>
+    std::shared_ptr<VertexAttribute<T>> add_vertex_attribute(
+        const std::vector<T>& v_attributes,
+        const std::string&    name,
+        layoutT               layout = SoA)
+    {
+        if (v_attributes.empty()) {
+            RXMESH_ERROR(
+                "RXMeshStatic::add_vertex_attribute() input attribute is "
+                "empty");
+        }
+
+        if (v_attributes.size() != get_num_vertices()) {
+            RXMESH_ERROR(
+                "RXMeshStatic::add_vertex_attribute() input attribute size "
+                "({}) is not the same as number of vertices in the input mesh "
+                "({})",
+                v_attributes.size(),
+                get_num_vertices());
+        }
+
+        uint32_t num_attributes = 1;
+
+        auto ret = m_attr_container->template add<VertexAttribute<T>>(
+            name.c_str(),
+            this->m_h_num_owned_v,
+            num_attributes,
+            LOCATION_ALL,
+            layout);
+
+        // populate the attribute before returning it
+        const int num_patches = this->get_num_patches();
+#pragma omp parallel for
+        for (int p = 0; p < num_patches; ++p) {
+            for (uint16_t v = 0; v < this->m_h_num_owned_v[p]; ++v) {
+
+                const VertexHandle v_handle(static_cast<uint32_t>(p), v);
+
+                uint32_t global_v = m_h_patches_ltog_v[p][v];
+
+                (*ret)(v_handle, 0) = v_attributes[global_v];
+            }
+        }
+
+        // move to device
+        ret->move(rxmesh::HOST, rxmesh::DEVICE);
+        return ret;
+    }
+
+    /**
+     * @brief Checks if an attribute exists given its name
+     * @param name the attribute name
+     * @return True if the attribute exists. False otherwise.
+     */
+    bool does_attribute_exist(const std::string& name)
+    {
+        return m_attr_container->does_exist(name.c_str());
+    }
+
+    /**
+     * @brief Remove an attribute. Could be vertex, edge, or face attribute
+     * @param name the attribute name
+     */
+    void remove_attribute(const std::string& name)
+    {
+        if (!this->does_attribute_exist(name)) {
+            RXMESH_WARN(
+                "RXMeshStatic::remove_attribute() trying to remove an "
+                "attribute that does not exit with name {}",
+                name);
+            return;
+        }
+
+        m_attr_container->remove(name.c_str());
+    }
+
+
+    /**
+     * @brief Map a vertex handle into a global index as seen in the input
+     * to RXMeshStatic
+     * @param vh input vertex handle
+     * @return the global index of vh
+     */
+    uint32_t map_to_global(const VertexHandle vh) const
+    {
+        auto pl = vh.unpack();
+        return m_h_patches_ltog_v[pl.first][pl.second];
+    }
+
+    /**
+     * @brief Map an edge handle into a global index
+     * @param eh input edge handle
+     * @return the global index of eh
+     */
+    uint32_t map_to_global(const EdgeHandle eh) const
+    {
+        auto pl = eh.unpack();
+        return m_h_patches_ltog_e[pl.first][pl.second];
+    }
+
+    /**
+     * @brief Map a face handle into a global index as seen in the input
+     * to RXMeshStatic
+     * @param fh input face handle
+     * @return the global index of fh
+     */
+    uint32_t map_to_global(const FaceHandle fh) const
+    {
+        auto pl = fh.unpack();
+        return m_h_patches_ltog_f[pl.first][pl.second];
+    }
+
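For example, a host-side sketch reusing the assumed rx instance:

// print each vertex's global index as it appeared in the input mesh
rx.for_each_vertex(rxmesh::HOST, [&](const rxmesh::VertexHandle vh) {
    printf("local handle maps to global vertex %u\n", rx.map_to_global(vh));
});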
+    /**
+     * @brief Export the mesh to obj file
+     * @tparam T type of vertices coordinates
+     * @param filename the output file
+     * @param coords vertices coordinates
+     */
+    template <typename T>
+    void export_obj(const std::string&        filename,
+                    const VertexAttribute<T>& coords)
+    {
+        std::string  fn = filename;
+        std::fstream file(fn, std::ios::out);
+        file.precision(30);
+
+        uint32_t num_v = 0;
+        for (uint32_t p = 0; p < this->m_num_patches; ++p) {
+
+            const uint32_t p_num_vertices =
+                this->m_h_patches_info[p].num_vertices;
+
+            for (uint16_t v = 0; v < p_num_vertices; ++v) {
+                uint16_t v_id = v;
+                uint32_t p_id = p;
+                if (v >= this->m_h_patches_info[p].num_owned_vertices) {
+                    uint16_t l =
+                        v - this->m_h_patches_info[p].num_owned_vertices;
+                    v_id = this->m_h_patches_info[p].not_owned_id_v[l].id;
+                    p_id = this->m_h_patches_info[p].not_owned_patch_v[l];
+                }
+                VertexHandle vh(p_id, {v_id});
+                file << "v " << coords(vh, 0) << " " << coords(vh, 1) << " "
+                     << coords(vh, 2) << std::endl;
+            }
+
+            const uint32_t p_num_faces =
+                this->m_h_patches_info[p].num_owned_faces;
+
+            for (uint32_t f = 0; f < p_num_faces; ++f) {
+
+                file << "f ";
+                for (uint32_t e = 0; e < 3; ++e) {
+                    uint16_t edge = this->m_h_patches_info[p].fe[3 * f + e].id;
+                    flag_t   dir(0);
+                    Context::unpack_edge_dir(edge, edge, dir);
+                    uint16_t e_id = (2 * edge) + dir;
+                    uint16_t v    = this->m_h_patches_info[p].ev[e_id].id;
+                    file << v + num_v + 1 << " ";
+                }
+                file << std::endl;
+            }
+
+            num_v += p_num_vertices;
+        }
     }
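A sketch of exporting after a device-side computation, assuming coords is the vertex-coordinates attribute created earlier:

// bring the (possibly updated) coordinates back to the host, then write the obj
coords->move(rxmesh::DEVICE, rxmesh::HOST);
rx.export_obj("output_mesh.obj", *coords);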
 
    protected:
     template <uint32_t blockThreads>
     void calc_shared_memory(const Op                 op,
                             LaunchBox<blockThreads>& launch_box,
-                            const bool               is_higher_query,
+                            const void*              kernel,
                             const bool               oriented = false) const
     {
         // Operations that uses matrix transpose needs a template parameter
@@ -75,7 +582,8 @@ class RXMeshStatic : public RXMesh<patchSize>
                     "RXMeshStatic::calc_shared_memory() "
                     "TRANSPOSE_ITEM_PER_THREAD = {} needs "
                     "to be increased for op = {}",
-                    TRANSPOSE_ITEM_PER_THREAD, op_to_string(op));
+                    TRANSPOSE_ITEM_PER_THREAD,
+                    op_to_string(op));
             }
         } else if (op == Op::VE || op == Op::EF || op == Op::FF) {
             if (3 * this->m_max_faces_per_patch >
@@ -84,7 +592,8 @@ class RXMeshStatic : public RXMesh<patchSize>
                     "RXMeshStatic::calc_shared_memory() "
                     "TRANSPOSE_ITEM_PER_THREAD = {} needs "
                     "to be increased for op = {}",
-                    TRANSPOSE_ITEM_PER_THREAD, op_to_string(op));
+                    TRANSPOSE_ITEM_PER_THREAD,
+                    op_to_string(op));
             }
         }
 
@@ -105,40 +614,92 @@ class RXMeshStatic : public RXMesh<patchSize>
         launch_box.smem_bytes_dyn = 0;
 
         if (op == Op::FE) {
-            // only faces will be loaded and no extra shared memory is needed
+            // only FE will be loaded
             launch_box.smem_bytes_dyn =
                 3 * this->m_max_faces_per_patch * sizeof(uint16_t);
+            // to load not-owned edges local and patch id
+            launch_box.smem_bytes_dyn +=
+                this->m_max_not_owned_edges *
+                    (sizeof(uint16_t) + sizeof(uint32_t)) +
+                sizeof(uint16_t);
         } else if (op == Op::EV) {
-            // only edges will be loaded and no extra shared memory is needed
+            // only EV will be loaded
             launch_box.smem_bytes_dyn =
                 2 * this->m_max_edges_per_patch * sizeof(uint16_t);
+            // to load not-owned vertices local and patch id
+            launch_box.smem_bytes_dyn += this->m_max_not_owned_vertices *
+                                         (sizeof(uint16_t) + sizeof(uint32_t));
         } else if (op == Op::FV) {
-            // We load both faces and edges. We don't change edges.
-            // faces are updated to contain FV instead of FE by reading from
-            // edges
+            // We load both FE and EV. We don't change EV.
+            // FE are updated to contain FV instead of FE by reading from
+            // EV
             launch_box.smem_bytes_dyn =
                 3 * this->m_max_faces_per_patch * sizeof(uint16_t) +
                 2 * this->m_max_edges_per_patch * sizeof(uint16_t);
+            // no need for extra memory to load not-owned vertices local and
+            // patch id. We load them and overwrite EV.
+            const uint32_t not_owned_v_bytes =
+                this->m_max_not_owned_vertices *
+                (sizeof(uint16_t) + sizeof(uint32_t));
+            const uint32_t edges_bytes =
+                2 * this->m_max_edges_per_patch * sizeof(uint16_t);
+            if (not_owned_v_bytes > edges_bytes) {
+                // launch_box.smem_bytes_dyn += not_owned_v_bytes - edges_bytes;
+                RXMESH_ERROR(
+                    "RXMeshStatic::calc_shared_memory() FV query might fail!");
+            }
         } else if (op == Op::VE) {
-            // load edges and then transpose it in place
+            // load EV and then transpose it in place
             // The transpose needs two buffer; one for prefix sum and another
             // for the actual output
-            // The prefix sum will be stored in place (where edges are loaded)
+            // The prefix sum will be stored in place (where EV are loaded)
             // The output will be stored in another buffer with size equal to
-            // the edges since this output buffer will stored the nnz and the
-            // nnz of a matrix the same before/after transpose
+            // the EV (i.e., 2*#edges) since this output buffer will store the
+            // nnz, and the nnz of a matrix is the same before/after transpose
             launch_box.smem_bytes_dyn =
-                (2 * 2 * this->m_max_edges_per_patch) * sizeof(uint16_t);
-        } else if (op == Op::EF || op == Op::VF) {
-            // same as above but with faces
+                (2 * 2 * this->m_max_edges_per_patch) * sizeof(uint16_t) +
+                sizeof(uint16_t);
+
+            // to load the not-owned edges local and patch id
+            launch_box.smem_bytes_dyn += this->m_max_not_owned_edges *
+                                         (sizeof(uint16_t) + sizeof(uint32_t));
+        } else if (op == Op::EF) {
+            // same as Op::VE but with faces
             launch_box.smem_bytes_dyn =
                 (2 * 3 * this->m_max_faces_per_patch) * sizeof(uint16_t) +
+                sizeof(uint16_t) + sizeof(uint16_t);
+
+            // to load the not-owned faces local and patch id
+            launch_box.smem_bytes_dyn += this->m_max_not_owned_faces *
+                                         (sizeof(uint16_t) + sizeof(uint32_t));
+        } else if (op == Op::VF) {
+            // load EV and FE simultaneously. changes FE to FV using EV. Then
+            // transpose FV in place and use EV to store the values/output while
+            // using FV to store the prefix sum. Thus, the space used to store
+            // EV should be max(3*#faces, 2*#edges)
+            launch_box.smem_bytes_dyn =
+                3 * this->m_max_faces_per_patch * sizeof(uint16_t) +
+                std::max(3 * this->m_max_faces_per_patch,
+                         2 * this->m_max_edges_per_patch) *
+                    sizeof(uint16_t) +
                 sizeof(uint16_t);
+
+            // to load the not-owned faces local and patch id
+            launch_box.smem_bytes_dyn += this->m_max_not_owned_faces *
+                                         (sizeof(uint16_t) + sizeof(uint32_t));
         } else if (op == Op::VV) {
-            // similar to VE but we also need to store the edges (EV) even after
-            // we do the transpose.
+            // similar to VE but we also need to store the EV even after
+            // we do the transpose
             launch_box.smem_bytes_dyn =
                 (3 * 2 * this->m_max_edges_per_patch) * sizeof(uint16_t);
+            // no need for extra memory to load not-owned local and patch id.
+            // We load them and overwrite the extra EV
+            if (this->m_max_not_owned_vertices *
+                    (sizeof(uint16_t) + sizeof(uint32_t)) >
+                (2 * this->m_max_edges_per_patch) * sizeof(uint16_t)) {
+                RXMESH_ERROR(
+                    "RXMeshStatic::calc_shared_memory() VV query might fail!");
+            }
         } else if (op == Op::FF) {
             // FF needs to store FE and EF along side with the output itself
             // FE needs 3*max_num_faces
@@ -153,6 +714,8 @@ class RXMeshStatic : public RXMesh<patchSize>
                  4 * this->m_max_faces_per_patch          // FF
                  ) *
                 sizeof(uint16_t);
+            // no need for extra memory to load not-owned faces local and
+            // patch id. We load them and overwrite FE.
         }
 
         if (op == Op::VV && oriented) {
@@ -162,160 +725,56 @@ class RXMeshStatic : public RXMesh<patchSize>
             // Since oriented is only done on manifold, EF needs only
             // 2*max_num_edges since every edge is neighbor to maximum of two
             // faces (which we write on the same place as the extra EV)
-            launch_box.smem_bytes_dyn += (/*2 * this->m_max_edges_per_patch +*/
-                                          3 * this->m_max_faces_per_patch) *
-                                         sizeof(uint16_t);
-        }
-
-        // to store output ltog map without the need to overlap it with
-        // where we store mesh edges/faces
-        // The +1 is for padding
-        if (op == Op::EV || op == Op::FV /*|| op == Op::VV*/) {
-            // For VV, we overwrite the extra storage we used above
-            // to store the mapping which is more than enough to store the
-            // vertices ltog
             launch_box.smem_bytes_dyn +=
-                (this->m_max_vertices_per_patch + 1) * sizeof(uint32_t);
-
-        } else if (op == Op::FE || op == Op::VE || op == Op::EE) {
-            launch_box.smem_bytes_dyn +=
-                (this->m_max_edges_per_patch + 1) * sizeof(uint32_t);
-        } else if (op == Op::VF || op == Op::EF /*|| op == Op::FF*/) {
-            launch_box.smem_bytes_dyn +=
-                (this->m_max_faces_per_patch + 1) * sizeof(uint32_t);
+                (3 * this->m_max_faces_per_patch) * sizeof(uint16_t);
         }
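To make the allocation above concrete, a small compile-time sketch with hypothetical patch maxima (2048 edges and 256 not-owned vertices per patch) shows what the Op::EV branch requests:

#include <cstddef>
#include <cstdint>

// illustrative arithmetic only; the maxima below are hypothetical
constexpr std::size_t max_edges           = 2048;
constexpr std::size_t max_not_owned_verts = 256;

constexpr std::size_t ev_bytes = 2 * max_edges * sizeof(uint16_t);  // 8192
constexpr std::size_t not_owned_bytes =
    max_not_owned_verts * (sizeof(uint16_t) + sizeof(uint32_t));  // 1536

static_assert(ev_bytes + not_owned_bytes == 9728,
              "Op::EV needs ~9.5 KB of dynamic shared memory per block here");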
 
 
-        launch_box.smem_bytes_static = check_shared_memory<blockThreads>(
-            op, launch_box.smem_bytes_dyn, is_higher_query);
+        check_shared_memory<blockThreads>(op,
+                                          launch_box.smem_bytes_dyn,
+                                          launch_box.smem_bytes_static,
+                                          launch_box.num_registers_per_thread,
+                                          kernel);
 
 
         if (!this->m_quite) {
             RXMESH_TRACE(
-                "RXMesh::calc_shared_memory() launching {} blocks with {} "
-                "threads on the device",
-                launch_box.blocks, blockThreads);
+                "RXMeshStatic::calc_shared_memory() launching {} blocks with "
+                "{} threads on the device",
+                launch_box.blocks,
+                blockThreads);
         }
     }
 
     template <uint32_t threads>
-    uint32_t check_shared_memory(const Op       op,
-                                 const uint32_t smem_bytes_dyn,
-                                 bool           is_higher_query) const
+    void check_shared_memory(const Op       op,
+                             const uint32_t smem_bytes_dyn,
+                             size_t&        smem_bytes_static,
+                             uint32_t&      num_reg_per_thread,
+                             const void*    kernel) const
     {
         // check if total shared memory (static + dynamic) consumed by
         // k_base_query are less than the max shared per block
-        cudaFuncAttributes func_attr;
-        switch (op) {
-            case Op::VV: {
-                if (is_higher_query) {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr,
-                        detail::higher_query_prototype<Op::VV, threads>));
-                } else {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr, detail::query_prototype<Op::VV, threads>));
-                }
-
-                break;
-            }
-            case Op::VE: {
-                if (is_higher_query) {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr,
-                        detail::higher_query_prototype<Op::VE, threads>));
-                } else {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr, detail::query_prototype<Op::VE, threads>));
-                }
-                break;
-            }
-            case Op::VF: {
-                if (is_higher_query) {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr,
-                        detail::higher_query_prototype<Op::VF, threads>));
-                } else {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr, detail::query_prototype<Op::VF, threads>));
-                }
-                break;
-            }
-            case Op::EV: {
-                if (is_higher_query) {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr,
-                        detail::higher_query_prototype<Op::EV, threads>));
-                } else {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr, detail::query_prototype<Op::EV, threads>));
-                }
-                break;
-            }
-            case Op::EE: {
-                break;
-            }
-            case Op::EF: {
-                if (is_higher_query) {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr,
-                        detail::higher_query_prototype<Op::EF, threads>));
-                } else {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr, detail::query_prototype<Op::EF, threads>));
-                }
-                break;
-            }
-            case Op::FV: {
-                if (is_higher_query) {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr,
-                        detail::higher_query_prototype<Op::FV, threads>));
-                } else {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr, detail::query_prototype<Op::FV, threads>));
-                }
-                break;
-            }
-            case Op::FE: {
-                if (is_higher_query) {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr,
-                        detail::higher_query_prototype<Op::FE, threads>));
-                } else {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr, detail::query_prototype<Op::FE, threads>));
-                }
-                break;
-            }
-            case Op::FF: {
-                if (is_higher_query) {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr,
-                        detail::higher_query_prototype<Op::FF, threads>));
-                } else {
-                    CUDA_ERROR(cudaFuncGetAttributes(
-                        &func_attr, detail::query_prototype<Op::FF, threads>));
-                }
-                break;
-            }
-        }
+        cudaFuncAttributes func_attr = cudaFuncAttributes();
+        CUDA_ERROR(cudaFuncGetAttributes(&func_attr, kernel));
 
-        uint32_t smem_bytes_static = func_attr.sharedSizeBytes;
-        uint32_t num_regs = func_attr.numRegs;
-        int      device_id;
+        smem_bytes_static  = func_attr.sharedSizeBytes;
+        num_reg_per_thread = static_cast<uint32_t>(func_attr.numRegs);
+        int device_id;
         CUDA_ERROR(cudaGetDevice(&device_id));
         cudaDeviceProp devProp;
         CUDA_ERROR(cudaGetDeviceProperties(&devProp, device_id));
 
         if (!this->m_quite) {
             RXMESH_TRACE(
-                "RXMeshStatic::check_shared_memory() query_prototype with "
-                "{} "
-                "required shared memory = {} (dynamic) +  {} (static) = {} "
-                "(bytes) and {} registers",
-                op_to_string(op), smem_bytes_dyn, smem_bytes_static,
-                smem_bytes_dyn + smem_bytes_static, num_regs);
+                "RXMeshStatic::check_shared_memory() user function with {} "
+                "requires shared memory = {} (dynamic) + {} (static) = {} "
+                "(bytes) and {} registers per thread",
+                op_to_string(op),
+                smem_bytes_dyn,
+                smem_bytes_static,
+                smem_bytes_dyn + smem_bytes_static,
+                num_reg_per_thread);
 
             RXMESH_TRACE(
                 "RXMeshStatic::check_shared_memory() available total shared "
@@ -327,12 +786,15 @@ class RXMeshStatic : public RXMesh<patchSize>
         if (smem_bytes_static + smem_bytes_dyn > devProp.sharedMemPerBlock) {
             RXMESH_ERROR(
                 " RXMeshStatic::check_shared_memory() shared memory needed for"
-                " query_prototype ({} bytes) exceeds the max shared memory "
+                " input function ({} bytes) exceeds the max shared memory "
                 "per block on the current device ({} bytes)",
-                smem_bytes_static + smem_bytes_dyn, devProp.sharedMemPerBlock);
+                smem_bytes_static + smem_bytes_dyn,
+                devProp.sharedMemPerBlock);
             exit(EXIT_FAILURE);
         }
-        return static_cast<uint32_t>(smem_bytes_static);
     }
+
+
+    std::shared_ptr<AttributeContainer> m_attr_container;
 };
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/rxmesh_util.h b/include/rxmesh/rxmesh_util.h
deleted file mode 100644
index 6a211f79..00000000
--- a/include/rxmesh/rxmesh_util.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#pragma once
-
-#include "rxmesh/rxmesh.h"
-
-namespace RXMESH {
-
-/**
- * io_elements()
- */
-void __device__ __host__ __inline__ io_elements(const Op& op,
-                                                ELEMENT&  source_ele,
-                                                ELEMENT&  output_ele)
-{
-    if (op == Op::VV || op == Op::VE || op == Op::VF) {
-        source_ele = ELEMENT::VERTEX;
-    } else if (op == Op::EV || op == Op::EE || op == Op::EF) {
-        source_ele = ELEMENT::EDGE;
-    } else if (op == Op::FV || op == Op::FE || op == Op::FF) {
-        source_ele = ELEMENT::FACE;
-    }
-    if (op == Op::VV || op == Op::EV || op == Op::FV) {
-        output_ele = ELEMENT::VERTEX;
-    } else if (op == Op::VE || op == Op::EE || op == Op::FE) {
-        output_ele = ELEMENT::EDGE;
-    } else if (op == Op::VF || op == Op::EF || op == Op::FF) {
-        output_ele = ELEMENT::FACE;
-    }
-}
-}  // namespace RXMESH
\ No newline at end of file
diff --git a/include/rxmesh/types.h b/include/rxmesh/types.h
new file mode 100644
index 00000000..a4a84987
--- /dev/null
+++ b/include/rxmesh/types.h
@@ -0,0 +1,125 @@
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "rxmesh/util/macros.h"
+
+namespace rxmesh {
+
+/**
+ * @brief Flags for where data resides. Used with Attributes
+ */
+using locationT = uint32_t;
+enum : locationT
+{
+    LOCATION_NONE = 0x00,
+    HOST          = 0x01,
+    DEVICE        = 0x02,
+    LOCATION_ALL  = 0x0F,
+};
+
+/**
+ * @brief convert locationT to string
+ */
+static std::string location_to_string(const locationT location)
+{
+    switch (location) {
+        case LOCATION_NONE:
+            return "NONE";
+        case HOST:
+            return "HOST";
+        case DEVICE:
+            return "DEVICE";
+        case LOCATION_ALL:
+            return "ALL";
+        default: {
+            RXMESH_ERROR("to_string() unknown location");
+            return "";
+        }
+    }
+}
+
+/**
+ * @brief Memory layout
+ */
+using layoutT = uint32_t;
+enum : layoutT
+{
+    AoS = 0x00,
+    SoA = 0x01,
+};
+/**
+ * @brief convert locationT to string
+ */
+static std::string layout_to_string(const layoutT layout)
+{
+    switch (layout) {
+        case AoS:
+            return "AoS";
+        case SoA:
+            return "SoA";
+        default: {
+            RXMESH_ERROR("to_string() unknown layout");
+            return "";
+        }
+    }
+}
+
+/**
+ * @brief ELEMENT represents the three types of mesh elements
+ */
+enum class ELEMENT
+{
+    VERTEX = 0,
+    EDGE   = 1,
+    FACE   = 2
+};
+
+/**
+ * @brief Various query operations supported in RXMesh
+ */
+enum class Op
+{
+    VV = 0,
+    VE = 1,
+    VF = 2,
+    FV = 3,
+    FE = 4,
+    FF = 5,
+    EV = 6,
+    EE = 7,
+    EF = 8,
+};
+
+/**
+ * @brief Convert an operation to string
+ * @param op a query operation
+ * @return name of the query operation as a string
+ */
+static std::string op_to_string(const Op& op)
+{
+    switch (op) {
+        case Op::VV:
+            return "VV";
+        case Op::VE:
+            return "VE";
+        case Op::VF:
+            return "VF";
+        case Op::FV:
+            return "FV";
+        case Op::FE:
+            return "FE";
+        case Op::FF:
+            return "FF";
+        case Op::EV:
+            return "EV";
+        case Op::EF:
+            return "EF";
+        case Op::EE:
+            return "EE";
+        default: {
+            RXMESH_ERROR("to_string() unknown input operation");
+            return "";
+        }
+    }
+}
+}  // namespace rxmesh
\ No newline at end of file
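A small sketch of how these types compose (the location values are bit flags, so they combine with bitwise OR and are tested with bitwise AND):

#include <string>
#include "rxmesh/types.h"

void types_example()
{
    using namespace rxmesh;

    locationT where = HOST | DEVICE;               // allocate/use on both
    bool on_host    = (where & HOST) == HOST;      // true
    bool on_device  = (where & DEVICE) == DEVICE;  // true

    std::string loc = location_to_string(DEVICE);  // "DEVICE"
    std::string op  = op_to_string(Op::VV);        // "VV"
}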
diff --git a/include/rxmesh/util/cuda_query.h b/include/rxmesh/util/cuda_query.h
index 6290f876..d6df770c 100644
--- a/include/rxmesh/util/cuda_query.h
+++ b/include/rxmesh/util/cuda_query.h
@@ -4,7 +4,7 @@
 #include "rxmesh/util/log.h"
 #include "rxmesh/util/macros.h"
 
-namespace RXMESH {
+namespace rxmesh {
 inline int convert_SMV_to_cores(int major, int minor)
 {
     // Taken from Nvidia helper_cuda.h to get the number of SM and cuda cores
@@ -29,7 +29,11 @@ inline int convert_SMV_to_cores(int major, int minor)
         {0x61, 128},  // Pascal Generation (SM 6.1) GP10x class
         {0x62, 128},  // Pascal Generation (SM 6.2) GP10x class
         {0x70, 64},   // Volta Generation (SM 7.0) GV100 class
-        {0x72, 64},  {0x75, 64}, {0x80, 64}, {0x86, 128}, {-1, -1}};
+        {0x72, 64},
+        {0x75, 64},
+        {0x80, 64},
+        {0x86, 128},
+        {-1, -1}};
 
     int index = 0;
 
@@ -44,7 +48,9 @@ inline int convert_SMV_to_cores(int major, int minor)
     // properly
     printf(
         "MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n",
-        major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+        major,
+        minor,
+        nGpuArchCoresPerSM[index - 1].Cores);
     return nGpuArchCoresPerSM[index - 1].Cores;
 }
 
@@ -61,37 +67,48 @@ cudaDeviceProp cuda_query(const int dev, bool quite = false)
             " a CUDA-supported GPU!!!");
     }
 
-    cudaSetDevice(dev);
-    cudaDeviceProp devProp;
+    CUDA_ERROR(cudaSetDevice(dev));
+    cudaDeviceProp dev_prop;
 
-    CUDA_ERROR(cudaGetDeviceProperties(&devProp, dev));
+    CUDA_ERROR(cudaGetDeviceProperties(&dev_prop, dev));
 
     if (!quite) {
 
         RXMESH_TRACE("Total number of device: {}", deviceCount);
         RXMESH_TRACE("Using device Number: {}", dev);
-        RXMESH_TRACE("Device name: {}", devProp.name);
-        RXMESH_TRACE("Compute Capability: {}.{}", (int)devProp.major,
-                     (int)devProp.minor);
+
+        RXMESH_TRACE("Device name: {}", dev_prop.name);
+        RXMESH_TRACE("Compute Capability: {}.{}",
+                     (int)dev_prop.major,
+                     (int)dev_prop.minor);
         RXMESH_TRACE("Total amount of global memory (MB): {0:.1f}",
-                     (float)devProp.totalGlobalMem / 1048576.0f);
+                     (float)dev_prop.totalGlobalMem / 1048576.0f);
         RXMESH_TRACE("{} Multiprocessors, {} CUDA Cores/MP: {} CUDA Cores",
-                     devProp.multiProcessorCount,
-                     convert_SMV_to_cores(devProp.major, devProp.minor),
-                     convert_SMV_to_cores(devProp.major, devProp.minor) *
-                         devProp.multiProcessorCount);
+                     dev_prop.multiProcessorCount,
+                     convert_SMV_to_cores(dev_prop.major, dev_prop.minor),
+                     convert_SMV_to_cores(dev_prop.major, dev_prop.minor) *
+                         dev_prop.multiProcessorCount);
+        RXMESH_TRACE("ECC support: {}",
+                     (dev_prop.ECCEnabled ? "Enabled" : "Disabled"));
         RXMESH_TRACE("GPU Max Clock rate: {0:.1f} MHz ({1:.2f} GHz)",
-                     devProp.clockRate * 1e-3f, devProp.clockRate * 1e-6f);
+                     dev_prop.clockRate * 1e-3f,
+                     dev_prop.clockRate * 1e-6f);
         RXMESH_TRACE("Memory Clock rate: {0:.1f} Mhz",
-                     devProp.memoryClockRate * 1e-3f);
-        RXMESH_TRACE("Memory Bus Width:  {}-bit", devProp.memoryBusWidth);
-        const double maxBW = 2.0 * devProp.memoryClockRate *
-                             (devProp.memoryBusWidth / 8.0) / 1.0E6;
+                     dev_prop.memoryClockRate * 1e-3f);
+        RXMESH_TRACE("Memory Bus Width:  {}-bit", dev_prop.memoryBusWidth);
+        const double maxBW = 2.0 * dev_prop.memoryClockRate *
+                             (dev_prop.memoryBusWidth / 8.0) / 1.0E6;
         RXMESH_TRACE("Peak Memory Bandwidth: {0:f}(GB/s)", maxBW);
         RXMESH_TRACE("Kernels compiled for compute capability: {}",
                      cuda_arch());
     }
 
-    return devProp;
+    if (!dev_prop.managedMemory) {
+        RXMESH_ERROR(
+            "The selected device does not support CUDA unified memory");
+        exit(EXIT_FAILURE);
+    }
+
+    return dev_prop;
 }
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace rxmesh
\ No newline at end of file
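Typical usage is unchanged (a sketch; device 0 is assumed), except that the call now also aborts early if the GPU lacks unified-memory support:

cudaDeviceProp prop = rxmesh::cuda_query(0);
RXMESH_TRACE("Running on {} with {} SMs", prop.name, prop.multiProcessorCount);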
diff --git a/include/rxmesh/util/export_tools.h b/include/rxmesh/util/export_tools.h
index 43d66334..7f829c27 100644
--- a/include/rxmesh/util/export_tools.h
+++ b/include/rxmesh/util/export_tools.h
@@ -88,7 +88,7 @@ void export_as_cubes_VTK(std::string    filename,
                          CubeY          funY,
                          CubeZ          funZ,
                          const uint32_t num_att,
-                         bool           randomize = 1,
+                         bool           randomize  = 1,
                          float*         randomness = (float*)nullptr)
 {
 
@@ -281,7 +281,7 @@ void export_attribute_VTK(
         if (rand_map_it != rand_map.end()) {
             return rand_map[att[id]];
         } else {
-            double val = double(rand()) / double(RAND_MAX);
+            double val        = double(rand()) / double(RAND_MAX);
             rand_map[att[id]] = val;
             return val;
         }
diff --git a/include/rxmesh/util/import_obj.h b/include/rxmesh/util/import_obj.h
index 47cff4a0..b4601a29 100644
--- a/include/rxmesh/util/import_obj.h
+++ b/include/rxmesh/util/import_obj.h
@@ -2,58 +2,58 @@
 
 #include <string>
 #include <vector>
-
 #include "rxmesh/util/log.h"
 
-#ifndef MAX_LINE_LENGTH
-#define MAX_LINE_LENGTH 2048
-#endif
-
-
-// Read and input mesh from obj file format
-// Input: path to the obj file
-// Output: Verts = 3d vertices (Num vertices X 3)
-//        Faces = faces index to the Vert array (Num facex X 3)
-//        Tex = Tex coordinates (Num texture coordinates X 2)
-//        Faces = faces index to the Tex array (Num facex X 3)
-//        Normals = faces index to the Tex array (Num normals X 3)
-//        Faces = faces index to the Normals array (Num facex X 3)
-
-template <typename DATA_T, typename INDEX_T>
-bool import_obj(const std::string                  fileName,
-                std::vector<std::vector<DATA_T>>&  Verts,
-                std::vector<std::vector<INDEX_T>>& Faces,
-                std::vector<std::vector<DATA_T>>&  Tex,
-                std::vector<std::vector<INDEX_T>>& FacesTex,
-                std::vector<std::vector<DATA_T>>&  Normal,
-                std::vector<std::vector<INDEX_T>>& FacesNormal,
-                bool                               quite = false)
+/**
+ * @brief Read an input mesh from obj file format
+ * @tparam DataT coordinates type (float/double)
+ * @tparam IndexT indices type
+ * @param file_name path to the obj file
+ * @param vertices 3d vertices (3*#vertices)
+ * @param faces face indices into the vertices array (3*#faces)
+ * @param tex texture coordinates (2*#tex)
+ * @param face_tex face indices into the tex array (3*#faces)
+ * @param normals face normal coordinates (3*#normals)
+ * @param face_normal face indices into the normals array (3*#faces)
+ * @param quite run in quiet mode (suppress logging)
+ * @return true if reading the file is successful
+ */
+template <typename DataT, typename IndexT>
+bool import_obj(const std::string                 file_name,
+                std::vector<std::vector<DataT>>&  vertices,
+                std::vector<std::vector<IndexT>>& faces,
+                std::vector<std::vector<DataT>>&  tex,
+                std::vector<std::vector<IndexT>>& face_tex,
+                std::vector<std::vector<DataT>>&  normals,
+                std::vector<std::vector<IndexT>>& face_normal,
+                bool                              quite = false)
 {
 
-    FILE* Objfile = fopen(fileName.c_str(), "r");
+    FILE* Objfile = fopen(file_name.c_str(), "r");
     if (NULL == Objfile) {
-        RXMESH_ERROR("importOBJ() can not open {}", fileName);
+        RXMESH_ERROR("importOBJ() can not open {}", file_name);
         return false;
     } else {
         if (!quite) {
-            RXMESH_TRACE("Reading {}", fileName);
+            RXMESH_TRACE("Reading {}", file_name);
         }
     }
 
 
     // make sure everything is clean
-    Verts.clear();
-    Faces.clear();
-    Tex.clear();
-    FacesTex.clear();
-    Normal.clear();
-    FacesNormal.clear();
+    vertices.clear();
+    faces.clear();
+    tex.clear();
+    face_tex.clear();
+    normals.clear();
+    face_normal.clear();
 
-    char     line[MAX_LINE_LENGTH];
-    uint32_t lineNum = 1;
-    while (fgets(line, MAX_LINE_LENGTH, Objfile) != NULL) {
+    constexpr uint32_t max_line_length = 2048;
+    char               line[max_line_length];
+    uint32_t           lineNum = 1;
+    while (fgets(line, max_line_length, Objfile) != NULL) {
 
-        char type[MAX_LINE_LENGTH];
+        char type[max_line_length];
 
         if (sscanf(line, "%s", type) == 1) {
             // read only the first letter of the line
@@ -61,9 +61,9 @@ bool import_obj(const std::string                  fileName,
             char* l = &line[strlen(type)];  // next thing after the type
             if (strcmp(type, "v") == 0) {
                 // vertex
-                std::istringstream  ls(&line[1]);
-                std::vector<DATA_T> vert{std::istream_iterator<DATA_T>(ls),
-                                         std::istream_iterator<DATA_T>()};
+                std::istringstream ls(&line[1]);
+                std::vector<DataT> vert{std::istream_iterator<DataT>(ls),
+                                        std::istream_iterator<DataT>()};
                 if (vert.size() < 3) {
                     // vertex has less than coordinates
                     RXMESH_ERROR(
@@ -73,28 +73,28 @@ bool import_obj(const std::string                  fileName,
                     fclose(Objfile);
                     return false;
                 }
-                Verts.push_back(vert);
+                vertices.push_back(vert);
             } else if (strcmp(type, "vn") == 0) {
                 // normal
-                DATA_T   x[3];
+                DataT    x[3];
                 uint32_t count = sscanf(l, "%f %f %f\n", &x[0], &x[1], &x[2]);
 
                 if (count != 3) {
                     RXMESH_ERROR(
-                        "importOBJ() normal has less than 3 "
-                        "coordinates Line[{}]\n",
+                        "importOBJ() normal does not have 3 coordinates "
+                        "Line[{}]\n",
                         lineNum);
                     fclose(Objfile);
                     return false;
                 }
-                std::vector<DATA_T> normal_v(3);
+                std::vector<DataT> normal_v(3);
                 normal_v[0] = x[0];
                 normal_v[1] = x[1];
                 normal_v[2] = x[2];
-                Normal.push_back(normal_v);
+                normals.push_back(normal_v);
             } else if (strcmp(type, "vt") == 0) {
                 // texture
-                DATA_T   x[3];
+                DataT    x[3];
                 uint32_t count = sscanf(l, "%f %f %f\n", &x[0], &x[1], &x[2]);
 
                 if (count != 2 && count != 3) {
@@ -105,40 +105,40 @@ bool import_obj(const std::string                  fileName,
                     fclose(Objfile);
                     return false;
                 }
-                std::vector<DATA_T> tex(count);
+                std::vector<DataT> tx(count);
                 for (uint32_t i = 0; i < count; i++) {
-                    tex[i] = x[i];
+                    tx[i] = x[i];
                 }
-                Tex.push_back(tex);
+                tex.push_back(tx);
             } else if (strcmp(type, "f") == 0) {
                 // face (read vert id, norm id, tex id)
 
-                // const auto & shift = [&Verts](const int i)->int{return i<0 ?
-                // i+Verts.size():i-1;}; const auto & shift_t = [&Tex](const int
-                // i)->int{return i<0 ? i+Tex.size():i-1;}; const auto & shift_n
-                // = [&Normal](const int i)->int{return i<0 ?
-                // i+Normal.size():i-1;};
-
-                std::vector<INDEX_T> f;
-                std::vector<INDEX_T> ft;
-                std::vector<INDEX_T> fn;
-                char                 word[MAX_LINE_LENGTH];
-                uint32_t             offset;
+                // const auto & shift = [&vertices](const int i)->int{return i<0
+                // ? i+vertices.size():i-1;}; const auto & shift_t =
+                // [&Tex](const int i)->int{return i<0 ? i+Tex.size():i-1;};
+                // const auto & shift_n = [&normals ](const int i)->int{return
+                // i<0 ? i+normals .size():i-1;};
+
+                std::vector<IndexT> f;
+                std::vector<IndexT> ft;
+                std::vector<IndexT> fn;
+                char                word[max_line_length];
+                uint32_t            offset;
                 while (sscanf(l, "%s%n", word, &offset) == 1) {
                     l += offset;
                     long int i, it, in;
                     if (sscanf(word, "%ld/%ld/%ld", &i, &it, &in) == 3) {
                         // face, norm, tex
-                        f.push_back(i < 0 ? i + Verts.size() : i - 1);
-                        ft.push_back(i < 0 ? i + Tex.size() : i - 1);
-                        fn.push_back(i < 0 ? i + Normal.size() : i - 1);
+                        f.push_back(i < 0 ? i + vertices.size() : i - 1);
+                        ft.push_back(i < 0 ? i + tex.size() : i - 1);
+                        fn.push_back(i < 0 ? i + normals.size() : i - 1);
                     } else if (sscanf(word, "%ld/%ld", &i, &it) == 2) {
                         // face, tex
-                        f.push_back(i < 0 ? i + Verts.size() : i - 1);
-                        ft.push_back(i < 0 ? i + Tex.size() : i - 1);
+                        f.push_back(i < 0 ? i + vertices.size() : i - 1);
+                        ft.push_back(i < 0 ? i + tex.size() : i - 1);
                     } else if (sscanf(word, "%ld", &i) == 1) {
                         // face
-                        f.push_back(i < 0 ? i + Verts.size() : i - 1);
+                        f.push_back(i < 0 ? i + vertices.size() : i - 1);
                     } else {
                         RXMESH_ERROR(
                             "importOBJ() face has wrong format Line[{}]",
@@ -153,9 +153,9 @@ bool import_obj(const std::string                  fileName,
                     (f.size() > 0 && fn.size() == f.size() &&
                      ft.size() == f.size())) {
 
-                    Faces.push_back(f);
-                    FacesTex.push_back(ft);
-                    FacesNormal.push_back(fn);
+                    faces.push_back(f);
+                    face_tex.push_back(ft);
+                    face_normal.push_back(fn);
                 } else {
                     RXMESH_ERROR("importOBJ() face has wrong format Line[{}]",
                                  lineNum);
@@ -170,8 +170,8 @@ bool import_obj(const std::string                  fileName,
 
             } else {
                 // others
-                RXMESH_ERROR("importOBJ() invalid Line[{}] File[{}]\n", lineNum,
-                             line);
+                RXMESH_ERROR(
+                    "importOBJ() invalid Line[{}] File[{}]\n", lineNum, line);
                 fclose(Objfile);
                 return false;
             }
@@ -184,29 +184,38 @@ bool import_obj(const std::string                  fileName,
     fclose(Objfile);
 
     if (!quite) {
-        RXMESH_TRACE("import_obj() #Verts= {} ", Verts.size());
-        RXMESH_TRACE("import_obj() #Faces= {} ", Faces.size());
-        RXMESH_TRACE("import_obj() #Tex= {} ", Tex.size());
-        RXMESH_TRACE("import_obj() #FacesTex= {} ", FacesTex.size());
-        RXMESH_TRACE("import_obj() #Normal= {} ", Normal.size());
-        RXMESH_TRACE("import_obj() #FacesNormal= {} ", FacesNormal.size());
+        RXMESH_TRACE("import_obj() #vertices= {} ", vertices.size());
+        RXMESH_TRACE("import_obj() #faces= {} ", faces.size());
+        RXMESH_TRACE("import_obj() #tex= {} ", tex.size());
+        RXMESH_TRACE("import_obj() #face_tex= {} ", face_tex.size());
+        RXMESH_TRACE("import_obj() #normals = {} ", normals.size());
+        RXMESH_TRACE("import_obj() #face_normal= {} ", face_normal.size());
     }
     return true;
 }
 
-
-template <typename DATA_T, typename INDEX_T>
-bool import_obj(const std::string                  fileName,
-                std::vector<std::vector<DATA_T>>&  Verts,
-                std::vector<std::vector<INDEX_T>>& Faces,
-                bool                               quite = false)
+/**
+ * @brief Read an input mesh from obj file format
+ * @tparam DataT coordinates type (float/double)
+ * @tparam IndexT indices type
+ * @param file_name path to the obj file
+ * @param vertices 3d vertices (3*#vertices)
+ * @param faces face indices into the vertices array (3*#faces)
+ * @param quite run in quiet mode (suppress logging)
+ * @return true if reading the file is successful
+ */
+template <typename DataT, typename IndexT>
+bool import_obj(const std::string                 file_name,
+                std::vector<std::vector<DataT>>&  vertices,
+                std::vector<std::vector<IndexT>>& faces,
+                bool                              quite = false)
 {
 
-    std::vector<std::vector<DATA_T>>  Tex;
-    std::vector<std::vector<INDEX_T>> FacesTex;
-    std::vector<std::vector<DATA_T>>  Normal;
-    std::vector<std::vector<INDEX_T>> FacesNormal;
+    std::vector<std::vector<DataT>>  tex;
+    std::vector<std::vector<IndexT>> face_tex;
+    std::vector<std::vector<DataT>>  normals;
+    std::vector<std::vector<IndexT>> face_normal;
 
-    return import_obj(fileName, Verts, Faces, Tex, FacesTex, Normal,
-                      FacesNormal, quite);
+    return import_obj(
+        file_name, vertices, faces, tex, face_tex, normals, face_normal, quite);
 }
\ No newline at end of file
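A minimal usage sketch of the renamed import_obj() overload (illustrative, not part of the patch; the file path is a placeholder, and the logger is initialized first since the reader calls RXMESH_TRACE/RXMESH_ERROR):

#include <cstdint>
#include <string>
#include <vector>
#include "rxmesh/util/import_obj.h"
#include "rxmesh/util/log.h"

int main()
{
    rxmesh::Log::init();

    std::vector<std::vector<float>>    vertices;  // 3 coordinates per vertex
    std::vector<std::vector<uint32_t>> faces;     // 3 indices per triangle

    // DataT/IndexT are deduced as float/uint32_t from the containers
    if (!import_obj(std::string("input/sphere3.obj"), vertices, faces)) {
        return 1;
    }
    return 0;
}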
diff --git a/include/rxmesh/util/log.h b/include/rxmesh/util/log.h
index 9900558f..fd2b0b17 100644
--- a/include/rxmesh/util/log.h
+++ b/include/rxmesh/util/log.h
@@ -7,12 +7,12 @@
 #include "spdlog/spdlog.h"
 
 
-namespace RXMESH {
+namespace rxmesh {
 
 class Log
 {
    public:
-    static void init()
+    static void init(spdlog::level::level_enum level = spdlog::level::trace)
     {
         std::vector<spdlog::sink_ptr> sinks;
         sinks.emplace_back(
@@ -23,11 +23,11 @@ class Log
         sinks[0]->set_pattern("%^[%T] %n: %v%$");
         sinks[1]->set_pattern("[%T] [%l] %n: %v");
 
-        m_logger = std::make_shared<spdlog::logger>("RXMesh", begin(sinks),
-                                                    end(sinks));
+        m_logger = std::make_shared<spdlog::logger>(
+            "RXMesh", begin(sinks), end(sinks));
         spdlog::register_logger(m_logger);
-        m_logger->set_level(spdlog::level::trace);
-        m_logger->flush_on(spdlog::level::trace);
+        m_logger->set_level(level);
+        m_logger->flush_on(level);
     }
 
     inline static std::shared_ptr<spdlog::logger>& get_logger()
@@ -39,17 +39,17 @@ class Log
    private:
     inline static std::shared_ptr<spdlog::logger> m_logger;
 };
-}  // namespace RXMESH
+}  // namespace rxmesh
 
-#define RXMESH_TRACE(...) ::RXMESH::Log::get_logger()->trace(__VA_ARGS__)
-#define RXMESH_INFO(...) ::RXMESH::Log::get_logger()->info(__VA_ARGS__)
+#define RXMESH_TRACE(...) ::rxmesh::Log::get_logger()->trace(__VA_ARGS__)
+#define RXMESH_INFO(...) ::rxmesh::Log::get_logger()->info(__VA_ARGS__)
 #define RXMESH_WARN(...)                                                      \
-    ::RXMESH::Log::get_logger()->warn("Line {} File {}", __LINE__, __FILE__); \
-    ::RXMESH::Log::get_logger()->warn(__VA_ARGS__)
+    ::rxmesh::Log::get_logger()->warn("Line {} File {}", __LINE__, __FILE__); \
+    ::rxmesh::Log::get_logger()->warn(__VA_ARGS__)
 #define RXMESH_ERROR(...)                                                      \
-    ::RXMESH::Log::get_logger()->error("Line {} File {}", __LINE__, __FILE__); \
-    ::RXMESH::Log::get_logger()->error(__VA_ARGS__)
-#define RXMESH_CRITICAL(...)                                           \
-    ::RXMESH::Log::get_logger()->critical("Line {} File {}", __LINE__, \
-                                          __FILE__);                   \
-    ::RXMESH::Log::get_logger()->critical(__VA_ARGS__)
+    ::rxmesh::Log::get_logger()->error("Line {} File {}", __LINE__, __FILE__); \
+    ::rxmesh::Log::get_logger()->error(__VA_ARGS__)
+#define RXMESH_CRITICAL(...)                    \
+    ::rxmesh::Log::get_logger()->critical(      \
+        "Line {} File {}", __LINE__, __FILE__); \
+    ::rxmesh::Log::get_logger()->critical(__VA_ARGS__)
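With the new level parameter on Log::init(), call sites can raise the logging threshold. A short sketch of a hypothetical call site (not part of the patch):

#include "rxmesh/util/log.h"

int main()
{
    // only info and above is recorded and flushed; trace messages are filtered
    rxmesh::Log::init(spdlog::level::info);

    RXMESH_TRACE("this message is filtered out");
    RXMESH_INFO("RXMesh logging at info level");
    RXMESH_WARN("warnings also print the line and file before the message");
    return 0;
}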
diff --git a/include/rxmesh/util/macros.h b/include/rxmesh/util/macros.h
index 3534b06e..c5a225fc 100644
--- a/include/rxmesh/util/macros.h
+++ b/include/rxmesh/util/macros.h
@@ -4,10 +4,9 @@
 #include <stdint.h>
 #include "rxmesh/util/log.h"
 
-namespace RXMESH {
+namespace rxmesh {
 
-typedef uint8_t    flag_t;
-constexpr uint32_t PATCH_SIZE = 512;
+typedef uint8_t flag_t;
 
 // TRANSPOSE_ITEM_PER_THREAD
 constexpr uint32_t TRANSPOSE_ITEM_PER_THREAD = 11;
@@ -15,8 +14,8 @@ constexpr uint32_t TRANSPOSE_ITEM_PER_THREAD = 11;
 // used for integer rounding
 #define DIVIDE_UP(num, divisor) (num + divisor - 1) / (divisor)
 
-// assuming a 32-bit index
-#define FULL_MASK 0xffffffff
+// unsigned 64-bit
+#define INVALID64 0xFFFFFFFFFFFFFFFFu
 
 // unsigned 32-bit
 #define INVALID32 0xFFFFFFFFu
@@ -27,12 +26,6 @@ constexpr uint32_t TRANSPOSE_ITEM_PER_THREAD = 11;
 // unsigned 8-bit
 #define INVALID8 0xFFu
 
-// assuming a 32-bit index
-#define SPECIAL 0xFFFFFFFE
-
-// 32
-#define WARPSIZE 32u
-
 
 // http://www.decompile.com/cpp/faq/file_and_line_error_string.htm
 #define STRINGIFY(x) TOSTRING(x)
@@ -61,5 +54,20 @@ inline void HandleError(cudaError_t err, const char* file, int line)
         ptr = nullptr;             \
     }
 
+// Taken from https://stackoverflow.com/a/12779757/1608232
+#if defined(__CUDACC__)  // NVCC
+#define ALIGN(n) __align__(n)
+#elif defined(__GNUC__)  // GCC
+#define ALIGN(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)  // MSVC
+#define ALIGN(n) __declspec(align(n))
+#else
+#error "Please provide a definition for MY_ALIGN macro for your host compiler!"
+#endif
+
+
+// Taken from https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-traits
+#define IS_D_LAMBDA(X) __nv_is_extended_device_lambda_closure_type(X)
+#define IS_HD_LAMBDA(X) __nv_is_extended_host_device_lambda_closure_type(X)
 
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace rxmesh
\ No newline at end of file
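A short sketch of the newly added macros (the PatchSlot struct is made up for illustration): ALIGN() expands to the compiler-specific alignment attribute, and INVALID64/INVALID32 serve as sentinel values.

#include <cstdint>
#include "rxmesh/util/macros.h"

// hypothetical 16-byte aligned record built with the new macros
struct ALIGN(16) PatchSlot
{
    uint64_t global_id = INVALID64;
    uint32_t local_id  = INVALID32;
    uint32_t pad       = INVALID32;
};
static_assert(alignof(PatchSlot) == 16, "PatchSlot should be 16-byte aligned");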
diff --git a/include/rxmesh/util/math.h b/include/rxmesh/util/math.h
deleted file mode 100644
index 16eb2d7e..00000000
--- a/include/rxmesh/util/math.h
+++ /dev/null
@@ -1,201 +0,0 @@
-#pragma once
-#include <cuda_runtime.h>
-#include <stdint.h>
-#include <numeric>
-#include <vector>
-
-namespace RXMESH {
-// 180.0/PI (multiply this by the radian angle to convert to degree)
-constexpr float RadToDeg = 57.295779513078550;
-
-constexpr float PIf = 3.1415927f;
-
-
-/**
- * l2_norm()
- * TODO remove
- */
-template <typename T>
-__host__ __device__ __forceinline__ T l2_norm(const T ax0,
-                                              const T ax1,
-                                              const T ax2,
-                                              const T bx0,
-                                              const T bx1,
-                                              const T bx2)
-{
-    // compute sqrt((xa0-xb0)*(xa0-xb0) + (xa1-xb1)*(xa1-xb1) +
-    //(xa2-xb2)*(xa2-xb2))
-    return sqrt(l2_norm_sq(ax0, ax1, ax2, bx0, bx1, bx2));
-}
-
-
-/**
- * l2_norm_sq()
- * TODO remove
- */
-template <typename T>
-__host__ __device__ __forceinline__ T l2_norm_sq(const T ax0,
-                                                 const T ax1,
-                                                 const T ax2,
-                                                 const T bx0,
-                                                 const T bx1,
-                                                 const T bx2)
-{
-    // compute (xa0-xb0)*(xa0-xb0) + (xa1-xb1)*(xa1-xb1) + (xa2-xb2)*(xa2-xb2)
-    T x0 = ax0 - bx0;
-    T x1 = ax1 - bx1;
-    T x2 = ax2 - bx2;
-    return x0 * x0 + x1 * x1 + x2 * x2;
-}
-
-/**
- * vector_length()
- * TODO remove
- */
-__device__ __host__ __forceinline__ float vector_length(const float x,
-                                                        const float y,
-                                                        const float z)
-{
-    return sqrtf(x * x + y * y + z * z);
-}
-
-
-/**
- * vector_length()
- * TODO remove
- */
-__device__ __host__ __forceinline__ double vector_length(const double x,
-                                                         const double y,
-                                                         const double z)
-{
-    return sqrt(x * x + y * y + z * z);
-}
-
-/**
- * cross_product()
- * TODO remove
- */
-template <typename T>
-__host__ __device__ __forceinline__ void
-cross_product(T xv1, T yv1, T zv1, T xv2, T yv2, T zv2, T& xx, T& yy, T& zz)
-{
-    xx = yv1 * zv2 - zv1 * yv2;
-    yy = zv1 * xv2 - xv1 * zv2;
-    zz = xv1 * yv2 - yv1 * xv2;
-}
-
-/**
- * vector_normal()
- * TODO remove
- */
-template <typename T>
-__device__ __host__ __forceinline__ T vector_normal(const T& vector_x,
-                                                    const T& vector_y,
-                                                    const T& vector_z)
-{
-    return vector_length(vector_x, vector_y, vector_z);
-}
-
-/**
- * normalize_vector()
- * TODO remove
- */
-template <typename T>
-__device__ __host__ __forceinline__ void normalize_vector(T& vector_x,
-                                                          T& vector_y,
-                                                          T& vector_z)
-{
-    T nn = vector_normal(vector_x, vector_y, vector_z);
-    if (nn == 0) {
-        vector_x = vector_y = vector_z = 0;
-    } else {
-        nn = 1 / nn;
-        vector_x *= nn;
-        vector_y *= nn;
-        vector_z *= nn;
-    }
-}
-
-/**
- * round_up_multiple()
- */
-template <typename T>
-__host__ __device__ __forceinline__ T round_up_multiple(const T numToRound,
-                                                        const T multiple)
-{
-
-    // https://stackoverflow.com/a/3407254/1608232
-    // rounding numToRound to the closest number multiple of multiple
-    // this code meant only for +ve int. for -ve, check the reference above
-    if (multiple == 0) {
-        return numToRound;
-    }
-
-    const T remainder = numToRound % multiple;
-    if (remainder == 0) {
-        return numToRound;
-    }
-    return numToRound + multiple - remainder;
-}
-
-/**
- * round_to_next_power_two()
- */
-__host__ __device__ __forceinline__ uint32_t
-round_to_next_power_two(const uint32_t numToRound)
-{
-
-    // https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
-    uint32_t res = numToRound;
-    if (res == 0) {
-        return 1;
-    }
-    res--;
-    res |= res >> 1;
-    res |= res >> 2;
-    res |= res >> 4;
-    res |= res >> 8;
-    res |= res >> 16;
-    res++;
-    return res;
-}
-
-/**
- * dot()
- * TODO remove
- */
-template <typename T>
-T dot(const std::vector<T>& u, const std::vector<T>& v)
-{
-    return std::inner_product(std::begin(u), std::end(u), std::begin(v), 0.0);
-}
-
-/**
- * scale()
- * TODO remove
- */
-template <typename T>
-void scale(std::vector<T>& v, const T factor)
-{
-    std::transform(
-        v.begin(), v.end(), v.begin(),
-        std::bind(std::multiplies<T>(), std::placeholders::_1, factor));
-}
-
-/**
- * axpy()
- */
-template <typename T>
-void axpy(const std::vector<T>& x,
-          const T               alpha,
-          const T               beta,
-          std::vector<T>&       y)
-{
-    // y = alpha*x + beta*y
-    for (uint32_t i = 0; i < x.size(); ++i) {
-        y[i] *= beta;
-        y[i] += alpha * x[i];
-    }
-}
-
-}  // namespace RXMESH
\ No newline at end of file
diff --git a/include/rxmesh/util/meta.h b/include/rxmesh/util/meta.h
new file mode 100644
index 00000000..c42bf54a
--- /dev/null
+++ b/include/rxmesh/util/meta.h
@@ -0,0 +1,51 @@
+#pragma once
+#include <tuple>
+namespace rxmesh {
+namespace detail {
+
+/**
+ * @brief Extract the input parameter types and return type of a lambda
+ * function. Taken from https://stackoverflow.com/a/7943765/1608232.
+ * For generic lambdas, use the signature of its operator() directly
+ */
+template <typename T>
+struct FunctionTraits : public FunctionTraits<decltype(&T::operator())>
+{
+};
+
+/**
+ * @brief specialization for pointers to member function
+ */
+template <typename ClassType, typename ReturnType, typename... Args>
+struct FunctionTraits<ReturnType (ClassType::*)(Args...) const>
+{
+    /**
+     * @brief arity is the number of arguments.
+     */
+    enum
+    {
+        arity = sizeof...(Args)
+    };
+
+    typedef ReturnType result_type;
+
+    /**
+     * @brief the i-th argument is equivalent to the i-th tuple element of a
+     * tuple composed of those arguments.
+     */
+    template <size_t i>
+    struct arg
+    {
+        using type_rc =
+            typename std::tuple_element<i, std::tuple<Args...>>::type;
+        using type_c = std::conditional_t<std::is_reference_v<type_rc>,
+                                          std::remove_reference_t<type_rc>,
+                                          type_rc>;
+        using type   = std::conditional_t<std::is_const_v<type_c>,
+                                        std::remove_const_t<type_c>,
+                                        type_c>;
+    };
+};
+
+}  // namespace detail
+}  // namespace rxmesh
\ No newline at end of file
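A compile-time sketch of what FunctionTraits recovers (illustrative, not part of the patch); note how arg<0>::type strips the const reference qualifier from the lambda parameter:

#include <type_traits>
#include "rxmesh/util/meta.h"

void function_traits_example()
{
    auto lambda = [](const int& x) { return x * 2; };

    using Traits = rxmesh::detail::FunctionTraits<decltype(lambda)>;

    static_assert(Traits::arity == 1, "lambda takes one argument");
    static_assert(std::is_same_v<Traits::result_type, int>, "returns int");
    // const int& -> int (reference and const are removed by arg<i>::type)
    static_assert(std::is_same_v<Traits::arg<0>::type, int>, "plain int");
}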
diff --git a/include/rxmesh/util/report.h b/include/rxmesh/util/report.h
index 4363994f..a72abc8f 100644
--- a/include/rxmesh/util/report.h
+++ b/include/rxmesh/util/report.h
@@ -14,8 +14,8 @@
 #include "rxmesh/util/util.h"
 #ifdef __NVCC__
 #include "cuda.h"
-#include "rxmesh/util/cuda_query.h"
 #include "rxmesh/kernels/get_arch.cuh"
+#include "rxmesh/util/cuda_query.h"
 #endif
 #include "rxmesh/util/git_sha1.h"
 
@@ -26,7 +26,7 @@
 #include <unistd.h>
 #endif
 
-namespace RXMESH {
+namespace rxmesh {
 
 // Most values are signed and initialized to -1
 // if any value is not modified, it won't be written
@@ -36,12 +36,13 @@ namespace RXMESH {
 struct TestData
 {
     std::vector<float> time_ms;
-    int32_t            num_blocks = -1;
+    int32_t            num_blocks  = -1;
     int32_t            num_threads = -1;
     std::vector<bool>  passed;
-    std::string        test_name = "";
-    float              dyn_smem = -1;
-    float              static_smem = -1;
+    std::string        test_name   = "";
+    int32_t            dyn_smem    = -1;
+    int32_t            static_smem = -1;
+    int32_t            num_reg     = -1;
 };
 
 struct Report
@@ -60,12 +61,12 @@ struct Report
                         m_doc.GetAllocator());
         std::string str = g_GIT_SHA1;
         m_doc.AddMember("git_sha",
-                        rapidjson::Value().SetString(str.c_str(), str.length(),
-                                                     m_doc.GetAllocator()),
+                        rapidjson::Value().SetString(
+                            str.c_str(), str.length(), m_doc.GetAllocator()),
                         m_doc.GetAllocator());
 
         // Time
-        auto t = std::time(nullptr);
+        auto t  = std::time(nullptr);
         auto tm = *std::localtime(&t);
         {
             std::ostringstream oss;
@@ -80,8 +81,8 @@ struct Report
 
             m_doc.AddMember(
                 "date",
-                rapidjson::Value().SetString(str.c_str(), str.length(),
-                                             m_doc.GetAllocator()),
+                rapidjson::Value().SetString(
+                    str.c_str(), str.length(), m_doc.GetAllocator()),
                 m_doc.GetAllocator());
         }
     }
@@ -94,8 +95,8 @@ struct Report
             cmd = cmd + " " + std::string(argv[i]);
         }
         m_doc.AddMember("command_line",
-                        rapidjson::Value().SetString(cmd.c_str(), cmd.length(),
-                                                     m_doc.GetAllocator()),
+                        rapidjson::Value().SetString(
+                            cmd.c_str(), cmd.length(), m_doc.GetAllocator()),
                         m_doc.GetAllocator());
     }
 
@@ -145,22 +146,25 @@ struct Report
 
         // Memory
         add_member("Total amount of global memory (MB)",
-                   (float)devProp.totalGlobalMem / 1048576.0f, subdoc);
+                   (float)devProp.totalGlobalMem / 1048576.0f,
+                   subdoc);
         add_member("Total amount of shared memory per block (Kb)",
-                   (float)devProp.sharedMemPerBlock / 1024.0f, subdoc);
+                   (float)devProp.sharedMemPerBlock / 1024.0f,
+                   subdoc);
 
         // SM
         add_member("Multiprocessors", devProp.multiProcessorCount, subdoc);
 #ifdef __NVCC__
         add_member("CUDA Cores/MP",
-                   convert_SMV_to_cores(devProp.major, devProp.minor), subdoc);
+                   convert_SMV_to_cores(devProp.major, devProp.minor),
+                   subdoc);
 #endif
 
         // Clocks
-        add_member("GPU Max Clock rate (GHz)", devProp.clockRate * 1e-6f,
-                   subdoc);
-        add_member("Memory Clock rate (GHz)", devProp.memoryClockRate * 1e-6f,
-                   subdoc);
+        add_member(
+            "GPU Max Clock rate (GHz)", devProp.clockRate * 1e-6f, subdoc);
+        add_member(
+            "Memory Clock rate (GHz)", devProp.memoryClockRate * 1e-6f, subdoc);
         add_member("Memory Bus Width (bit)", devProp.memoryBusWidth, subdoc);
         add_member("Peak Memory Bandwidth (GB/s)",
                    2.0 * devProp.memoryClockRate *
@@ -194,8 +198,8 @@ struct Report
 
 
 #ifdef _MSC_VER
-        add_member("Microsoft Full Compiler Version", int32_t(_MSC_FULL_VER),
-                   subdoc);
+        add_member(
+            "Microsoft Full Compiler Version", int32_t(_MSC_FULL_VER), subdoc);
         add_member("Microsoft Compiler Version", int32_t(_MSC_VER), subdoc);
 #else
 
@@ -265,9 +269,7 @@ struct Report
     }
 
     // get model data from RXMesh
-    template <uint32_t p>
-    void model_data(const std::string&       model_name,
-                    const RXMESH::RXMesh<p>& rxmesh)
+    void model_data(const std::string& model_name, const rxmesh::RXMesh& rxmesh)
     {
         rapidjson::Document subdoc(&m_doc.GetAllocator());
         subdoc.SetObject();
@@ -285,21 +287,19 @@ struct Report
         add_member("num_lloyd_run", rxmesh.get_num_lloyd_run(), subdoc);
         add_member("patching_time", rxmesh.get_patching_time(), subdoc);
         uint32_t min_patch_size(0), max_patch_size(0), avg_patch_size(0);
-        rxmesh.get_max_min_avg_patch_size(min_patch_size, max_patch_size,
-                                           avg_patch_size);
+        rxmesh.get_max_min_avg_patch_size(
+            min_patch_size, max_patch_size, avg_patch_size);
         add_member("min_patch_size", min_patch_size, subdoc);
         add_member("max_patch_size", max_patch_size, subdoc);
         add_member("avg_patch_size", avg_patch_size, subdoc);
         add_member("per_patch_max_vertices",
-                   rxmesh.get_per_patch_max_vertices(), subdoc);
-        add_member("per_patch_max_edges", rxmesh.get_per_patch_max_edges(),
-                   subdoc);
-        add_member("per_patch_max_faces", rxmesh.get_per_patch_max_faces(),
-                   subdoc);
-        add_member("ribbon_overhead (%)", rxmesh.get_ribbon_overhead(),
-                   subdoc);
-        add_member("total_gpu_storage (mb)", rxmesh.get_gpu_storage_mb(),
+                   rxmesh.get_per_patch_max_vertices(),
                    subdoc);
+        add_member(
+            "per_patch_max_edges", rxmesh.get_per_patch_max_edges(), subdoc);
+        add_member(
+            "per_patch_max_faces", rxmesh.get_per_patch_max_faces(), subdoc);
+        add_member("ribbon_overhead (%)", rxmesh.get_ribbon_overhead(), subdoc);
         m_doc.AddMember("Model", subdoc, m_doc.GetAllocator());
     }
 
@@ -322,8 +322,12 @@ struct Report
         }
 
         if (test_data.static_smem != -1) {
-            add_member("static_shared_memory (b)", test_data.static_smem,
-                       subdoc);
+            add_member(
+                "static_shared_memory (b)", test_data.static_smem, subdoc);
+        }
+
+        if (test_data.num_reg != -1) {
+            add_member("num_register_per_thread", test_data.num_reg, subdoc);
         }
 
         if (!test_data.passed.empty()) {
@@ -359,8 +363,8 @@ struct Report
     void add_member(std::string member_key, const int32_t member_val, docT& doc)
     {
         rapidjson::Value key(member_key.c_str(), doc.GetAllocator());
-        doc.AddMember(key, rapidjson::Value().SetInt(member_val),
-                      doc.GetAllocator());
+        doc.AddMember(
+            key, rapidjson::Value().SetInt(member_val), doc.GetAllocator());
     }
     template <typename docT>
     void add_member(std::string    member_key,
@@ -368,24 +372,24 @@ struct Report
                     docT&          doc)
     {
         rapidjson::Value key(member_key.c_str(), doc.GetAllocator());
-        doc.AddMember(key, rapidjson::Value().SetUint(member_val),
-                      doc.GetAllocator());
+        doc.AddMember(
+            key, rapidjson::Value().SetUint(member_val), doc.GetAllocator());
     }
 
     template <typename docT>
     void add_member(std::string member_key, const double member_val, docT& doc)
     {
         rapidjson::Value key(member_key.c_str(), doc.GetAllocator());
-        doc.AddMember(key, rapidjson::Value().SetDouble(member_val),
-                      doc.GetAllocator());
+        doc.AddMember(
+            key, rapidjson::Value().SetDouble(member_val), doc.GetAllocator());
     }
 
     template <typename docT>
     void add_member(std::string member_key, const bool member_val, docT& doc)
     {
         rapidjson::Value key(member_key.c_str(), doc.GetAllocator());
-        doc.AddMember(key, rapidjson::Value().SetBool(member_val),
-                      doc.GetAllocator());
+        doc.AddMember(
+            key, rapidjson::Value().SetBool(member_val), doc.GetAllocator());
     }
 
     template <typename docT>
@@ -442,4 +446,4 @@ class CustomReport : public Report
         this->m_doc.AddMember("Model", subdoc, m_doc.GetAllocator());
     }
 };
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace rxmesh
\ No newline at end of file
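A small sketch of filling the extended TestData (hypothetical values; handing it to the report via add_test() is an assumption about the unchanged parts of Report outside this hunk):

#include "rxmesh/util/report.h"

void record_launch_stats(rxmesh::Report& report)
{
    rxmesh::TestData td;
    td.test_name   = "VV_query";  // hypothetical test name
    td.num_blocks  = 256;
    td.num_threads = 256;
    td.dyn_smem    = 0;     // shared memory is now recorded as int32_t bytes
    td.static_smem = 4096;  // hypothetical value
    td.num_reg     = 64;    // new field: registers per thread
    td.time_ms.push_back(1.25f);
    td.passed.push_back(true);

    report.add_test(td);  // assumption: Report exposes add_test(TestData)
}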
diff --git a/include/rxmesh/util/timer.h b/include/rxmesh/util/timer.h
index 42126c34..c5c06adb 100644
--- a/include/rxmesh/util/timer.h
+++ b/include/rxmesh/util/timer.h
@@ -3,7 +3,7 @@
 #include <chrono>
 #include "rxmesh/util/macros.h"
 
-namespace RXMESH {
+namespace rxmesh {
 
 struct GPUTimer
 {
@@ -65,4 +65,4 @@ struct CPUTimer
     std::chrono::high_resolution_clock::time_point m_start;
     std::chrono::high_resolution_clock::time_point m_stop;
 };
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace rxmesh
\ No newline at end of file
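The timers are unchanged apart from the namespace; a usage sketch assuming the usual start()/stop()/elapsed_millis() interface (those member names are not shown in this hunk and are an assumption):

#include "rxmesh/util/log.h"
#include "rxmesh/util/timer.h"

void time_host_work()
{
    rxmesh::CPUTimer timer;

    timer.start();
    // ... host-side work to be timed ...
    timer.stop();

    RXMESH_TRACE("host work took {} (ms)", timer.elapsed_millis());
}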
diff --git a/include/rxmesh/util/util.h b/include/rxmesh/util/util.h
index 014b62c6..bdbd73b7 100644
--- a/include/rxmesh/util/util.h
+++ b/include/rxmesh/util/util.h
@@ -5,10 +5,10 @@
 #include <random>
 #include "rxmesh/util/macros.h"
 
-namespace RXMESH {
+namespace rxmesh {
 
 /**
- * get_cmd_option()
+ * @brief Parse a command line option. Mainly used to parse user input from CMD
  */
 inline char* get_cmd_option(char** begin, char** end, const std::string& option)
 {
@@ -19,8 +19,10 @@ inline char* get_cmd_option(char** begin, char** end, const std::string& option)
     }
     return 0;
 }
+
 /**
- * cmd_option_exists()
+ * @brief Check if an input string exists. Mainly used to check if an input
+ * option exists in CMD
  */
 inline bool cmd_option_exists(char**             begin,
                               char**             end,
@@ -31,32 +33,38 @@ inline bool cmd_option_exists(char**             begin,
 }
 
 /**
- * print_device_memory_usage()
+ * @brief Print current GPU memory usage
  */
 inline void print_device_memory_usage()
 {
     // print how much memory is available, used and free on the current device
     size_t free_t, total_t;
     CUDA_ERROR(cudaMemGetInfo(&free_t, &total_t));
-    double free_m = (double)free_t / (double)1048576.0;
+    double free_m  = (double)free_t / (double)1048576.0;
     double total_m = (double)total_t / (double)1048576.0;
-    double used_m = total_m - free_m;
-    RXMESH_TRACE(" device memory mem total = {} (B) [{} (MB)]", total_t,
-                 total_m);
+    double used_m  = total_m - free_m;
+    RXMESH_TRACE(
+        " device memory mem total = {} (B) [{} (MB)]", total_t, total_m);
     RXMESH_TRACE(" device memory free: {} (B) [{} (MB)]", free_t, free_m);
     RXMESH_TRACE(" device memory mem used: {} (MB)", used_m);
 }
 
+
 /**
- * find_index()
+ * @brief Find the index of an entry in a vector
+ * @tparam T type of the entry and vector elements
+ * @param entry the entry to search for
+ * @param vect input vector to search in
+ * @return the index of the entry or std::numeric_limits<uint32_t>::max()
+ * if it is not found
  */
 template <typename T>
-inline uint32_t find_index(const T entery, const std::vector<T>& vect)
+inline uint32_t find_index(const T entry, const std::vector<T>& vect)
 {
     // get index of entry in vector
 
     typename std::vector<T>::const_iterator it =
-        std::find(vect.begin(), vect.end(), entery);
+        std::find(vect.begin(), vect.end(), entry);
     if (it == vect.end()) {
         return std::numeric_limits<uint32_t>::max();
     }
@@ -64,15 +72,21 @@ inline uint32_t find_index(const T entery, const std::vector<T>& vect)
 }
 
 /**
- * find_index()
+ * @brief Find the index of an entry in an array given its size
+ * @tparam T type of the entry and array elements
+ * @param arr input array to search in
+ * @param arr_size size of the input array (arr)
+ * @param entry the entry to search for
+ * @return the index of the entry or std::numeric_limits<T>::max()
+ * if it is not found
  */
 template <typename T>
-inline T find_index(const T* arr, const T arr_size, const T val)
+inline T find_index(const T* arr, const T arr_size, const T entry)
 {
     // get index of entry in array
     const T* begin = arr;
-    const T* end = arr + arr_size;
-    const T* it = std::find(begin, end, val);
+    const T* end   = arr + arr_size;
+    const T* it    = std::find(begin, end, entry);
     if (it == end) {
         return std::numeric_limits<T>::max();
     }
@@ -80,7 +94,7 @@ inline T find_index(const T* arr, const T arr_size, const T val)
 }
 
 /**
- * random_shuffle()
+ * @brief Shuffle the content of an input array randomly
  */
 template <typename T>
 inline void random_shuffle(T*             d_in,
@@ -93,7 +107,7 @@ inline void random_shuffle(T*             d_in,
 }
 
 /**
- * fill_with_sequential_numbers()
+ * @brief Fill in an array with sequential numbers
  */
 template <typename T>
 inline void fill_with_sequential_numbers(T*             arr,
@@ -103,23 +117,26 @@ inline void fill_with_sequential_numbers(T*             arr,
     std::iota(arr, arr + size, start);
 }
 
+
 /**
- * compare()
+ * @brief Compare the content of two input arrays
  */
 template <typename T, typename dataT>
 bool compare(const dataT* gold,
              const dataT* arr,
              const T      size,
              const bool   verbose = false,
-             const dataT  tol = 10E-5)
+             const dataT  tol     = 10E-5)
 {
 
     bool result = true;
     for (T i = 0; i < size; i++) {
         if (std::abs(double(gold[i]) - double(arr[i])) > tol) {
             if (verbose) {
-                RXMESH_WARN("compare() mismatch at {} gold = {} arr = {} ", i,
-                            gold[i], arr[i]);
+                RXMESH_WARN("compare() mismatch at {} gold = {} arr = {} ",
+                            i,
+                            gold[i],
+                            arr[i]);
                 result = false;
             } else {
                 // it is not verbose, don't bother running through all entires
@@ -131,7 +148,7 @@ bool compare(const dataT* gold,
 }
 
 /**
- * copy()
+ * @brief Copy the content of one vector to another
  */
 template <typename T>
 void copy(const std::vector<T>& src, std::vector<T>& tar, int tar_start = 0)
@@ -139,8 +156,9 @@ void copy(const std::vector<T>& src, std::vector<T>& tar, int tar_start = 0)
     std::copy(src.begin(), src.end(), tar.data() + tar_start);
 }
 
+
 /**
- * compute_avg_stddev()
+ * @brief Compute the average and standard deviation of an input array
  */
 template <typename T>
 inline void compute_avg_stddev(const T* arr,
@@ -149,7 +167,7 @@ inline void compute_avg_stddev(const T* arr,
                                double&  stddev)
 {
     if (size == 1) {
-        avg = arr[0];
+        avg    = arr[0];
         stddev = 0;
         return;
     }
@@ -170,9 +188,8 @@ inline void compute_avg_stddev(const T* arr,
     return;
 }
 /**
- * compute_avg_stddev_max_min_rs()
- * computes the average and stddev where the input is running sum (output of
- * exclusive sum) the input size is actually size + 1
+ * @brief Compute the average, stddev, max, and min where the input is a running
+ * sum (output of exclusive sum); the input size is actually size + 1
  */
 template <typename T>
 inline void compute_avg_stddev_max_min_rs(const T* arr_rs,
@@ -183,15 +200,15 @@ inline void compute_avg_stddev_max_min_rs(const T* arr_rs,
                                           T&       min)
 {
     uint32_t* arr = (uint32_t*)malloc(size * sizeof(uint32_t));
-    max = std::numeric_limits<T>::min();
-    min = std::numeric_limits<T>::max();
+    max           = std::numeric_limits<T>::min();
+    min           = std::numeric_limits<T>::max();
     for (uint32_t i = 0; i < size; i++) {
         // arr[i] = arr_rs[i + 1] - arr_rs[i];
         uint32_t start = (i == 0) ? 0 : arr_rs[i - 1];
-        uint32_t end = arr_rs[i];
-        arr[i] = end - start;
-        max = std::max(max, arr[i]);
-        min = std::min(min, arr[i]);
+        uint32_t end   = arr_rs[i];
+        arr[i]         = end - start;
+        max            = std::max(max, arr[i]);
+        min            = std::min(min, arr[i]);
     }
 
     compute_avg_stddev(arr, size, avg, stddev);
@@ -200,7 +217,7 @@ inline void compute_avg_stddev_max_min_rs(const T* arr_rs,
 }
 
 /**
- * binary_search()
+ * @brief binary search in a vector (has to be sorted --- not checked)
  */
 template <typename T>
 inline size_t binary_search(const std::vector<T>& list,
@@ -235,8 +252,7 @@ inline size_t binary_search(const std::vector<T>& list,
 
 
 /**
- * inplace_remove_duplicates_sorted()
- * in-place remove duplicates from sorted vector
+ * @brief in-place remove duplicates from sorted vector
  * requires one pass over all elements in sort_vec
  * it also resize sort_vec to contain only the unique values
  */
@@ -249,12 +265,12 @@ inline void inplace_remove_duplicates_sorted(std::vector<T>& sort_vec)
 
     // leave the first value
     uint32_t next_unique_id = 1;
-    T        prev_value = sort_vec.front();
+    T        prev_value     = sort_vec.front();
     for (uint32_t i = 1; i < sort_vec.size(); ++i) {
         T curr_val = sort_vec[i];
         if (curr_val != prev_value) {
             sort_vec[next_unique_id++] = curr_val;
-            prev_value = curr_val;
+            prev_value                 = curr_val;
         }
     }
 
@@ -262,7 +278,8 @@ inline void inplace_remove_duplicates_sorted(std::vector<T>& sort_vec)
 }
 
 /**
- * shuffle_obj()
+ * @brief Given the vertex coordinates and face indices, shuffle the input mesh
+ * randomly --- both vertices and face indices
  */
 template <typename T>
 inline void shuffle_obj(std::vector<std::vector<uint32_t>>& Faces,
@@ -306,7 +323,7 @@ inline void shuffle_obj(std::vector<std::vector<uint32_t>>& Faces,
 
 
 /**
- * remove_extension()
+ * @brief Remove the extension of an input file path
  */
 inline std::string remove_extension(const std::string& filename)
 {  // https://stackoverflow.com/a/6417908/1608232
@@ -317,44 +334,42 @@ inline std::string remove_extension(const std::string& filename)
 }
 
 /**
- * extract_file_name()
+ * @brief Extract the file name (without extension) given its full path
  */
 inline std::string extract_file_name(const std::string& full_path)
 {
     // given full path, we extract the file name without extension
-    std::string filename = remove_extension(full_path);
+    std::string filename  = remove_extension(full_path);
     size_t      lastslash = filename.find_last_of("/\\");
 
     return filename.substr(lastslash + 1);
 }
 
+namespace detail {
+
 /**
- * in_place_matrix_transpose()
+ * @brief hash function that takes a pair of vertices and returns a unique
+ * value. Used for storing the vertex-edge relation in an std unordered map
  */
-template <class RandomIterator>
-void in_place_matrix_transpose(RandomIterator first,
-                               RandomIterator last,
-                               uint64_t       m)
+struct edge_key_hash
 {
-    // in-place matrix transpose represented as row-major format with m
-    // number for columns
-    // https://stackoverflow.com/a/9320349/1608232
-    const uint64_t mn1 = (last - first - 1);
-    const uint64_t n = (last - first) / m;
-
-    std::vector<bool> visited(last - first, false);
-
-    RandomIterator cycle = first;
-    while (++cycle != last) {
-        if (visited[cycle - first]) {
-            continue;
-        }
-        uint64_t a = cycle - first;
-        do {
-            a = (a == mn1) ? mn1 : (n * a) % mn1;
-            std::swap(*(first + a), *cycle);
-            visited[a] = true;
-        } while ((first + a) != cycle);
+    // www.techiedelight.com/use-std-pair-key-std-unordered_map-cpp/
+    template <class T>
+    inline std::size_t operator()(const std::pair<T, T>& e_key) const
+    {
+        return std::hash<T>()(e_key.first * 8191 + e_key.second * 11003);
     }
+};
+
+/**
+ * @brief return a consistent edge key given its two end vertices
+ */
+inline std::pair<uint32_t, uint32_t> edge_key(const uint32_t v0,
+                                              const uint32_t v1)
+{
+    uint32_t i = std::max(v0, v1);
+    uint32_t j = std::min(v0, v1);
+    return std::make_pair(i, j);
 }
-}  // namespace RXMESH
\ No newline at end of file
+}  // namespace detail
+}  // namespace rxmesh
\ No newline at end of file
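The new detail::edge_key()/edge_key_hash pair replaces the removed in_place_matrix_transpose(). A sketch of how they combine when mapping undirected edges to indices (the map itself is illustrative, not part of the patch):

#include <cstdint>
#include <unordered_map>
#include <utility>
#include "rxmesh/util/util.h"

using EdgeKey = std::pair<uint32_t, uint32_t>;

void build_edge_map()
{
    std::unordered_map<EdgeKey, uint32_t, rxmesh::detail::edge_key_hash> edge_id;

    // edge_key() orders the pair as (max, min), so (3, 7) and (7, 3)
    // resolve to the same key and the same map entry
    edge_id[rxmesh::detail::edge_key(3, 7)] = 0;
    edge_id[rxmesh::detail::edge_key(7, 3)] = 0;
}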
diff --git a/include/rxmesh/util/vector.h b/include/rxmesh/util/vector.h
index 4b2363d3..ab1e76d4 100644
--- a/include/rxmesh/util/vector.h
+++ b/include/rxmesh/util/vector.h
@@ -2,7 +2,7 @@
 #include <assert.h>
 #include <stdint.h>
 
-namespace RXMESH {
+namespace rxmesh {
 
 template <uint32_t N, typename T>
 struct Vector
@@ -290,11 +290,11 @@ struct Vector
     __host__ __device__ __forceinline__ void normalize()
     {
         T r = norm();
-        if(r == T(0.0)){
+        if (r == T(0.0)) {
             for (uint32_t i = 0; i < N; ++i) {
                 m_v[i] = 0;
             }
-        }else{
+        } else {
             r = 1. / r;
             (*this) *= r;
         }
@@ -440,50 +440,50 @@ inline std::istream& operator>>(std::istream& input, const Vector<N, T>& v)
 }
 
 // Alias
-using Vector2d = Vector<2, double>;
-using Vector2f = Vector<2, float>;
-using Vector2i = Vector<2, int32_t>;
+using Vector2d  = Vector<2, double>;
+using Vector2f  = Vector<2, float>;
+using Vector2i  = Vector<2, int32_t>;
 using Vector2ui = Vector<2, uint32_t>;
-using Vector2s = Vector<2, int16_t>;
+using Vector2s  = Vector<2, int16_t>;
 using Vector2us = Vector<2, uint16_t>;
-using Vector2c = Vector<2, int8_t>;
+using Vector2c  = Vector<2, int8_t>;
 using Vector2uc = Vector<2, uint8_t>;
 
-using Vector3d = Vector<3, double>;
-using Vector3f = Vector<3, float>;
-using Vector3i = Vector<3, int32_t>;
+using Vector3d  = Vector<3, double>;
+using Vector3f  = Vector<3, float>;
+using Vector3i  = Vector<3, int32_t>;
 using Vector3ui = Vector<3, uint32_t>;
-using Vector3s = Vector<3, int16_t>;
+using Vector3s  = Vector<3, int16_t>;
 using Vector3us = Vector<3, uint16_t>;
-using Vector3c = Vector<3, int8_t>;
+using Vector3c  = Vector<3, int8_t>;
 using Vector3uc = Vector<3, uint8_t>;
 
-using Vector4d = Vector<4, double>;
-using Vector4f = Vector<4, float>;
-using Vector4i = Vector<4, int32_t>;
+using Vector4d  = Vector<4, double>;
+using Vector4f  = Vector<4, float>;
+using Vector4i  = Vector<4, int32_t>;
 using Vector4ui = Vector<4, uint32_t>;
-using Vector4s = Vector<4, int16_t>;
+using Vector4s  = Vector<4, int16_t>;
 using Vector4us = Vector<4, uint16_t>;
-using Vector4c = Vector<4, int8_t>;
+using Vector4c  = Vector<4, int8_t>;
 using Vector4uc = Vector<4, uint8_t>;
 
-using Vector6d = Vector<6, double>;
-using Vector6f = Vector<6, float>;
-using Vector6i = Vector<6, int32_t>;
+using Vector6d  = Vector<6, double>;
+using Vector6f  = Vector<6, float>;
+using Vector6i  = Vector<6, int32_t>;
 using Vector6ui = Vector<6, uint32_t>;
-using Vector6s = Vector<6, int16_t>;
+using Vector6s  = Vector<6, int16_t>;
 using Vector6us = Vector<6, uint16_t>;
-using Vector6c = Vector<6, int8_t>;
+using Vector6c  = Vector<6, int8_t>;
 using Vector6uc = Vector<6, uint8_t>;
-}  // namespace RXMESH
+}  // namespace rxmesh
 
 // Hash
 namespace std {
 
 template <uint32_t N, typename T>
-struct hash<RXMESH::Vector<N, T>>
+struct hash<rxmesh::Vector<N, T>>
 {
-    std::size_t operator()(const RXMESH::Vector<N, T>& v) const
+    std::size_t operator()(const rxmesh::Vector<N, T>& v) const
     {
         std::size_t h = 0;
         for (int i = 0; i < N; i++) {
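The Vector aliases and the std::hash specialization only change namespace; a short sketch (the per-component constructor is an assumption, it is not shown in this hunk):

#include <cstddef>
#include <cstdio>
#include "rxmesh/util/vector.h"

void vector_example()
{
    // assumption: Vector provides a per-component constructor
    rxmesh::Vector3f v(1.0f, 2.0f, 3.0f);

    v.normalize();  // safe for zero-length vectors: components are set to 0

    // the std::hash specialization keeps working with the renamed namespace
    const std::size_t h = std::hash<rxmesh::Vector3f>()(v);
    std::printf("hash = %zu\n", h);
}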
diff --git a/tests/RXMesh_test/CMakeLists.txt b/tests/RXMesh_test/CMakeLists.txt
index f76c7b35..e5716cff 100644
--- a/tests/RXMesh_test/CMakeLists.txt
+++ b/tests/RXMesh_test/CMakeLists.txt
@@ -3,7 +3,7 @@ add_executable( RXMesh_test )
 set( SOURCE_LIST
     rxmesh_test_main.cu
 	rxmesh_test.h
-    test_attribute.cu
+    test_attribute.cuh
 	test_vector.cu
     test_util.cu        
 	test_iterator.cu
@@ -11,6 +11,7 @@ set( SOURCE_LIST
 	test_higher_queries.h
 	query.cuh	
 	higher_query.cuh
+	test_for_each.h
 )
 
 target_sources( RXMesh_test 
@@ -20,7 +21,7 @@ target_sources( RXMesh_test
 
 set_target_properties( RXMesh_test PROPERTIES FOLDER "tests")
 
-#set_property(TARGET RXMesh_test PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+set_property(TARGET RXMesh_test PROPERTY CUDA_SEPARABLE_COMPILATION ON)
 
 source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "RXMesh_test" FILES ${SOURCE_LIST})
 
diff --git a/tests/RXMesh_test/benchmark.sh b/tests/RXMesh_test/benchmark.sh
old mode 100644
new mode 100755
index ec11eba8..c83ae5f6
--- a/tests/RXMesh_test/benchmark.sh
+++ b/tests/RXMesh_test/benchmark.sh
@@ -1,5 +1,4 @@
 #!/bin/bash
-echo "This script re-generates RXMesh data in Figure 6 in the paper."
 echo "Please make sure to first compile the source code and then enter the input OBJ files directory."
 read -p "OBJ files directory (no trailing slash): " input_dir
 
@@ -16,13 +15,7 @@ device_id=0
 
 for file in $input_dir/*.obj; do 	 
     if [ -f "$file" ]; then
-		echo $exe --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id
-             $exe --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id
-		
-		echo $exe -s --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id
-             $exe -s --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id
-
-		echo $exe -p --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id
-             $exe -p --gtest_filter=RXMesh.Queries -input "$file" -num_run $num_run -device_id $device_id
+		echo $exe --gtest_filter=RXMeshStatic.Queries -input "$file" -num_run $num_run -device_id $device_id
+         $exe --gtest_filter=RXMeshStatic.Queries -input "$file" -num_run $num_run -device_id $device_id
     fi 
 done
\ No newline at end of file
diff --git a/tests/RXMesh_test/higher_query.cuh b/tests/RXMesh_test/higher_query.cuh
index fbf00d33..909543dc 100644
--- a/tests/RXMesh_test/higher_query.cuh
+++ b/tests/RXMesh_test/higher_query.cuh
@@ -3,107 +3,88 @@
 #include <assert.h>
 #include <stdint.h>
 
-#include "rxmesh/kernels/rxmesh_iterator.cuh"
-#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh"
-#include "rxmesh/rxmesh.h"
-#include "rxmesh/rxmesh_attribute.h"
-#include "rxmesh/rxmesh_context.h"
+#include "rxmesh/attribute.h"
+#include "rxmesh/context.h"
+#include "rxmesh/iterator.cuh"
+#include "rxmesh/kernels/query_dispatcher.cuh"
+
 
 /**
- * higher_query()
+ * @brief perform 2-ring VV query
  */
-template <RXMESH::Op op, uint32_t blockThreads>
-__launch_bounds__(blockThreads) __global__
-    static void higher_query(const RXMESH::RXMeshContext       context,
-                             RXMESH::RXMeshAttribute<uint32_t> d_src,
-                             RXMESH::RXMeshAttribute<uint32_t> output_container,
-                             const bool                        oriented = false)
+template <uint32_t blockThreads, rxmesh::Op op>
+__global__ static void higher_query(
+    const rxmesh::Context                         context,
+    rxmesh::VertexAttribute<rxmesh::VertexHandle> input,
+    rxmesh::VertexAttribute<rxmesh::VertexHandle> output)
 {
-    using namespace RXMESH;
-    uint32_t block_offset = 0;
-    if constexpr (op == Op::EV || op == Op::EF) {
-        block_offset = context.get_edge_distribution()[blockIdx.x];
-    } else if constexpr (op == Op::FV || op == Op::FE || op == Op::FF) {
-        block_offset = context.get_face_distribution()[blockIdx.x];
-    } else if constexpr (op == Op::VV || op == Op::VE || op == Op::VF) {
-        block_offset = context.get_vertex_distribution()[blockIdx.x];
-    }
+    using namespace rxmesh;
 
     // the mesh element that this thread is assigned to
-    uint32_t thread_element = INVALID32;
-
-    // the location where thread_element will store its output
-    uint32_t element_offset;
+    VertexHandle thread_vertex;
 
     // number of vertices in the first ring
     uint32_t num_vv_1st_ring(0), num_vv(0);
 
     // computation done on the first ring/level
     // this is similar to the lambda function for query_block_dispatcher()
-    auto first_level_lambda = [&](uint32_t id, RXMeshIterator& iter) {
-        assert(iter.size() < output_container.get_num_attribute_per_element());
+    auto first_ring_lambda = [&](VertexHandle            id,
+                                 Iterator<VertexHandle>& iter) {
+        assert(iter.size() < output.get_num_attributes());
 
         num_vv_1st_ring = iter.size();
-        num_vv = num_vv_1st_ring;
+        num_vv          = num_vv_1st_ring;
 
         // record the mesh element that this thread is assigned to
-        thread_element = id;
-        element_offset = block_offset + iter.local_id();
-
-        d_src(element_offset) = id;
+        thread_vertex        = id;
+        input(thread_vertex) = thread_vertex;
 
-        output_container(element_offset, 0) = iter.size();
         for (uint32_t i = 0; i < iter.size(); ++i) {
-            output_container(element_offset, i + 1) = iter[i];
+            output(thread_vertex, i) = iter[i];
         }
     };
 
+    query_block_dispatcher<op, blockThreads>(context, first_ring_lambda);
 
-    query_block_dispatcher<op, blockThreads>(context, first_level_lambda,
-                                             oriented);
-
-    uint32_t next_id = 1;
+    uint32_t next_id = 0;
     while (true) {
-        uint32_t next_vertex = INVALID32;
+        VertexHandle next_vertex;
 
-        if (thread_element != INVALID32 && next_id <= num_vv_1st_ring) {
-            next_vertex = output_container(element_offset, next_id);
+        if (thread_vertex.is_valid() && next_id < num_vv_1st_ring) {
+            next_vertex = output(thread_vertex, next_id);
         }
 
-        auto second_level_lambda = [&](uint32_t id, RXMeshIterator& iter) {
+        auto higher_rings_lambda = [&](const VertexHandle&   id,
+                                       const VertexIterator& iter) {
             assert(id == next_vertex);
 
             for (uint32_t i = 0; i < iter.size(); ++i) {
-                if (iter[i] != thread_element) {
+                if (iter[i] != thread_vertex) {
 
                     // make sure that we don't store duplicate outputs
                     bool duplicate = false;
-                    for (uint32_t j = 1; j <= num_vv; ++j) {
-                        if (iter[i] == output_container(element_offset, j)) {
+                    for (uint32_t j = 0; j < num_vv; ++j) {
+                        if (iter[i] == output(thread_vertex, j)) {
                             duplicate = true;
                             break;
                         }
                     }
                     if (!duplicate) {
+                        output(thread_vertex, num_vv) = iter[i];
                         num_vv++;
-                        output_container(element_offset, num_vv) = iter[i];
                     }
                 }
             }
         };
 
-        query_block_dispatcher<op, blockThreads>(context, next_vertex,
-                                                 second_level_lambda);
+        higher_query_block_dispatcher<op, blockThreads>(
+            context, next_vertex, higher_rings_lambda);
 
         bool is_done =
-            (next_id > num_vv_1st_ring) || (thread_element == INVALID32);
+            (next_id >= num_vv_1st_ring) || !thread_vertex.is_valid();
         if (__syncthreads_and(is_done)) {
             break;
         }
         next_id++;
     }
-
-    if (thread_element != INVALID32) {
-        output_container(element_offset, 0) = num_vv;
-    }
 }
\ No newline at end of file
diff --git a/tests/RXMesh_test/query.cuh b/tests/RXMesh_test/query.cuh
index 3b70b345..e4896c38 100644
--- a/tests/RXMesh_test/query.cuh
+++ b/tests/RXMesh_test/query.cuh
@@ -3,47 +3,34 @@
 #include <assert.h>
 #include <stdint.h>
 
-#include "rxmesh/kernels/rxmesh_iterator.cuh"
-#include "rxmesh/kernels/rxmesh_query_dispatcher.cuh"
-#include "rxmesh/rxmesh.h"
-#include "rxmesh/rxmesh_attribute.h"
-#include "rxmesh/rxmesh_context.h"
+#include "rxmesh/attribute.h"
+#include "rxmesh/context.h"
+#include "rxmesh/iterator.cuh"
+#include "rxmesh/kernels/query_dispatcher.cuh"
+
 
 /**
- * query()
+ * @brief perform a query of type op and store the output as well as the
+ * corresponding input
  */
-template <RXMESH::Op op, uint32_t blockThreads>
-__launch_bounds__(blockThreads) __global__
-    static void query(const RXMESH::RXMeshContext       context,
-                      RXMESH::RXMeshAttribute<uint32_t> d_src,
-                      RXMESH::RXMeshAttribute<uint32_t> output_container,
-                      const bool                        oriented = false)
+template <uint32_t   blockThreads,
+          rxmesh::Op op,
+          typename InputHandleT,
+          typename OutputHandleT,
+          typename InputAttributeT,
+          typename OutputAttributeT>
+__global__ static void query_kernel(const rxmesh::Context context,
+                                    InputAttributeT       input,
+                                    OutputAttributeT      output,
+                                    const bool            oriented = false)
 {
-    using namespace RXMESH;
-
-    static_assert(op != Op::EE, "Op::EE is not supported!");
-
-    assert(output_container.is_device_allocated());
-
-    uint32_t block_offset = 0;
-    if constexpr (op == Op::EV || op == Op::EF) {
-        block_offset = context.get_edge_distribution()[blockIdx.x];
-    } else if constexpr (op == Op::FV || op == Op::FE || op == Op::FF) {
-        block_offset = context.get_face_distribution()[blockIdx.x];
-    } else if constexpr (op == Op::VV || op == Op::VE || op == Op::VF) {
-        block_offset = context.get_vertex_distribution()[blockIdx.x];
-    }
-
-    auto store_lambda = [&](uint32_t id, RXMeshIterator& iter) {
-        assert(iter.size() < output_container.get_num_attribute_per_element());
-
-        uint32_t id_offset = block_offset + iter.local_id();
-        d_src(id_offset) = id;
+    using namespace rxmesh;
 
-        output_container(id_offset, 0) = iter.size();
+    auto store_lambda = [&](InputHandleT& id, Iterator<OutputHandleT>& iter) {
+        input(id) = id;
 
         for (uint32_t i = 0; i < iter.size(); ++i) {
-            output_container(id_offset, i + 1) = iter[i];
+            output(id, i) = iter[i];
         }
     };
 
diff --git a/tests/RXMesh_test/rxmesh_test.h b/tests/RXMesh_test/rxmesh_test.h
index 751db854..1fb28835 100644
--- a/tests/RXMesh_test/rxmesh_test.h
+++ b/tests/RXMesh_test/rxmesh_test.h
@@ -2,97 +2,54 @@
 
 #include <assert.h>
 #include <iomanip>
-#include "rxmesh/rxmesh_attribute.h"
-#include "rxmesh/rxmesh_context.h"
+#include <vector>
+#include "rxmesh/attribute.h"
+#include "rxmesh/context.h"
 #include "rxmesh/rxmesh_static.h"
 #include "rxmesh/util/util.h"
 
 
 class RXMeshTest
 {
+   private:
+    bool                               m_quite;
+    std::vector<std::vector<uint32_t>> m_h_FE;
+
    public:
     RXMeshTest(const RXMeshTest&) = delete;
-    RXMeshTest(bool quite = true) : m_quite(quite){};
-
-    /**
-     * run_query_verifier()
-     */
-    template <uint32_t patchSize>
-    bool run_query_verifier(
-        const RXMESH::RXMeshStatic<patchSize>&   rxmesh,
-        const RXMESH::Op                         op,
-        const RXMESH::RXMeshAttribute<uint32_t>& input_container,
-        const RXMESH::RXMeshAttribute<uint32_t>& output_container)
+    RXMeshTest(const rxmesh::RXMeshStatic&               rxmesh,
+               const std::vector<std::vector<uint32_t>>& fv,
+               bool                                      quite = true)
+        : m_quite(quite)
     {
+        assert(rxmesh.m_edges_map.size() != 0);
 
-        // run test on specific query operation on an instance of rxmesh. this
-        // does not account for patching so works only on big matrices data-
-        // structure
-
-        populate_FE(rxmesh);
-        switch (op) {
-            case RXMESH::Op::VV:
-                return test_VV(rxmesh, input_container, output_container);
-                break;
-
-            case RXMESH::Op::VE:
-                return test_VE(rxmesh, input_container, output_container);
-                break;
-
-            case RXMESH::Op::VF:
-                return test_VF(rxmesh, input_container, output_container);
-                break;
-
-            case RXMESH::Op::FV:
-                return test_FV(rxmesh, input_container, output_container);
-                break;
-
-            case RXMESH::Op::FE:
-                return test_FE(rxmesh, input_container, output_container);
-                break;
-
-            case RXMESH::Op::FF:
-                return test_FF(rxmesh, input_container, output_container);
-                break;
-
-            case RXMESH::Op::EV:
-                return test_EV(rxmesh, input_container, output_container);
-                break;
-            case RXMESH::Op::EF:
-                return test_EF(rxmesh, input_container, output_container);
-                break;
-
-            default:
-                RXMESH_ERROR("RXMeshTest::run_test() Op is not supported!!");
-                break;
-        }
-        return false;
-    }
+        for (uint32_t f = 0; f < rxmesh.m_num_faces; ++f) {
+            uint32_t i = f;
 
-    /**
-     * run_higher_query_verifier()
-     */
-    template <uint32_t patchSize>
-    bool run_higher_query_verifier(
-        const RXMESH::RXMeshStatic<patchSize>&   rxmesh,
-        const RXMESH::RXMeshAttribute<uint32_t>& input_container,
-        const RXMESH::RXMeshAttribute<uint32_t>& output_container)
-    {
-        populate_FE(rxmesh);
-        return test_VVV(rxmesh, input_container, output_container);
+            std::vector<uint32_t> fe(3);
+
+            for (uint32_t j = 0; j < 3; ++j) {
+
+                uint32_t v0 = fv[i][j];
+                uint32_t v1 = fv[i][(j + 1) % 3];
+
+                std::pair<uint32_t, uint32_t> my_edge =
+                    rxmesh::detail::edge_key(v0, v1);
+                uint32_t edge_id = rxmesh.get_edge_id(my_edge);
+                fe[j]            = edge_id;
+            }
+            m_h_FE.push_back(fe);
+        }
     }
 
 
-    /**
-     * run_ltog_mapping_test()
-     */
-    template <uint32_t patchSize>
-    bool run_ltog_mapping_test(const RXMESH::RXMesh<patchSize>& rxmesh)
+    bool run_ltog_mapping_test(const rxmesh::RXMesh&                     rxmesh,
+                               const std::vector<std::vector<uint32_t>>& fv)
     {
         // check if the mapping created for each patch is consistent
         // i.e., what you have in the local index space represents the global
         // space
-        populate_FE(rxmesh);
         for (uint32_t p = 0; p < rxmesh.m_num_patches; ++p) {
             bool edges_ok(true), faces_ok(true);
             check_mapping(rxmesh, p, edges_ok, faces_ok);
@@ -103,59 +60,21 @@ class RXMeshTest
         return true;
     }
 
-   private:
-    bool                               m_quite;
-    std::vector<std::vector<uint32_t>> m_h_FE;
-
-    template <uint32_t patchSize>
-    void populate_FE(const RXMESH::RXMesh<patchSize>& rxmesh)
-    {
-
-        // populate m_h_FE (in global space) with global edge numbers
-        // m_h_FE should be uninitialized
-        // should call this only if verification is needed.
-
-        if (m_h_FE.size() > 0) {
-            return;
-        }
-        m_h_FE.clear();
-
-        if (rxmesh.m_edges_map.size() == 0) {
-            RXMESH_ERROR(
-                "RXMeshTest::populate_FE() can not call me before"
-                " populating m_edges_map");
-        }
-
-        for (uint32_t f = 0; f < rxmesh.m_num_faces; ++f) {
-            uint32_t i = f;
-
-            std::vector<uint32_t> ff(3);
-
-            for (uint32_t j = 0; j < 3; ++j) {
 
-                uint32_t v0 = rxmesh.m_fvn[i][j];
-                uint32_t v1 =
-                    (j != 2) ? rxmesh.m_fvn[i][j + 1] : rxmesh.m_fvn[i][0];
-                std::pair<uint32_t, uint32_t> my_edge = rxmesh.edge_key(v0, v1);
-                uint32_t edge_id = rxmesh.get_edge_id(my_edge);
-                ff[j] = edge_id;
-            }
-            m_h_FE.push_back(ff);
-        }
-    }
-
-   private:
-    template <uint32_t patchSize>
-    bool test_VVV(const RXMESH::RXMeshStatic<patchSize>&   rxmesh,
-                  const RXMESH::RXMeshAttribute<uint32_t>& input_container,
-                  const RXMESH::RXMeshAttribute<uint32_t>& output_container)
+    /**
+     * @brief verify VV query. If is_higher_query is true, it verifies 2-ring
+     * queries
+     */
+    bool run_test(const rxmesh::RXMeshStatic&                          rxmesh,
+                  const std::vector<std::vector<uint32_t>>&            fv,
+                  const rxmesh::VertexAttribute<rxmesh::VertexHandle>& input,
+                  const rxmesh::VertexAttribute<rxmesh::VertexHandle>& output,
+                  const bool is_higher_query = false)
     {
-
-        // construct VV
-        std::vector<std::vector<uint32_t>> v_v(rxmesh.m_num_vertices,
+        std::vector<std::vector<uint32_t>> v_v(rxmesh.get_num_vertices(),
                                                std::vector<uint32_t>(0));
 
-        auto e_it = rxmesh.m_edges_map.begin();
+        auto e_it  = rxmesh.m_edges_map.begin();
         auto e_end = rxmesh.m_edges_map.end();
 
         for (; e_it != e_end; e_it++) {
@@ -165,184 +84,219 @@ class RXMeshTest
             v_v[vertices.second].push_back(vertices.first);
         }
 
-        // use VV to construct VVV
-        std::vector<std::vector<uint32_t>> v_v_v = v_v;
-        for (uint32_t v = 0; v < v_v_v.size(); ++v) {
+        if (is_higher_query) {
+            // use VV to construct VVV
+            std::vector<std::vector<uint32_t>> v_v_v = v_v;
+            for (uint32_t v = 0; v < v_v_v.size(); ++v) {
 
-            // loop over the v_v list of the vertex v
-            for (uint32_t i = 0; i < v_v[v].size(); ++i) {
+                // loop over the v_v list of the vertex v
+                for (uint32_t i = 0; i < v_v[v].size(); ++i) {
 
-                // this is a vertex in the 1-ring (v_v) of v
-                uint32_t n = v_v_v[v][i];
+                    // this is a vertex in the 1-ring (v_v) of v
+                    uint32_t n = v_v_v[v][i];
 
-                // loop over the v_v list (1-ring) of n
-                for (uint32_t j = 0; j < v_v[n].size(); ++j) {
+                    // loop over the v_v list (1-ring) of n
+                    for (uint32_t j = 0; j < v_v[n].size(); ++j) {
 
-                    // a candidate to be added to the 2-ring of v
-                    uint32_t candid = v_v[n][j];
+                        // a candidate to be added to the 2-ring of v
+                        uint32_t candid = v_v[n][j];
 
-                    // but we need to check first if it is not duplicate and
-                    // it is not v itself
-                    if (candid != v &&
-                        RXMESH::find_index(candid, v_v_v[v]) ==
-                            std::numeric_limits<uint32_t>::max()) {
+                        // but we need to check first if it is not duplicate and
+                        // it is not v itself
+                        if (candid != v &&
+                            rxmesh::find_index(candid, v_v_v[v]) ==
+                                std::numeric_limits<uint32_t>::max()) {
 
-                        v_v_v[v].push_back(candid);
+                            v_v_v[v].push_back(candid);
+                        }
                     }
                 }
             }
-        }
-
-
-        // two-way verification
-        return verifier(rxmesh.get_patcher()->get_vertex_patch().data(), v_v_v,
-                        input_container, output_container);
-    }
-
-    template <uint32_t patchSize>
-    bool test_VV(const RXMESH::RXMeshStatic<patchSize>&   rxmesh,
-                 const RXMESH::RXMeshAttribute<uint32_t>& input_container,
-                 const RXMESH::RXMeshAttribute<uint32_t>& output_container)
-    {
-
-        // construct VV
-
-        std::vector<std::vector<uint32_t>> v_v(rxmesh.m_num_vertices,
-                                               std::vector<uint32_t>(0));
-
-        auto e_it = rxmesh.m_edges_map.begin();
-        auto e_end = rxmesh.m_edges_map.end();
 
-        for (; e_it != e_end; e_it++) {
-            std::pair<uint32_t, uint32_t> vertices = e_it->first;
-
-            v_v[vertices.first].push_back(vertices.second);
-            v_v[vertices.second].push_back(vertices.first);
+            return verifier<rxmesh::VertexHandle, rxmesh::VertexHandle>(
+                v_v_v,
+                rxmesh,
+                rxmesh.m_h_num_owned_v,
+                rxmesh.m_h_patches_ltog_v,
+                rxmesh.m_h_patches_ltog_v,
+                input,
+                output);
         }
 
-        // two-way verification
-        return verifier(rxmesh.get_patcher()->get_vertex_patch().data(), v_v,
-                        input_container, output_container);
+        return verifier<rxmesh::VertexHandle, rxmesh::VertexHandle>(
+            v_v,
+            rxmesh,
+            rxmesh.m_h_num_owned_v,
+            rxmesh.m_h_patches_ltog_v,
+            rxmesh.m_h_patches_ltog_v,
+            input,
+            output);
     }
 
-    template <uint32_t patchSize>
-    bool test_VE(const RXMESH::RXMeshStatic<patchSize>&   rxmesh,
-                 const RXMESH::RXMeshAttribute<uint32_t>& input_container,
-                 const RXMESH::RXMeshAttribute<uint32_t>& output_container)
+    /**
+     * @brief verify VE query
+     */
+    bool run_test(const rxmesh::RXMeshStatic&                          rxmesh,
+                  const std::vector<std::vector<uint32_t>>&            fv,
+                  const rxmesh::VertexAttribute<rxmesh::VertexHandle>& input,
+                  const rxmesh::VertexAttribute<rxmesh::EdgeHandle>&   output)
     {
-
-        // construct VE
-
         std::vector<std::vector<uint32_t>> v_e(rxmesh.m_num_vertices,
                                                std::vector<uint32_t>(0));
 
-        auto e_it = rxmesh.m_edges_map.begin();
+        auto e_it  = rxmesh.m_edges_map.begin();
         auto e_end = rxmesh.m_edges_map.end();
 
         for (; e_it != e_end; e_it++) {
             std::pair<uint32_t, uint32_t> vertices = e_it->first;
-            uint32_t                      edge = e_it->second;
+            uint32_t                      edge     = e_it->second;
             v_e[vertices.first].push_back(edge);
             v_e[vertices.second].push_back(edge);
         }
 
-        // two-way verification
-        return verifier(rxmesh.get_patcher()->get_vertex_patch().data(), v_e,
-                        input_container, output_container);
+        return verifier<rxmesh::VertexHandle, rxmesh::EdgeHandle>(
+            v_e,
+            rxmesh,
+            rxmesh.m_h_num_owned_v,
+            rxmesh.m_h_patches_ltog_v,
+            rxmesh.m_h_patches_ltog_e,
+            input,
+            output);
     }
 
-    template <uint32_t patchSize>
-    bool test_VF(const RXMESH::RXMeshStatic<patchSize>&   rxmesh,
-                 const RXMESH::RXMeshAttribute<uint32_t>& input_container,
-                 const RXMESH::RXMeshAttribute<uint32_t>& output_container)
+    /**
+     * @brief verify VF query
+     */
+    bool run_test(const rxmesh::RXMeshStatic&                          rxmesh,
+                  const std::vector<std::vector<uint32_t>>&            fv,
+                  const rxmesh::VertexAttribute<rxmesh::VertexHandle>& input,
+                  const rxmesh::VertexAttribute<rxmesh::FaceHandle>&   output)
     {
 
-        // construct FV
-
+        if (rxmesh.m_num_faces != fv.size()) {
+            return false;
+        }
         std::vector<std::vector<uint32_t>> v_f(rxmesh.m_num_vertices,
                                                std::vector<uint32_t>(0));
 
-        // TODO this depends on m_fvn which does not record any changes
-        // but it is what the user has passed. Should compute v_f based on
-        // m_edge_map for consistency
-        uint32_t f_deg = rxmesh.m_face_degree;
-        for (uint32_t f = 0; f < rxmesh.m_num_faces; f++) {
-            for (uint32_t v = 0; v < f_deg; v++) {
-                uint32_t vert = rxmesh.m_fvn[f][v];
+        for (uint32_t f = 0; f < fv.size(); f++) {
+            for (uint32_t v = 0; v < 3; v++) {
+                uint32_t vert = fv[f][v];
                 v_f[vert].push_back(f);
             }
         }
 
-        // two-way verification
-        return verifier(rxmesh.get_patcher()->get_vertex_patch().data(), v_f,
-                        input_container, output_container);
+        return verifier<rxmesh::VertexHandle, rxmesh::FaceHandle>(
+            v_f,
+            rxmesh,
+            rxmesh.m_h_num_owned_v,
+            rxmesh.m_h_patches_ltog_v,
+            rxmesh.m_h_patches_ltog_f,
+            input,
+            output);
     }
 
-    template <uint32_t patchSize>
-    bool test_FV(const RXMESH::RXMeshStatic<patchSize>&   rxmesh,
-                 const RXMESH::RXMeshAttribute<uint32_t>& input_container,
-                 const RXMESH::RXMeshAttribute<uint32_t>& output_container)
+    /**
+     * @brief verify EV query
+     */
+    bool run_test(const rxmesh::RXMeshStatic&                        rxmesh,
+                  const std::vector<std::vector<uint32_t>>&          fv,
+                  const rxmesh::EdgeAttribute<rxmesh::EdgeHandle>&   input,
+                  const rxmesh::EdgeAttribute<rxmesh::VertexHandle>& output)
     {
 
-        // construct FV
-
-        uint32_t f_deg = rxmesh.m_face_degree;
-
-        std::vector<std::vector<uint32_t>> f_v(rxmesh.m_num_faces,
-                                               std::vector<uint32_t>(f_deg));
-
-        for (uint32_t f = 0; f < rxmesh.m_num_faces; f++) {
+        std::vector<std::vector<uint32_t>> e_v(rxmesh.m_num_edges,
+                                               std::vector<uint32_t>(2));
 
-            std::memcpy(f_v[f].data(), rxmesh.m_fvn[f].data(),
-                        f_deg * sizeof(uint32_t));
+        auto e_it = rxmesh.m_edges_map.begin();
+        while (e_it != rxmesh.m_edges_map.end()) {
+            e_v[e_it->second][0] = (e_it->first).first;
+            e_v[e_it->second][1] = (e_it->first).second;
+            e_it++;
         }
 
-        // two-way verification
-        return verifier(rxmesh.get_patcher()->get_face_patch().data(), f_v,
-                        input_container, output_container);
+        return verifier<rxmesh::EdgeHandle, rxmesh::VertexHandle>(
+            e_v,
+            rxmesh,
+            rxmesh.m_h_num_owned_e,
+            rxmesh.m_h_patches_ltog_e,
+            rxmesh.m_h_patches_ltog_v,
+            input,
+            output);
     }
 
-    template <uint32_t patchSize>
-    bool test_FE(const RXMESH::RXMeshStatic<patchSize>&   rxmesh,
-                 const RXMESH::RXMeshAttribute<uint32_t>& input_container,
-                 const RXMESH::RXMeshAttribute<uint32_t>& output_container)
+    /**
+     * @brief verify EF query
+     */
+    bool run_test(const rxmesh::RXMeshStatic&                      rxmesh,
+                  const std::vector<std::vector<uint32_t>>&        fv,
+                  const rxmesh::EdgeAttribute<rxmesh::EdgeHandle>& input,
+                  const rxmesh::EdgeAttribute<rxmesh::FaceHandle>& output)
     {
-
-        // construct FE
-
-        uint32_t f_deg = rxmesh.m_face_degree;
-
-        std::vector<std::vector<uint32_t>> f_e(rxmesh.m_num_faces,
+        std::vector<std::vector<uint32_t>> e_f(rxmesh.m_num_edges,
                                                std::vector<uint32_t>(0));
 
         for (uint32_t f = 0; f < rxmesh.m_num_faces; f++) {
-            f_e[f].reserve(f_deg);
-        }
-
-        for (uint32_t f = 0; f < rxmesh.m_num_faces; f++) {
-            uint32_t e0 = m_h_FE[f][0];
-            uint32_t e1 = m_h_FE[f][1];
-            uint32_t e2 = m_h_FE[f][2];
-
-            f_e[f].push_back(e0);
-            f_e[f].push_back(e1);
-            f_e[f].push_back(e2);
+            for (uint32_t e = 0; e < 3; e++) {
+                uint32_t edge = m_h_FE[f][e];
+                e_f[edge].push_back(f);
+            }
         }
 
-
-        // two-way verification
-        return verifier(rxmesh.get_patcher()->get_face_patch().data(), f_e,
-                        input_container, output_container);
+        return verifier<rxmesh::EdgeHandle, rxmesh::FaceHandle>(
+            e_f,
+            rxmesh,
+            rxmesh.m_h_num_owned_e,
+            rxmesh.m_h_patches_ltog_e,
+            rxmesh.m_h_patches_ltog_f,
+            input,
+            output);
     }
 
-    template <uint32_t patchSize>
-    bool test_FF(const RXMESH::RXMeshStatic<patchSize>&   rxmesh,
-                 const RXMESH::RXMeshAttribute<uint32_t>& input_container,
-                 const RXMESH::RXMeshAttribute<uint32_t>& output_container)
+    /**
+     * @brief verify FV query
+     */
+    bool run_test(const rxmesh::RXMeshStatic&                        rxmesh,
+                  const std::vector<std::vector<uint32_t>>&          fv,
+                  const rxmesh::FaceAttribute<rxmesh::FaceHandle>&   input,
+                  const rxmesh::FaceAttribute<rxmesh::VertexHandle>& output)
     {
+        return verifier<rxmesh::FaceHandle, rxmesh::VertexHandle>(
+            fv,
+            rxmesh,
+            rxmesh.m_h_num_owned_f,
+            rxmesh.m_h_patches_ltog_f,
+            rxmesh.m_h_patches_ltog_v,
+            input,
+            output);
+    }
 
-        // construct FF
+    /**
+     * @brief verify FE query
+     */
+    bool run_test(const rxmesh::RXMeshStatic&                      rxmesh,
+                  const std::vector<std::vector<uint32_t>>&        fv,
+                  const rxmesh::FaceAttribute<rxmesh::FaceHandle>& input,
+                  const rxmesh::FaceAttribute<rxmesh::EdgeHandle>& output)
+    {
+        return verifier<rxmesh::FaceHandle, rxmesh::EdgeHandle>(
+            m_h_FE,
+            rxmesh,
+            rxmesh.m_h_num_owned_f,
+            rxmesh.m_h_patches_ltog_f,
+            rxmesh.m_h_patches_ltog_e,
+            input,
+            output);
+    }
 
+    /**
+     * @brief verify FF query
+     */
+    bool run_test(const rxmesh::RXMeshStatic&                      rxmesh,
+                  const std::vector<std::vector<uint32_t>>&        fv,
+                  const rxmesh::FaceAttribute<rxmesh::FaceHandle>& input,
+                  const rxmesh::FaceAttribute<rxmesh::FaceHandle>& output)
+    {
         std::vector<std::vector<uint32_t>> f_f(rxmesh.m_num_faces,
                                                std::vector<uint32_t>(0));
 
@@ -376,137 +330,108 @@ class RXMeshTest
             }
         }
 
-        // two-way verification
-        return verifier(rxmesh.get_patcher()->get_face_patch().data(), f_f,
-                        input_container, output_container);
-    }
-
-    template <uint32_t patchSize>
-    bool test_EV(const RXMESH::RXMeshStatic<patchSize>&   rxmesh,
-                 const RXMESH::RXMeshAttribute<uint32_t>& input_container,
-                 const RXMESH::RXMeshAttribute<uint32_t>& output_container)
-    {
-
-        // construct EV
 
-        std::vector<std::vector<uint32_t>> e_v(rxmesh.m_num_edges,
-                                               std::vector<uint32_t>(2));
-
-        auto e_it = rxmesh.m_edges_map.begin();
-        while (e_it != rxmesh.m_edges_map.end()) {
-            e_v[e_it->second][0] = (e_it->first).first;
-            e_v[e_it->second][1] = (e_it->first).second;
-            e_it++;
-        }
-
-
-        // two-way verification
-        return verifier(rxmesh.get_patcher()->get_edge_patch().data(), e_v,
-                        input_container, output_container);
+        return verifier<rxmesh::FaceHandle, rxmesh::FaceHandle>(
+            f_f,
+            rxmesh,
+            rxmesh.m_h_num_owned_f,
+            rxmesh.m_h_patches_ltog_f,
+            rxmesh.m_h_patches_ltog_f,
+            input,
+            output);
     }
 
-    template <uint32_t patchSize>
-    bool test_EF(const RXMESH::RXMeshStatic<patchSize>&   rxmesh,
-                 const RXMESH::RXMeshAttribute<uint32_t>& input_container,
-                 const RXMESH::RXMeshAttribute<uint32_t>& output_container)
-    {
-
-        // construct EF
-
-        std::vector<std::vector<uint32_t>> e_f(rxmesh.m_num_edges,
-                                               std::vector<uint32_t>(0));
-
-        uint32_t f_deg = rxmesh.m_face_degree;
-        for (uint32_t f = 0; f < rxmesh.m_num_faces; f++) {
-            for (uint32_t e = 0; e < f_deg; e++) {
-                uint32_t edge = m_h_FE[f][e];
-                e_f[edge].push_back(f);
-            }
-        }
-
-        // two-way verification
-        return verifier(rxmesh.get_patcher()->get_edge_patch().data(), e_f,
-                        input_container, output_container);
-    }
 
-    bool verifier(const uint32_t*                           element_patch,
-                  const std::vector<std::vector<uint32_t>>& mesh_ele,
-                  const RXMESH::RXMeshAttribute<uint32_t>&  input_container,
-                  const RXMESH::RXMeshAttribute<uint32_t>&  output_container)
+   private:
+    template <typename InputHandleT,
+              typename OutputHandleT,
+              typename InputAttributeT,
+              typename OutputAttributeT>
+    bool verifier(const std::vector<std::vector<uint32_t>>& gt,
+                  const rxmesh::RXMeshStatic&               rxmesh,
+                  const std::vector<uint16_t>& num_owned_input_elements,
+                  const std::vector<std::vector<uint32_t>>& input_ltog,
+                  const std::vector<std::vector<uint32_t>>& output_ltog,
+                  const InputAttributeT&                    input,
+                  const OutputAttributeT&                   output)
     {
+        auto global_id_from_handle = [&](OutputHandleT xxh) -> uint32_t {
+            auto pl = xxh.unpack();
+            return output_ltog[pl.first][pl.second];
+        };
+
+        for (uint32_t p = 0; p < rxmesh.get_num_patches(); ++p) {
+            for (uint32_t e = 0; e < num_owned_input_elements[p]; ++e) {
+                InputHandleT eh(p, e);
+                if (input(eh) != eh) {
+                    return false;
+                }
 
-        bool results = true;
-
-        const uint32_t input_size = input_container.get_num_mesh_elements();
-
-        assert(input_size == output_container.get_num_mesh_elements());
+                uint32_t e_global = input_ltog[p][e];
 
-        for (uint32_t v = 0; v < input_size; v++) {
+                // Check correctness
+                // check if all output XX are correct
+                uint32_t num_xx = 0;
+                for (uint32_t i = 0; i < output.get_num_attributes(); ++i) {
+                    OutputHandleT xxh = output(eh, i);
+                    if (xxh.is_valid()) {
+                        num_xx++;
 
-            const uint32_t src_ele = input_container(v);
+                        // extract local id from xxh's unique id
+                        uint32_t xx_global = global_id_from_handle(xxh);
 
-            if (src_ele == INVALID32) {
-                // means it is isolated element so don't bother
-                continue;
-            }
-            // check for correctness (e.g, all edges in h_output are actually
-            // edges incident to the vertex v)
-            for (uint32_t i = 1; i <= output_container(v, 0); ++i) {
+                        uint32_t id =
+                            rxmesh::find_index(xx_global, gt[e_global]);
 
-                uint32_t id = RXMESH::find_index(output_container(v, i),
-                                                 mesh_ele[src_ele]);
-
-                if (id == std::numeric_limits<uint32_t>::max()) {
-                    if (!m_quite) {
-                        RXMESH_ERROR(
-                            "RXMeshTest::verifier() element {} is not incident "
-                            "to {}",
-                            output_container(v, i), src_ele);
+                        if (id == std::numeric_limits<uint32_t>::max()) {
+                            return false;
+                        }
                     }
-                    results = false;
                 }
-            }
 
-            // check for completeness (e.g, that all edges incident to the
-            // vertex v are actually returned in output_container)
-            for (uint32_t i = 0; i < mesh_ele[src_ele].size(); i++) {
-                uint32_t e = mesh_ele[src_ele][i];
-                bool     found = false;
-                for (uint32_t j = 1; j <= output_container(v, 0); j++) {
-                    if (output_container(v, j) == e) {
-                        found = true;
-                        break;
-                    }
+                if (num_xx != gt[e_global].size()) {
+                    return false;
                 }
 
-                if (!found) {
-                    if (!m_quite) {
-                        RXMESH_ERROR(
-                            "RXMeshTest::verifier() element {} is not incident "
-                            "to {}",
-                            e, src_ele);
+                // Check completeness
+                // check if all ground truth XX are in the output
+                for (uint32_t i = 0; i < gt[e_global].size(); ++i) {
+                    uint32_t xx = gt[e_global][i];
+
+                    bool found = false;
+                    for (uint32_t j = 0; j < output.get_num_attributes(); ++j) {
+                        OutputHandleT xxh = output(eh, j);
+                        if (xxh.is_valid()) {
+                            uint32_t xx_global = global_id_from_handle(xxh);
+                            if (xx_global == xx) {
+                                found = true;
+                                break;
+                            }
+                        }
+                    }
+
+                    if (!found) {
+                        return false;
                     }
-                    results = false;
                 }
             }
         }
 
-        return results;
+        return true;
     }
 
-    template <uint32_t patchSize>
-    void check_mapping(const RXMESH::RXMesh<patchSize>& rxmesh,
-                       const uint32_t                   patch_id,
-                       bool&                            is_edges_ok,
-                       bool&                            is_faces_ok)
+
+    void check_mapping(const rxmesh::RXMesh& rxmesh,
+                       const uint32_t        patch_id,
+                       bool&                 is_edges_ok,
+                       bool&                 is_faces_ok)
     {
         // check if the mapping is consistent i.e., going from local to
         // global gives the same results as from global to local
 
         // Number of edges and faces in this patch
-        uint32_t num_p_edges = rxmesh.m_h_ad_size[patch_id].y >> 1;
-        uint32_t num_p_faces = static_cast<uint32_t>(
-            static_cast<float>(rxmesh.m_h_ad_size[patch_id].w) / 3.0f);
+        uint32_t num_p_edges = rxmesh.m_h_patches_info[patch_id].num_edges;
+        uint32_t num_p_faces = rxmesh.m_h_patches_info[patch_id].num_faces;
 
         assert(num_p_edges <= std::numeric_limits<uint16_t>::max());
         assert(num_p_faces <= std::numeric_limits<uint16_t>::max());
@@ -516,15 +441,14 @@ class RXMeshTest
         is_faces_ok = check_mapping_faces(rxmesh, patch_id, num_p_faces);
     }
 
-    template <uint32_t patchSize>
-    bool check_mapping_edges(const RXMESH::RXMesh<patchSize>& rxmesh,
-                             const uint32_t                   patch_id,
-                             const uint32_t                   num_p_edges)
+    bool check_mapping_edges(const rxmesh::RXMesh& rxmesh,
+                             const uint32_t        patch_id,
+                             const uint32_t        num_p_edges)
     {
         // 1) For each local edge in the patch, get its global id using the
         // mapping (using m_h_patches_ltog_e)
 
-        // 2) get the local edge's local vertices (using m_h_patches_edges)
+        // 2) get the local edge's local vertices (using m_h_patches_ev)
 
         // 3) map the local vertices to their global id (using
         // m_h_patches_ltog_v)
@@ -539,26 +463,22 @@ class RXMeshTest
 
             // 1)
             // convert the local edge to global one
-            uint32_t e_ltog =
-                (rxmesh.m_h_patches_ltog_e.at(patch_id).at(e_l) >> 1);
+            uint32_t e_ltog = rxmesh.m_h_patches_ltog_e.at(patch_id).at(e_l);
 
             // 2)
             // get the local vertices
-            uint16_t v0_l = rxmesh.m_h_patches_edges.at(patch_id).at(e_l * 2);
-            uint16_t v1_l =
-                rxmesh.m_h_patches_edges.at(patch_id).at(e_l * 2 + 1);
+            uint16_t v0_l = rxmesh.m_h_patches_ev.at(patch_id).at(e_l * 2);
+            uint16_t v1_l = rxmesh.m_h_patches_ev.at(patch_id).at(e_l * 2 + 1);
 
             // 3)
             // convert the local vertices to global
-            uint32_t v0_ltog =
-                (rxmesh.m_h_patches_ltog_v.at(patch_id).at(v0_l) >> 1);
-            uint32_t v1_ltog =
-                (rxmesh.m_h_patches_ltog_v.at(patch_id).at(v1_l) >> 1);
+            uint32_t v0_ltog = rxmesh.m_h_patches_ltog_v.at(patch_id).at(v0_l);
+            uint32_t v1_ltog = rxmesh.m_h_patches_ltog_v.at(patch_id).at(v1_l);
 
 
             // 4)
             // use the converted vertices to look for the edge global id
-            auto my_edge = rxmesh.edge_key(v0_ltog, v1_ltog);
+            auto my_edge = rxmesh::detail::edge_key(v0_ltog, v1_ltog);
 
             uint32_t e_g;
             try {
@@ -570,7 +490,11 @@ class RXMeshTest
                         "find the corresponding edge between global vertices "
                         "{} and {} with local id {} and in patch {} of "
                         "converted to global vertices",
-                        v0_ltog, v1_ltog, v0_l, v1_l, patch_id);
+                        v0_ltog,
+                        v1_ltog,
+                        v0_l,
+                        v1_l,
+                        patch_id);
                 }
                 return false;
             }
@@ -588,7 +512,13 @@ class RXMeshTest
                         "{}, local edge id = {}, mapped to = {}, local "
                         "vertices id = ({}, {}) mapped to= ({}, {}), global "
                         "edge connecting the mapped global vertices = {}",
-                        patch_id, e_l, e_ltog, v0_l, v1_l, v0_ltog, v1_ltog,
+                        patch_id,
+                        e_l,
+                        e_ltog,
+                        v0_l,
+                        v1_l,
+                        v0_ltog,
+                        v1_ltog,
                         e_g);
                 }
                 return false;
@@ -597,64 +527,59 @@ class RXMeshTest
         return true;
     }
 
-    template <uint32_t patchSize>
-    bool check_mapping_faces(const RXMESH::RXMesh<patchSize>& rxmesh,
-                             const uint32_t                   patch_id,
-                             const uint32_t                   num_p_faces)
+    bool check_mapping_faces(const rxmesh::RXMesh& rxmesh,
+                             const uint32_t        patch_id,
+                             const uint32_t        num_p_faces)
     {
-        using namespace RXMESH;
+        using namespace rxmesh;
         // 1) for each local face in the patch, get its global id using the
         // mapping (using m_h_patches_ltog_f)
 
-        // 2) get the local face's local edges (using m_h_patches_faces)
+        // 2) get the local face's local edges (using m_h_patches_fe)
 
         // 3) map the local edges to their global id
         //(using m_h_patches_ltog_v)
 
         // 4) use the converted edges to get their global face id (using
-        // m_h_patches_faces)
+        // m_h_patches_fe)
 
 
         // 5) check if the resulting global face id in 4) matches that
         // obtained in 1)
-        uint32_t              deg = rxmesh.m_face_degree;
-        std::vector<uint16_t> e_l(deg);
-        std::vector<uint16_t> e_g(deg);
-        std::vector<uint16_t> e_ltog(deg);
+        std::vector<uint16_t> e_l(3);
+        std::vector<uint16_t> e_g(3);
+        std::vector<uint16_t> e_ltog(3);
 
         for (uint16_t f_l = 0; f_l < num_p_faces; ++f_l) {
             // 1)
             // convert the local face to global one
-            uint32_t f_ltog =
-                (rxmesh.m_h_patches_ltog_f.at(patch_id).at(f_l) >> 1);
+            uint32_t f_ltog = rxmesh.m_h_patches_ltog_f.at(patch_id).at(f_l);
 
             // 2)
             // get the local edges
-            for (uint32_t i = 0; i < deg; ++i) {
-                e_l[i] =
-                    rxmesh.m_h_patches_faces.at(patch_id).at(f_l * deg + i);
+            for (uint32_t i = 0; i < 3; ++i) {
+                e_l[i] = rxmesh.m_h_patches_fe.at(patch_id).at(f_l * 3 + i);
                 // shift right because the first bit is reserved for edge
                 // direction
                 flag_t dir(0);
-                RXMeshContext::unpack_edge_dir(e_l[i], e_l[i], dir);
+                Context::unpack_edge_dir(e_l[i], e_l[i], dir);
             }
 
             // 3)
             // convert the local edges to global
-            for (uint32_t i = 0; i < deg; ++i) {
-                e_ltog[i] =
-                    (rxmesh.m_h_patches_ltog_e.at(patch_id).at(e_l[i]) >> 1);
+            for (uint32_t i = 0; i < 3; ++i) {
+                e_ltog[i] = rxmesh.m_h_patches_ltog_e.at(patch_id).at(e_l[i]);
             }
 
             // 4)
             // from the mapped face (f_ltog) get its global edges
-            for (uint32_t i = 0; i < deg; ++i) {
+            for (uint32_t i = 0; i < 3; ++i) {
                 e_g[i] = m_h_FE[f_ltog][i];
             }
 
             // 5)
             // check if the global edges matches the mapping edges
-            for (uint32_t i = 0; i < deg; ++i) {
+            for (uint32_t i = 0; i < 3; ++i) {
                 if (e_g[i] != e_ltog[i]) {
                     if (!m_quite) {
                         RXMESH_ERROR(
@@ -664,9 +589,18 @@ class RXMeshTest
                             "edges id = ({}, {}, {}), mapped to = ({}, {}, "
                             "{}), global edges obtained from the mapped global "
                             "face= ({}, {}, {})",
-                            patch_id, f_l, f_ltog, e_l[0], e_l[1], e_l[2],
-                            e_ltog[0], e_ltog[1], e_ltog[2], e_ltog[0],
-                            e_ltog[1], e_ltog[2]);
+                            patch_id,
+                            f_l,
+                            f_ltog,
+                            e_l[0],
+                            e_l[1],
+                            e_l[2],
+                            e_ltog[0],
+                            e_ltog[1],
+                            e_ltog[2],
+                            e_ltog[0],
+                            e_ltog[1],
+                            e_ltog[2]);
                     }
                     return false;
                 }
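
The rewritten verifier keeps the original two-way check, now expressed over handles: every handle the GPU reports must map to a ground-truth neighbor (correctness), the count of valid handles must match the ground-truth count, and every ground-truth neighbor must appear among the reported handles (completeness). Stripped of patches and handles, the check reduces to the following sketch (illustrative only, not the repository's verifier):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Two-way verification over plain index lists.
    bool two_way_verify(const std::vector<uint32_t>& reported,
                        const std::vector<uint32_t>& ground_truth)
    {
        // correctness: everything reported is actually incident
        for (uint32_t r : reported) {
            if (std::find(ground_truth.begin(), ground_truth.end(), r) ==
                ground_truth.end()) {
                return false;
            }
        }
        // completeness: everything incident was reported
        for (uint32_t g : ground_truth) {
            if (std::find(reported.begin(), reported.end(), g) ==
                reported.end()) {
                return false;
            }
        }
        return true;
    }
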
diff --git a/tests/RXMesh_test/rxmesh_test_main.cu b/tests/RXMesh_test/rxmesh_test_main.cu
index d6ae3274..a8634c69 100644
--- a/tests/RXMesh_test/rxmesh_test_main.cu
+++ b/tests/RXMesh_test/rxmesh_test_main.cu
@@ -5,28 +5,26 @@
 #include "rxmesh/util/vector.h"
 
 using dataT = float;
-std::vector<std::vector<dataT>> Verts;
 
 struct RXMeshTestArg
 {
-    uint32_t    num_run = 1;
-    uint32_t    device_id = 0;
+    uint32_t    num_run       = 1;
+    uint32_t    device_id     = 0;
     std::string obj_file_name = STRINGIFY(INPUT_DIR) "sphere3.obj";
     std::string output_folder = STRINGIFY(OUTPUT_DIR);
-    bool        quite = false;
-    bool        shuffle = false;
-    bool        sort = false;
-    int         argc = argc;
-    char**      argv = argv;
+    bool        quite         = false;
+    int         argc          = argc;
+    char**      argv          = argv;
 } rxmesh_args;
 
 #include "test_higher_queries.h"
 #include "test_queries.h"
-
+#include "test_attribute.cuh"
+#include "test_for_each.h"
 
 int main(int argc, char** argv)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
     Log::init();
 
     ::testing::InitGoogleTest(&argc, argv);
@@ -41,10 +39,8 @@ int main(int argc, char** argv)
                         "              Default is {} \n"
                         "              Hint: Only accepts OBJ files\n"
                         " -o:          JSON file output folder. Default is {} \n"
-                        " -num_run:    Number of iterations for performance testing. Default is {} \n"
-                        " -q:          Run in quite mode.\n"
-                        " -s:          Shuffle input. Default is false.\n"
-                        " -p:          Sort input using patching output. Default is false.\n"
+                        " -num_run:    Number of iterations for performance testing. Default is {} \n"                        
+                        " -q:          Run in quite mode. Default is false\n"
                         " -device_id:  GPU device ID. Default is {}",
             rxmesh_args.obj_file_name, rxmesh_args.output_folder ,rxmesh_args.num_run,rxmesh_args.device_id);
             // clang-format on
@@ -72,18 +68,11 @@ int main(int argc, char** argv)
         if (cmd_option_exists(argv, argc + argv, "-q")) {
             rxmesh_args.quite = true;
         }
-        if (cmd_option_exists(argv, argc + argv, "-s")) {
-            rxmesh_args.shuffle = true;
-        }
-        if (cmd_option_exists(argv, argc + argv, "-p")) {
-            rxmesh_args.sort = true;
-        }
     }
 
     if (!rxmesh_args.quite) {
         RXMESH_TRACE("input= {}", rxmesh_args.obj_file_name);
         RXMESH_TRACE("output_folder= {}", rxmesh_args.output_folder);
-        RXMESH_TRACE("PATCH_SIZE= {}", PATCH_SIZE);
         RXMESH_TRACE("num_run= {}", rxmesh_args.num_run);
         RXMESH_TRACE("device_id= {}", rxmesh_args.device_id);
     }
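
main() keeps the same flag-scanning pattern after dropping the -s and -p options. For reference, the lookup it relies on amounts to a linear scan over argv; the stand-in below only illustrates the idea, while the real cmd_option_exists lives in RXMesh's util headers and may differ:

    #include <algorithm>
    #include <string>

    // Stand-in for cmd_option_exists(): true if `flag` appears anywhere in [begin, end).
    inline bool flag_exists(char** begin, char** end, const std::string& flag)
    {
        return std::find(begin, end, flag) != end;
    }

    // usage mirroring main():
    //     if (flag_exists(argv, argv + argc, "-q")) { rxmesh_args.quite = true; }
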
diff --git a/tests/RXMesh_test/test_attribute.cu b/tests/RXMesh_test/test_attribute.cu
deleted file mode 100644
index 049c9139..00000000
--- a/tests/RXMesh_test/test_attribute.cu
+++ /dev/null
@@ -1,458 +0,0 @@
-#include "gtest/gtest.h"
-#include "rxmesh/rxmesh_attribute.h"
-#include "rxmesh/util/macros.h"
-#include "rxmesh/util/vector.h"
-
-/**
- * test_vector()
- */
-__global__ static void test_vector(
-    RXMESH::RXMeshAttribute<RXMESH::Vector3f> mesh_attr,
-    uint32_t*                                 suceess)
-{
-
-    if (threadIdx.x == 0 && blockIdx.x == 0) {
-        *suceess = 1;
-
-        assert((mesh_attr.get_allocated() & RXMESH::DEVICE) == RXMESH::DEVICE);
-        uint32_t num_mesh_elements = mesh_attr.get_num_mesh_elements();
-        for (uint32_t i = 0; i < num_mesh_elements; ++i) {
-            const auto& vec = mesh_attr(i);
-            if (vec[0] != i + 0 || vec[1] != i + 1 || vec[2] != i + 2) {
-                *suceess = 0;
-                return;
-            }
-        }
-    }
-}
-
-/**
- * test_values()
- */
-template <class T>
-__global__ static void test_values(RXMESH::RXMeshAttribute<T> mesh_attr,
-                                   uint32_t*                  suceess)
-{
-
-    if (threadIdx.x == 0 && blockIdx.x == 0) {
-        *suceess = 1;
-
-        assert((mesh_attr.get_allocated() & RXMESH::DEVICE) == RXMESH::DEVICE);
-        uint32_t num_mesh_elements = mesh_attr.get_num_mesh_elements();
-        for (uint32_t i = 0; i < num_mesh_elements; ++i) {
-            for (uint32_t j = 0; j < mesh_attr.get_num_attribute_per_element();
-                 ++j) {
-                if (mesh_attr(i, j) != i + j) {
-
-                    *suceess = 0;
-                    return;
-                }
-            }
-        }
-    }
-}
-
-/**
- * generate_values()
- */
-template <class T>
-__global__ static void generate_values(RXMESH::RXMeshAttribute<T> mesh_attr)
-{
-
-    if (threadIdx.x == 0 && blockIdx.x == 0) {
-        assert((mesh_attr.get_allocated() & RXMESH::DEVICE) == RXMESH::DEVICE);
-
-        uint32_t num_mesh_elements = mesh_attr.get_num_mesh_elements();
-        for (uint32_t i = 0; i < num_mesh_elements; ++i) {
-            for (uint32_t j = 0; j < mesh_attr.get_num_attribute_per_element();
-                 ++j) {
-                mesh_attr(i, j) = i + j;
-            }
-        }
-    }
-}
-
-
-bool test_host(uint32_t attributes_per_element)
-{
-    using namespace RXMESH;
-    // mesh attr on host
-    uint32_t                       num_mesh_elements = 2048;
-    RXMESH::RXMeshAttribute<float> rxmesh_attr;
-
-    rxmesh_attr.set_name("float_attr");
-    rxmesh_attr.init(num_mesh_elements, attributes_per_element, RXMESH::HOST,
-                     RXMESH::AoS);
-
-    // generate some numbers as AoS
-    for (uint32_t i = 0; i < num_mesh_elements; ++i) {
-        for (uint32_t j = 0; j < attributes_per_element; ++j) {
-            rxmesh_attr(i, j) = i + j;
-        }
-    }
-
-    // change the layout to SoA (good for gpu)
-    rxmesh_attr.change_layout(RXMESH::HOST);
-
-    // move memory to device
-    rxmesh_attr.move(RXMESH::HOST, RXMESH::DEVICE);
-
-
-    // device success variable
-    uint32_t* d_success = nullptr;
-    CUDA_ERROR(cudaMalloc((void**)&d_success, sizeof(uint32_t)));
-
-
-    // actual testing
-    test_values<float><<<1, 1>>>(rxmesh_attr, d_success);
-
-    CUDA_ERROR(cudaPeekAtLastError());
-    CUDA_ERROR(cudaGetLastError());
-    CUDA_ERROR(cudaDeviceSynchronize());
-
-    // host success variable
-    uint32_t h_success(0);
-    CUDA_ERROR(cudaMemcpy(&h_success, d_success, sizeof(uint32_t),
-                          cudaMemcpyDeviceToHost));
-
-    // free device
-    GPU_FREE(d_success);
-
-    // release rxmesh_attribute memory on host and device
-    rxmesh_attr.release();
-
-    // reporting
-    return h_success == 1;
-}
-
-
-bool test_device(uint32_t attributes_per_element)
-{
-    using namespace RXMESH;
-    // Test generating values on device and processing it on host
-
-    // mesh attr on host (but allocated on device)
-    uint32_t                          num_mesh_elements = 2048;
-    RXMESH::RXMeshAttribute<uint32_t> rxmesh_attr;
-    rxmesh_attr.set_name("int_attr");
-    rxmesh_attr.init(num_mesh_elements, attributes_per_element, RXMESH::DEVICE);
-
-
-    // generate some numbers on device
-    generate_values<<<1, 1>>>(rxmesh_attr);
-
-    CUDA_ERROR(cudaDeviceSynchronize());
-    CUDA_ERROR(cudaGetLastError());
-
-
-    // move the generate values to host
-    rxmesh_attr.move(RXMESH::DEVICE, RXMESH::HOST);
-
-    // change the layout to SoA
-    rxmesh_attr.change_layout(RXMESH::HOST);
-
-    // testing
-    bool suceess = true;
-    assert((rxmesh_attr.get_allocated() & RXMESH::HOST) == RXMESH::HOST);
-    num_mesh_elements = rxmesh_attr.get_num_mesh_elements();
-
-    for (uint32_t i = 0; i < num_mesh_elements; ++i) {
-        for (uint32_t j = 0; j < attributes_per_element; ++j) {
-            if (rxmesh_attr(i, j) != i + j) {
-                suceess = false;
-                break;
-            }
-        }
-        if (!suceess) {
-            break;
-        }
-    }
-
-    // release rxmesh_attribute memory on host and device
-    rxmesh_attr.release();
-
-    return suceess;
-}
-
-/*bool test_vector()
-{
-    using namespace RXMESH;
-    // mesh attr on host
-    uint32_t                         num_mesh_elements = 2048;
-    RXMESH::RXMeshAttribute<Vector3f> rxmesh_attr;
-
-    rxmesh_attr.set_name("vector3f_attr");
-    rxmesh_attr.init(num_mesh_elements, 1, RXMESH::HOST, RXMESH::AoS);
-
-    // generate some numbers as AoS
-    for (uint32_t i = 0; i < num_mesh_elements; ++i) {
-        auto& vec = rxmesh_attr(i);
-        vec[0] = i + 0;
-        vec[1] = i + 1;
-        vec[2] = i + 2;
-    }
-
-    // move memory to device
-    rxmesh_attr.move(RXMESH::HOST, RXMESH::DEVICE);
-
-
-    // device success variable
-    uint32_t* d_success = nullptr;
-    CUDA_ERROR(cudaMalloc((void**)&d_success, sizeof(uint32_t)));
-
-
-    // actual testing
-    test_vector<<<1, 1>>>(rxmesh_attr, d_success);
-
-    CUDA_ERROR(cudaPeekAtLastError());
-    CUDA_ERROR(cudaGetLastError());
-    CUDA_ERROR(cudaDeviceSynchronize());
-
-    // host success variable
-    uint32_t h_success(0);
-    CUDA_ERROR(cudaMemcpy(&h_success, d_success, sizeof(uint32_t),
-                          cudaMemcpyDeviceToHost));
-
-    // free device
-    GPU_FREE(d_success);
-
-    // release rxmesh_attribute memory on host and device
-    rxmesh_attr.release();
-
-    // reporting
-    return h_success == 1;
-}*/
-
-bool test_axpy(uint32_t attributes_per_element)
-{
-    using namespace RXMESH;
-
-    float x_val(1.0), y_val(3.0), alpha_val(5.0), beta_val(7.0);
-
-    uint32_t                       num_mesh_elements = 2048;
-    RXMESH::RXMeshAttribute<float> X;
-    RXMESH::RXMeshAttribute<float> Y;
-
-    X.set_name("X");
-    Y.set_name("Y");
-    X.init(num_mesh_elements, attributes_per_element, RXMESH::HOST,
-           RXMESH::AoS);
-    Y.init(num_mesh_elements, attributes_per_element, RXMESH::HOST,
-           RXMESH::AoS);
-
-    // generate some numbers as AoS
-    for (uint32_t i = 0; i < num_mesh_elements; ++i) {
-        for (uint32_t j = 0; j < attributes_per_element; ++j) {
-            X(i, j) = x_val;
-            Y(i, j) = y_val;
-        }
-    }
-
-    X.change_layout(RXMESH::HOST);
-    Y.change_layout(RXMESH::HOST);
-    X.move(RXMESH::HOST, RXMESH::DEVICE);
-    Y.move(RXMESH::HOST, RXMESH::DEVICE);
-
-    // call axpy
-    Vector<3, float> alpha(alpha_val);
-    Vector<3, float> beta(beta_val);
-    Y.axpy(X, alpha, beta);
-
-    // sync
-    CUDA_ERROR(cudaDeviceSynchronize());
-    CUDA_ERROR(cudaPeekAtLastError());
-    CUDA_ERROR(cudaGetLastError());
-
-
-    // move to host (don't need to move X
-    Y.move(RXMESH::DEVICE, RXMESH::HOST);
-
-    // check results
-    bool is_passed = true;
-    for (uint32_t i = 0; i < num_mesh_elements; ++i) {
-        for (uint32_t j = 0; j < attributes_per_element; ++j) {
-            if (std::abs(Y(i, j) - (alpha_val * x_val + beta_val * y_val)) >
-                0.0001) {
-                is_passed = false;
-                break;
-            }
-        }
-        if (!is_passed) {
-            break;
-        }
-    }
-
-    // release rxmesh_attribute memory on host and device
-    X.release();
-    Y.release();
-
-
-    return is_passed;
-}
-
-
-bool test_reduce()
-{
-    using namespace RXMESH;
-    constexpr uint32_t             attributes_per_element = 3;
-    uint32_t                       num_mesh_elements = 2048;
-    RXMESH::RXMeshAttribute<float> X;
-
-    X.set_name("X");
-    X.init(num_mesh_elements, attributes_per_element, RXMESH::HOST,
-           RXMESH::AoS);
-
-    // generate some numbers as AoS
-    for (uint32_t i = 0; i < num_mesh_elements; ++i) {
-        for (uint32_t j = 0; j < attributes_per_element; ++j) {
-            X(i, j) = j + 1;
-        }
-    }
-
-    X.change_layout(RXMESH::HOST);
-    X.move(RXMESH::HOST, RXMESH::DEVICE);
-    Vector<attributes_per_element, float> output;
-
-    // call reduce
-    X.reduce(output, RXMESH::SUM);
-
-
-    // sync
-    CUDA_ERROR(cudaDeviceSynchronize());
-    CUDA_ERROR(cudaPeekAtLastError());
-    CUDA_ERROR(cudaGetLastError());
-
-    bool is_passed = true;
-
-    for (uint32_t j = 0; j < attributes_per_element; ++j) {
-        if (output[j] != num_mesh_elements * (j + 1)) {
-            is_passed = false;
-            break;
-        }
-    }
-
-    // release rxmesh_attribute memory on host and device
-    X.release();
-
-
-    return is_passed;
-}
-
-
-bool test_norm2()
-{
-    using namespace RXMESH;
-    constexpr uint32_t             attributes_per_element = 3;
-    uint32_t                       num_mesh_elements = 2048;
-    RXMESH::RXMeshAttribute<float> X;
-
-    X.set_name("X");
-    X.init(num_mesh_elements, attributes_per_element, RXMESH::HOST,
-           RXMESH::AoS);
-
-    // generate some numbers as AoS
-    for (uint32_t i = 0; i < num_mesh_elements; ++i) {
-        for (uint32_t j = 0; j < attributes_per_element; ++j) {
-            X(i, j) = 2;
-        }
-    }
-
-    X.change_layout(RXMESH::HOST);
-    X.move(RXMESH::HOST, RXMESH::DEVICE);
-    Vector<attributes_per_element, float> output;
-
-    // call reduce
-    X.reduce(output, RXMESH::NORM2);
-
-
-    // sync
-    CUDA_ERROR(cudaDeviceSynchronize());
-    CUDA_ERROR(cudaPeekAtLastError());
-    CUDA_ERROR(cudaGetLastError());
-
-    bool is_passed = true;
-
-    for (uint32_t j = 0; j < attributes_per_element; ++j) {
-        if (output[j] != 4 * num_mesh_elements) {
-            is_passed = false;
-            break;
-        }
-    }
-
-    // release rxmesh_attribute memory on host and device
-    X.release();
-
-
-    return is_passed;
-}
-
-
-bool test_dot()
-{
-    using namespace RXMESH;
-    constexpr uint32_t             attributes_per_element = 3;
-    uint32_t                       num_mesh_elements = 2048;
-    RXMESH::RXMeshAttribute<float> X;
-    RXMESH::RXMeshAttribute<float> Y;
-
-    X.set_name("X");
-    Y.set_name("Y");
-    X.init(num_mesh_elements, attributes_per_element, RXMESH::HOST,
-           RXMESH::AoS);
-    Y.init(num_mesh_elements, attributes_per_element, RXMESH::HOST,
-           RXMESH::AoS);
-
-    // generate some numbers as AoS
-    for (uint32_t i = 0; i < num_mesh_elements; ++i) {
-        for (uint32_t j = 0; j < attributes_per_element; ++j) {
-            X(i, j) = 2;
-            Y(i, j) = 3;
-        }
-    }
-
-    X.change_layout(RXMESH::HOST);
-    X.move(RXMESH::HOST, RXMESH::DEVICE);
-    Y.change_layout(RXMESH::HOST);
-    Y.move(RXMESH::HOST, RXMESH::DEVICE);
-    Vector<attributes_per_element, float> output;
-
-    // call reduce
-    X.reduce(output, RXMESH::DOT, &Y);
-
-
-    // sync
-    CUDA_ERROR(cudaDeviceSynchronize());
-    CUDA_ERROR(cudaPeekAtLastError());
-    CUDA_ERROR(cudaGetLastError());
-
-    bool is_passed = true;
-
-    for (uint32_t j = 0; j < attributes_per_element; ++j) {
-        if (output[j] != 6 * num_mesh_elements) {
-            is_passed = false;
-            break;
-        }
-    }
-
-    // release rxmesh_attribute memory on host and device
-    X.release();
-    Y.release();
-
-
-    return is_passed;
-}
-
-
-TEST(RXMesh, Attributes)
-{
-    using namespace RXMESH;
-    EXPECT_TRUE(test_host(3u)) << " TestAttributes::tes_host failed";
-    EXPECT_TRUE(test_device(3u)) << " TestAttributes::tes_device failed";
-    // EXPECT_TRUE(test_vector()) << " TestAttributes::test_vector failed";
-    EXPECT_TRUE(test_axpy(3u)) << " TestAttributes::test_axpy failed";
-    EXPECT_TRUE(test_reduce()) << " TestAttributes::test_reduce failed";
-    EXPECT_TRUE(test_norm2()) << " TestAttributes::test_norm2 failed";
-    EXPECT_TRUE(test_dot()) << " TestAttributes::test_dot failed";
-
-    CUDA_ERROR(cudaDeviceSynchronize());
-}
\ No newline at end of file
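
The deleted per-operation reduction tests (axpy, reduce, norm2, dot) are replaced below by ReduceHandle-based tests on typed attributes. The expected value in the new Norm2 test is just the closed form for a constant-valued attribute, sketched here as a trivial helper (for illustration only, not part of the test suite):

    #include <cmath>
    #include <cstdint>

    // Every vertex holds the constant `val`, so the L2 norm over n vertices is sqrt(n * val^2).
    inline float expected_norm2(uint32_t num_vertices, float val)
    {
        return std::sqrt(static_cast<float>(num_vertices) * val * val);
    }
    // e.g. expected_norm2(rxmesh.get_num_vertices(), 2.0f) matches the EXPECT_FLOAT_EQ in Attribute.Norm2.
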
diff --git a/tests/RXMesh_test/test_attribute.cuh b/tests/RXMesh_test/test_attribute.cuh
new file mode 100644
index 00000000..4804e12e
--- /dev/null
+++ b/tests/RXMesh_test/test_attribute.cuh
@@ -0,0 +1,170 @@
+#include "gtest/gtest.h"
+#include "rxmesh/attribute.h"
+#include "rxmesh/reduce_handle.h"
+#include "rxmesh/util/macros.h"
+
+template <typename T>
+void populate(rxmesh::RXMeshStatic&       rxmesh,
+              rxmesh::VertexAttribute<T>& v,
+              T                           val)
+{
+    rxmesh.for_each_vertex(
+        rxmesh::DEVICE,
+        [v, val] __device__(const rxmesh::VertexHandle vh) { v(vh) = val; });
+
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+}
+
+
+template <typename T>
+void populate(rxmesh::RXMeshStatic& rxmesh, rxmesh::FaceAttribute<T>& f, T val)
+{
+    rxmesh.for_each_face(
+        rxmesh::DEVICE,
+        [f, val] __device__(const rxmesh::FaceHandle fh) { f(fh) = val; });
+
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+}
+
+template <typename T>
+void populate(rxmesh::RXMeshStatic&       rxmesh,
+              rxmesh::VertexAttribute<T>& v1,
+              rxmesh::VertexAttribute<T>& v2,
+              T                           v1_val,
+              T                           v2_val)
+{
+    rxmesh.for_each_vertex(
+        rxmesh::DEVICE,
+        [v1, v2, v1_val, v2_val] __device__(const rxmesh::VertexHandle vh) {
+            v1(vh) = v1_val;
+            v2(vh) = v2_val;
+        });
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+}
+
+TEST(Attribute, Norm2)
+{
+    using namespace rxmesh;
+
+    CUDA_ERROR(cudaDeviceReset());
+
+    cuda_query(rxmesh_args.device_id, rxmesh_args.quite);
+
+    std::vector<std::vector<dataT>>    Verts;
+    std::vector<std::vector<uint32_t>> Faces;
+
+    ASSERT_TRUE(
+        import_obj(STRINGIFY(INPUT_DIR) "sphere3.obj", Verts, Faces, true));
+
+    RXMeshStatic rxmesh(Faces, rxmesh_args.quite);
+
+    auto attr = rxmesh.add_vertex_attribute<float>("v", 3, rxmesh::DEVICE);
+
+    const float val(2.0);
+
+    populate<float>(rxmesh, *attr, val);
+
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+
+    ReduceHandle reduce(*attr);
+
+    float output = reduce.norm2(*attr);
+
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+
+    EXPECT_FLOAT_EQ(output, std::sqrt(val * val * rxmesh.get_num_vertices()));
+}
+
+
+TEST(Attribute, Dot)
+{
+    using namespace rxmesh;
+
+    cuda_query(rxmesh_args.device_id, rxmesh_args.quite);
+
+    std::vector<std::vector<dataT>>    Verts;
+    std::vector<std::vector<uint32_t>> Faces;
+
+    ASSERT_TRUE(
+        import_obj(STRINGIFY(INPUT_DIR) "sphere3.obj", Verts, Faces, true));
+
+    RXMeshStatic rxmesh(Faces, rxmesh_args.quite);
+
+    auto v1_attr = rxmesh.add_vertex_attribute<float>("v1", 3, rxmesh::DEVICE);
+    auto v2_attr = rxmesh.add_vertex_attribute<float>("v2", 3, rxmesh::DEVICE);
+
+    const float v1_val(2.0);
+    const float v2_val(3.0);
+
+    populate<float>(rxmesh, *v1_attr, *v2_attr, v1_val, v2_val);
+
+    ReduceHandle reduce(*v1_attr);
+
+    float output = reduce.dot(*v1_attr, *v2_attr);
+
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+
+    EXPECT_FLOAT_EQ(output, v1_val * v2_val * rxmesh.get_num_vertices());
+}
+
+
+TEST(Attribute, CopyFrom)
+{
+    using namespace rxmesh;
+
+    cuda_query(rxmesh_args.device_id, rxmesh_args.quite);
+
+    std::vector<std::vector<dataT>>    Verts;
+    std::vector<std::vector<uint32_t>> Faces;
+
+    ASSERT_TRUE(
+        import_obj(STRINGIFY(INPUT_DIR) "sphere3.obj", Verts, Faces, true));
+
+
+    RXMeshStatic rxmesh(Faces, rxmesh_args.quite);
+
+    auto f_device = rxmesh.add_face_attribute<uint32_t>("d", 3, DEVICE);
+
+    auto f_host = rxmesh.add_face_attribute<uint32_t>("h", 3, HOST);
+
+    uint32_t val = 99;
+
+    populate<uint32_t>(rxmesh, *f_device, val);
+
+    f_host->copy_from(*f_device, DEVICE, HOST);
+
+    rxmesh.for_each_face(
+        HOST, [&](const FaceHandle fh) { EXPECT_EQ((*f_host)(fh), val); });
+}
+
+TEST(Attribute, AddingAndRemoving)
+{
+    using namespace rxmesh;
+
+    cuda_query(rxmesh_args.device_id, rxmesh_args.quite);
+
+    std::vector<std::vector<dataT>>    Verts;
+    std::vector<std::vector<uint32_t>> Faces;
+
+    ASSERT_TRUE(
+        import_obj(STRINGIFY(INPUT_DIR) "sphere3.obj", Verts, Faces, true));
+
+
+    RXMeshStatic rxmesh(Faces, rxmesh_args.quite);
+
+    std::string attr_name = "v_attr";
+
+    auto vertex_attr =
+        rxmesh.add_vertex_attribute<float>(attr_name, 3, rxmesh::LOCATION_ALL);
+
+    EXPECT_TRUE(rxmesh.does_attribute_exist(attr_name));
+
+
+    vertex_attr->move(rxmesh::HOST, rxmesh::DEVICE);
+
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+
+    // this is not necessary in general, but we are just testing the
+    // functionality here
+    rxmesh.remove_attribute(attr_name);
+}
\ No newline at end of file
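A note on the expected values in the Norm2 and Dot tests above: every vertex entry is a constant, so with |V| vertices the reductions collapse to sqrt(val * val * |V|) and v1_val * v2_val * |V|, which is exactly what the EXPECT_FLOAT_EQ lines check. A minimal host-side sketch of that arithmetic (plain C++; the vertex count below is a made-up placeholder, not the real count of sphere3.obj):

#include <cmath>
#include <cstdio>

int main()
{
    const double   val          = 2.0;  // value stored at every vertex in the Norm2 test
    const double   v1_val       = 2.0;  // values stored in the Dot test
    const double   v2_val       = 3.0;
    const unsigned num_vertices = 642;  // placeholder vertex count

    // norm2 of a constant attribute: sqrt(sum_v val^2) = sqrt(val^2 * |V|)
    const double norm2 = std::sqrt(val * val * num_vertices);

    // dot of two constant attributes: sum_v v1_val * v2_val = v1_val * v2_val * |V|
    const double dot = v1_val * v2_val * num_vertices;

    std::printf("expected norm2 = %g, expected dot = %g\n", norm2, dot);
    return 0;
}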
diff --git a/tests/RXMesh_test/test_for_each.h b/tests/RXMesh_test/test_for_each.h
new file mode 100644
index 00000000..e097c625
--- /dev/null
+++ b/tests/RXMesh_test/test_for_each.h
@@ -0,0 +1,37 @@
+#include "gtest/gtest.h"
+
+#include "rxmesh/util/cuda_query.h"
+#include "rxmesh/util/import_obj.h"
+
+TEST(RXMeshStatic, ForEach)
+{
+    using namespace rxmesh;
+
+    cuda_query(rxmesh_args.device_id, rxmesh_args.quite);
+
+    std::vector<std::vector<dataT>>    Verts;
+    std::vector<std::vector<uint32_t>> Faces;
+
+    ASSERT_TRUE(
+        import_obj(STRINGIFY(INPUT_DIR) "cube.obj", Verts, Faces, true));
+
+
+    RXMeshStatic rxmesh_static(Faces, rxmesh_args.quite);
+
+    std::atomic_uint32_t num_v = 0;
+    std::atomic_uint32_t num_e = 0;
+    std::atomic_uint32_t num_f = 0;
+
+    rxmesh_static.for_each_vertex(HOST,
+                                  [&](const VertexHandle vh) { num_v++; });
+
+    rxmesh_static.for_each_edge(HOST, [&](const EdgeHandle eh) { num_e++; });
+
+    rxmesh_static.for_each_face(HOST, [&](const FaceHandle fh) { num_f++; });
+
+    EXPECT_EQ(num_v, rxmesh_static.get_num_vertices());
+
+    EXPECT_EQ(num_e, rxmesh_static.get_num_edges());
+
+    EXPECT_EQ(num_f, rxmesh_static.get_num_faces());
+}
\ No newline at end of file
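The counters in the ForEach test above are std::atomic on the assumption that the host-side for_each callbacks may be invoked concurrently; a minimal sketch of the same counting pattern with a hypothetical stand-in traversal (not the RXMesh API):

#include <atomic>
#include <cstddef>
#include <cstdio>

// Hypothetical stand-in for a host-side for_each over mesh elements.
template <typename F>
void for_each_element(std::size_t n, F&& f)
{
    for (std::size_t i = 0; i < n; ++i) {
        f(i);
    }
}

int main()
{
    const std::size_t    num_elements = 8;
    std::atomic_uint32_t count{0};

    for_each_element(num_elements, [&](std::size_t) { count++; });

    std::printf("counted %u of %zu elements\n", count.load(), num_elements);
    return 0;
}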
diff --git a/tests/RXMesh_test/test_higher_queries.h b/tests/RXMesh_test/test_higher_queries.h
index 7fb94b24..1cc5ffd5 100644
--- a/tests/RXMesh_test/test_higher_queries.h
+++ b/tests/RXMesh_test/test_higher_queries.h
@@ -1,63 +1,57 @@
 #include "gtest/gtest.h"
 #include "higher_query.cuh"
-#include "rxmesh/rxmesh_attribute.h"
+#include "rxmesh/attribute.h"
 #include "rxmesh/rxmesh_static.h"
 #include "rxmesh/util/import_obj.h"
 #include "rxmesh_test.h"
 
-using namespace RXMESH;
-
-TEST(RXMesh, HigherQueries)
+TEST(RXMeshStatic, HigherQueries)
 {
+    using namespace rxmesh;
+
     // Select device
     cuda_query(rxmesh_args.device_id, rxmesh_args.quite);
 
+    std::vector<std::vector<dataT>>    Verts;
     std::vector<std::vector<uint32_t>> Faces;
-    if (!import_obj(rxmesh_args.obj_file_name, Verts, Faces,
-                    rxmesh_args.quite)) {
-        exit(EXIT_FAILURE);
-    }
+    ASSERT_TRUE(import_obj(
+        STRINGIFY(INPUT_DIR) "sphere3.obj", Verts, Faces, rxmesh_args.quite));
 
     // RXMesh
-    RXMeshStatic<PATCH_SIZE> rxmesh_static(Faces, Verts, false,
-                                           rxmesh_args.quite);
+    RXMeshStatic rxmesh(Faces, rxmesh_args.quite);
 
-    uint32_t input_size = rxmesh_static.get_num_vertices();
 
     // input/output container
-    RXMeshAttribute<uint32_t> input_container;
-    input_container.init(input_size, 1u, RXMESH::DEVICE, RXMESH::AoS, false,
-                         false);
+    auto input = rxmesh.add_vertex_attribute<VertexHandle>("input", 1);
+    input->reset(VertexHandle(), rxmesh::DEVICE);
 
-    RXMeshAttribute<uint32_t> output_container;
-    output_container.init(input_size,
-                          input_size,  // that is a bit excessive
-                          RXMESH::DEVICE, RXMESH::SoA, false, false);
+    // we assume that every vertex could have up to num_vertices neighbor
+    // vertices, which is a bit excessive
+    auto output = rxmesh.add_vertex_attribute<VertexHandle>(
+        "output", rxmesh.get_num_vertices());
+    output->reset(VertexHandle(), rxmesh::DEVICE);
 
     // launch box
-    constexpr uint32_t      blockThreads = 512;
+    constexpr uint32_t      blockThreads = 256;
     LaunchBox<blockThreads> launch_box;
-    rxmesh_static.prepare_launch_box(Op::VV, launch_box, true, false);
+    rxmesh.prepare_launch_box(
+        Op::VV, launch_box, (void*)higher_query<blockThreads, Op::VV>, false);
 
-    output_container.reset(INVALID32, RXMESH::DEVICE);
-    input_container.reset(INVALID32, RXMESH::DEVICE);
 
-    ::RXMeshTest tester(true);
+    RXMeshTest tester(rxmesh, Faces, true);
 
     // launch
-    higher_query<Op::VV, blockThreads>
+    higher_query<blockThreads, Op::VV>
         <<<launch_box.blocks, blockThreads, launch_box.smem_bytes_dyn>>>(
-            rxmesh_static.get_context(), input_container, output_container);
+            rxmesh.get_context(), *input, *output);
+
+    CUDA_ERROR(cudaGetLastError());
+    CUDA_ERROR(cudaDeviceSynchronize());
 
     // move containers to the CPU for testing
-    output_container.move(RXMESH::DEVICE, RXMESH::HOST);
-    input_container.move(RXMESH::DEVICE, RXMESH::HOST);
+    output->move(rxmesh::DEVICE, rxmesh::HOST);
+    input->move(rxmesh::DEVICE, rxmesh::HOST);
 
     // verify
-    EXPECT_TRUE(tester.run_higher_query_verifier(rxmesh_static, input_container,
-                                                 output_container));
-
-
-    input_container.release();
-    output_container.release();
+    EXPECT_TRUE(tester.run_test(rxmesh, Faces, *input, *output, true));
 }
\ No newline at end of file
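The higher_query launch above takes its dynamic shared-memory size from launch_box.smem_bytes_dyn and passes it as the third kernel-launch parameter. A minimal, self-contained CUDA sketch of that launch mechanism, independent of RXMesh:

#include <cstdio>

__global__ void fill_shared(int n)
{
    extern __shared__ int s[];  // sized by the 3rd kernel-launch parameter
    if (threadIdx.x < n) {
        s[threadIdx.x] = threadIdx.x;
    }
    __syncthreads();
    if (threadIdx.x == 0) {
        printf("s[n-1] = %d\n", s[n - 1]);
    }
}

int main()
{
    const int    n              = 256;
    const size_t smem_bytes_dyn = n * sizeof(int);
    fill_shared<<<1, n, smem_bytes_dyn>>>(n);
    cudaDeviceSynchronize();
    return 0;
}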
diff --git a/tests/RXMesh_test/test_iterator.cu b/tests/RXMesh_test/test_iterator.cu
index 33dae66d..13259624 100644
--- a/tests/RXMesh_test/test_iterator.cu
+++ b/tests/RXMesh_test/test_iterator.cu
@@ -1,99 +1,105 @@
 #include "gtest/gtest.h"
-#include "rxmesh/kernels/rxmesh_iterator.cuh"
+#include "rxmesh/iterator.cuh"
 #include "rxmesh/util/util.h"
-template <uint32_t fixedOffset>
-__global__ static void test_iterator(uint32_t* suceess,
-                                     uint32_t* ltog_map,
-                                     uint16_t* patch_output,
-                                     uint32_t  num_elements)
-{
-    using namespace RXMESH;
-    uint32_t       local_id = threadIdx.x;
-    RXMeshIterator iter(local_id, patch_output, patch_output, ltog_map,
-                        fixedOffset, 0);
-
-    if (iter.local_id() != local_id) {
-        atomicAdd(suceess, 1u);
-        return;
-    }
-
-    if (iter.size() != fixedOffset) {
-        atomicAdd(suceess, 1u);
-        return;
-    }
 
-    uint32_t truth = num_elements - threadIdx.x - 1;
-    if (iter[0] != truth || iter[1] != truth || iter[2] != truth ||
-        iter.back() != truth || iter.front() != truth) {
-        atomicAdd(suceess, 1u);
-        return;
-    }
+template <typename HandleT>
+__global__ static void test_iterator(uint32_t*       suceess,
+                                     const uint16_t* patch_output,
+                                     const uint32_t  num_elements,
+                                     const uint32_t  offset_size,
+                                     const uint32_t  patch_id)
+{
+    using namespace rxmesh;
+    uint16_t      local_id = threadIdx.x;
+    const HandleT truth(patch_id, {local_id});
+
+    if (local_id < num_elements) {
+
+        Iterator<HandleT> iter(
+            local_id,
+            reinterpret_cast<const typename HandleT::LocalT*>(patch_output),
+            nullptr,
+            offset_size,
+            patch_id,
+            num_elements,
+            nullptr,
+            nullptr);
+
+        if (iter.size() != offset_size) {
+            atomicAdd(suceess, 1u);
+            return;
+        }
+        if (iter.front() != truth) {
+            atomicAdd(suceess, 1u);
+            return;
+        }
 
-    for (uint32_t i = 0; i < iter.size(); ++i) {
-        if (*iter != truth) {
+        if (iter.back() != truth) {
             atomicAdd(suceess, 1u);
             return;
         }
-        ++iter;
+
+        for (uint32_t i = 0; i < iter.size(); ++i) {
+            if (iter[i] != truth) {
+                atomicAdd(suceess, 1u);
+                return;
+            }
+        }
+
+        for (uint32_t i = 0; i < iter.size(); ++i) {
+            if (*iter != truth) {
+                atomicAdd(suceess, 1u);
+                return;
+            }
+            ++iter;
+        }
     }
 }
 
 TEST(RXMesh, Iterator)
 {
-    // patch_output:
-    // 0 0 0 | 1 1 1 |  2  2 2 | ......
-
-    // ltog_map:
-    // n-1 n-2 n-3 ..... 3 2 1 0
-
-    // and so the patch_output in global index space should be
-    // n-1 n-1 n-1 | n-2 n-2 n-2 | ...... | 1 1 1 | 0 0 0
+    // The patch contains 32 elements and the patch_id is 1,
+    // and patch_output is:
+    // 0 0 0 | 1 1 1 | 2 2 2 | ......
+    // i.e., offset_size = 3
 
+    using namespace rxmesh;
+    constexpr uint32_t offset_size  = 3;
+    const uint32_t     num_elements = 32;
+    const uint32_t     patch_id     = 1;
 
-    using namespace RXMESH;
-    constexpr uint32_t fixedOffset = 3;
-    const uint32_t     N = 32;
-
-    std::vector<uint16_t> h_patch_output(fixedOffset * N);
+    std::vector<uint16_t> h_patch_output(offset_size * num_elements);
     for (uint32_t i = 0; i < h_patch_output.size(); ++i) {
-        h_patch_output[i] = i / fixedOffset;
-    }
-
-    std::vector<uint32_t> h_ltog_map(N);
-    for (uint32_t i = 0; i < h_ltog_map.size(); ++i) {
-        h_ltog_map[i] = N - i - 1;
+        h_patch_output[i] = i / offset_size;
     }
 
 
-    uint32_t *d_ltog_map(nullptr), *d_suceess(nullptr);
+    uint32_t* d_suceess(nullptr);
     uint16_t* d_patch_output(nullptr);
 
-    CUDA_ERROR(
-        cudaMalloc((void**)&d_ltog_map, h_ltog_map.size() * sizeof(uint32_t)));
     CUDA_ERROR(cudaMalloc((void**)&d_patch_output,
                           h_patch_output.size() * sizeof(uint32_t)));
-    CUDA_ERROR(cudaMemcpy(d_ltog_map, h_ltog_map.data(),
-                          h_ltog_map.size() * sizeof(uint32_t),
-                          cudaMemcpyHostToDevice));
-    CUDA_ERROR(cudaMemcpy(d_patch_output, h_patch_output.data(),
+
+    CUDA_ERROR(cudaMemcpy(d_patch_output,
+                          h_patch_output.data(),
                           h_patch_output.size() * sizeof(uint16_t),
                           cudaMemcpyHostToDevice));
     CUDA_ERROR(cudaMalloc((void**)&d_suceess, sizeof(uint32_t)));
     CUDA_ERROR(cudaMemset(d_suceess, 0, sizeof(uint32_t)));
 
 
-    test_iterator<3u><<<1, N>>>(d_suceess, d_ltog_map, d_patch_output, N);
+    test_iterator<VertexHandle><<<1, num_elements>>>(
+        d_suceess, d_patch_output, num_elements, offset_size, patch_id);
     CUDA_ERROR(cudaDeviceSynchronize());
 
     uint32_t h_success = 0;
-    CUDA_ERROR(cudaMemcpy(&h_success, d_suceess, sizeof(uint32_t),
-                          cudaMemcpyDeviceToHost));
+    CUDA_ERROR(cudaMemcpy(
+        &h_success, d_suceess, sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
     EXPECT_EQ(h_success, 0);
 
     CUDA_ERROR(cudaFree(d_patch_output));
     CUDA_ERROR(cudaFree(d_suceess));
-    CUDA_ERROR(cudaFree(d_ltog_map));
     CUDA_ERROR(cudaDeviceSynchronize());
     CUDA_ERROR(cudaDeviceReset());
 }
\ No newline at end of file
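The iterator test above builds a fixed-offset adjacency: entry i of patch_output is i / offset_size, so the offset_size entries that belong to local element e are all equal to e (the "0 0 0 | 1 1 1 | 2 2 2" pattern in the comment). A minimal host-side sketch of that layout and the property the kernel checks:

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const uint32_t offset_size  = 3;
    const uint32_t num_elements = 32;

    // Same construction as the test: 0 0 0 | 1 1 1 | 2 2 2 | ...
    std::vector<uint16_t> patch_output(offset_size * num_elements);
    for (uint32_t i = 0; i < patch_output.size(); ++i) {
        patch_output[i] = static_cast<uint16_t>(i / offset_size);
    }

    // Every entry owned by local element e should be e itself.
    bool ok = true;
    for (uint32_t e = 0; e < num_elements; ++e) {
        for (uint32_t j = 0; j < offset_size; ++j) {
            ok &= (patch_output[e * offset_size + j] == e);
        }
    }
    std::printf("layout %s\n", ok ? "ok" : "broken");
    return 0;
}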
diff --git a/tests/RXMesh_test/test_queries.h b/tests/RXMesh_test/test_queries.h
index b37c344b..f2aca2cd 100644
--- a/tests/RXMesh_test/test_queries.h
+++ b/tests/RXMesh_test/test_queries.h
@@ -1,224 +1,216 @@
+#include <functional>
+#include <numeric>
 #include <vector>
+
 #include "gtest/gtest.h"
-#include "query.cuh"
-#include "rxmesh/rxmesh_attribute.h"
+
 #include "rxmesh/rxmesh_static.h"
-#include "rxmesh/rxmesh_util.h"
 #include "rxmesh/util/import_obj.h"
-#include "rxmesh/util/math.h"
 #include "rxmesh/util/report.h"
 #include "rxmesh_test.h"
-using namespace RXMESH;
-
-/**
- * launcher()
- */
-template <uint32_t blockThreads>
-float launcher(const RXMeshContext&       context,
-               const Op                   op,
-               RXMeshAttribute<uint32_t>& input_container,
-               RXMeshAttribute<uint32_t>& output_container,
-               LaunchBox<blockThreads>&   launch_box,
-               const bool                 oriented = false)
-{
-    CUDA_ERROR(cudaProfilerStart());
-    GPUTimer timer;
-    timer.start();
-
-    switch (op) {
-        case Op::VV:
-            query<Op::VV, blockThreads>
-                <<<launch_box.blocks, blockThreads,
-                   launch_box.smem_bytes_dyn>>>(context, input_container,
-                                                output_container, oriented);
-            break;
-        case Op::VE:
-            query<Op::VE, blockThreads>
-                <<<launch_box.blocks, blockThreads,
-                   launch_box.smem_bytes_dyn>>>(context, input_container,
-                                                output_container);
-            break;
-        case Op::VF:
-            query<Op::VF, blockThreads>
-                <<<launch_box.blocks, blockThreads,
-                   launch_box.smem_bytes_dyn>>>(context, input_container,
-                                                output_container);
-            break;
-        case Op::EV:
-            query<Op::EV, blockThreads>
-                <<<launch_box.blocks, blockThreads,
-                   launch_box.smem_bytes_dyn>>>(context, input_container,
-                                                output_container);
-            break;
-        case Op::EE:
-            RXMESH_ERROR(
-                "RXMeshStatic::launcher_no_src() Op::EE is not "
-                "supported!!");
-            break;
-        case Op::EF:
-            query<Op::EF, blockThreads>
-                <<<launch_box.blocks, blockThreads,
-                   launch_box.smem_bytes_dyn>>>(context, input_container,
-                                                output_container);
-            break;
-        case Op::FV:
-            query<Op::FV, blockThreads>
-                <<<launch_box.blocks, blockThreads,
-                   launch_box.smem_bytes_dyn>>>(context, input_container,
-                                                output_container);
-            break;
-        case Op::FE:
-            query<Op::FE, blockThreads>
-                <<<launch_box.blocks, blockThreads,
-                   launch_box.smem_bytes_dyn>>>(context, input_container,
-                                                output_container);
-            break;
-        case Op::FF:
-            query<Op::FF, blockThreads>
-                <<<launch_box.blocks, blockThreads,
-                   launch_box.smem_bytes_dyn>>>(context, input_container,
-                                                output_container);
-            break;
-    }
 
-    timer.stop();
-    CUDA_ERROR(cudaDeviceSynchronize());
-    CUDA_ERROR(cudaGetLastError());
-    CUDA_ERROR(cudaProfilerStop());
-    return timer.elapsed_millis();
-}
-
-/**
- * calc_fixed_offset()
- */
-template <uint32_t patchSize>
-inline uint32_t max_output_per_element(const RXMeshStatic<patchSize>& rxmesh,
-                                       const Op&                      op)
-{
-    if (op == Op::EV) {
-        return 2;
-    } else if (op == Op::EF) {
-        return rxmesh.get_max_edge_incident_faces();
-    } else if (op == Op::FV || op == Op::FE) {
-        return rxmesh.get_face_degree();
-    } else if (op == Op::FF) {
-        return rxmesh.get_max_edge_adjacent_faces();
-    } else if (op == Op::VV || op == Op::VE || op == Op::VF) {
-        return rxmesh.get_max_valence();
-    } else {
-        RXMESH_ERROR("calc_fixed_offset() Invalid op " + op_to_string(op));
-        return -1u;
-    }    
-}
+#include "query.cuh"
 
-TEST(RXMesh, Oriented_VV)
+TEST(RXMeshStatic, Oriented_VV)
 {
+    using namespace rxmesh;
 
     // Select device
     cuda_query(rxmesh_args.device_id, rxmesh_args.quite);
 
+    std::vector<std::vector<dataT>>    Verts;
     std::vector<std::vector<uint32_t>> Faces;
 
-    ASSERT_TRUE(import_obj(STRINGIFY(INPUT_DIR) "cube.obj", Verts, Faces, true));
+    ASSERT_TRUE(
+        import_obj(STRINGIFY(INPUT_DIR) "cube.obj", Verts, Faces, true));
 
-    // Instantiate RXMesh Static
-    RXMeshStatic<PATCH_SIZE> rxmesh_static(Faces, Verts, false, rxmesh_args.quite);
+    // RXMesh
+    RXMeshStatic rxmesh(Faces, rxmesh_args.quite);
 
-    EXPECT_TRUE(rxmesh_static.is_closed())
+    EXPECT_TRUE(rxmesh.is_closed())
         << " Can't generate oriented VV for input with boundaries";
 
+    auto coordinates = rxmesh.add_vertex_attribute<dataT>(Verts, "coordinates");
+
     // input/output container
-    RXMeshAttribute<uint32_t> input_container;
-    input_container.init(rxmesh_static.get_num_vertices(), 1u, RXMESH::DEVICE,
-                         RXMESH::AoS, false, false);
+    auto input  = rxmesh.add_vertex_attribute<VertexHandle>("input", 1);
+    auto output = rxmesh.add_vertex_attribute<VertexHandle>(
+        "output", rxmesh.get_max_valence());
 
-    RXMeshAttribute<uint32_t> output_container;
-    output_container.init(rxmesh_static.get_num_vertices(),
-                          max_output_per_element(rxmesh_static, Op::VV) + 1,
-                          RXMESH::DEVICE, RXMESH::SoA, false, false);
+    input->reset(VertexHandle(), rxmesh::DEVICE);
+    output->reset(VertexHandle(), rxmesh::DEVICE);
 
     // launch box
-    LaunchBox<256> launch_box;
-    rxmesh_static.prepare_launch_box(Op::VV, launch_box, false, true);
-
-    // launch query
-    float tt = launcher(rxmesh_static.get_context(), Op::VV, input_container,
-                        output_container, launch_box, true);
+    constexpr uint32_t      blockThreads = 256;
+    LaunchBox<blockThreads> launch_box;
+    rxmesh.prepare_launch_box(
+        Op::VV,
+        launch_box,
+        (void*)query_kernel<blockThreads,
+                            Op::VV,
+                            VertexHandle,
+                            VertexHandle,
+                            VertexAttribute<VertexHandle>,
+                            VertexAttribute<VertexHandle>>,
+        true);
+
+    // query
+    query_kernel<blockThreads, Op::VV, VertexHandle, VertexHandle>
+        <<<launch_box.blocks, blockThreads, launch_box.smem_bytes_dyn>>>(
+            rxmesh.get_context(), *input, *output, true);
 
+    CUDA_ERROR(cudaDeviceSynchronize());
 
     // move containers to the CPU for testing
-    output_container.move(RXMESH::DEVICE, RXMESH::HOST);
-    input_container.move(RXMESH::DEVICE, RXMESH::HOST);
+    output->move(rxmesh::DEVICE, rxmesh::HOST);
+    input->move(rxmesh::DEVICE, rxmesh::HOST);
+
+    RXMeshTest tester(rxmesh, Faces, rxmesh_args.quite);
+    EXPECT_TRUE(tester.run_test(rxmesh, Faces, *input, *output));
 
-    RXMeshTest tester(true);
-    EXPECT_TRUE(tester.run_query_verifier(rxmesh_static, Op::VV,
-                                          input_container, output_container));
 
     // Make sure orientation is accurate
     // for the cube, all angle are either 45 or 90
 
-    for (uint32_t v = 0; v < rxmesh_static.get_num_vertices(); ++v) {
+    auto vector_length = [](const dataT x, const dataT y, const dataT z) {
+        return std::sqrt(x * x + y * y + z * z);
+    };
 
-        uint32_t vertex = input_container(v);
+    auto dot = [](const std::vector<dataT>& u, const std::vector<dataT>& v) {
+        return std::inner_product(
+            std::begin(u), std::end(u), std::begin(v), 0.0);
+    };
 
-        uint32_t v_0 = output_container(v, output_container(v, 0));
-        for (uint32_t i = 1; i < output_container(v, 0); ++i) {
+    rxmesh.for_each_vertex(HOST, [&](const VertexHandle& vertex) {
+        for (uint32_t i = 0; i < (*output).get_num_attributes(); ++i) {
 
-            uint32_t v_1 = output_container(v, i);
+            uint32_t j = (i + 1) % output->get_num_attributes();
 
-            std::vector<dataT> p1{Verts[vertex][0] - Verts[v_0][0],
-                                  Verts[vertex][1] - Verts[v_0][1],
-                                  Verts[vertex][2] - Verts[v_0][2]};
+            auto v_0 = (*output)(vertex, i);
+            auto v_1 = (*output)(vertex, j);
 
-            std::vector<dataT> p2{Verts[vertex][0] - Verts[v_1][0],
-                                  Verts[vertex][1] - Verts[v_1][1],
-                                  Verts[vertex][2] - Verts[v_1][2]};
-            dataT              dot_pro = dot(p1, p2);
-            dataT              theta =
-                std::acos(dot_pro / (vector_length(p1[0], p1[1], p1[2]) *
-                                     vector_length(p2[0], p2[1], p2[2])));
-            theta = (theta * 180) / 3.14159265;
-            EXPECT_TRUE(std::abs(theta - 90) < 0.0001 ||
-                        std::abs(theta - 45) < 0.0001);
-            v_0 = v_1;
-        }
-    }
+            if (v_1.is_valid() && v_0.is_valid()) {
 
+                std::vector<dataT> p1{
+                    (*coordinates)(vertex, 0) - (*coordinates)(v_0, 0),
+                    (*coordinates)(vertex, 1) - (*coordinates)(v_0, 1),
+                    (*coordinates)(vertex, 2) - (*coordinates)(v_0, 2)};
 
-    input_container.release();
-    output_container.release();
-}
+                std::vector<dataT> p2{
+                    (*coordinates)(vertex, 0) - (*coordinates)(v_1, 0),
+                    (*coordinates)(vertex, 1) - (*coordinates)(v_1, 1),
+                    (*coordinates)(vertex, 2) - (*coordinates)(v_1, 2)};
 
+                dataT dot_pro = dot(p1, p2);
+                dataT theta =
+                    std::acos(dot_pro / (vector_length(p1[0], p1[1], p1[2]) *
+                                         vector_length(p2[0], p2[1], p2[2])));
+                theta = (theta * 180) / 3.14159265;
+                EXPECT_TRUE(std::abs(theta - 90) < 0.0001 ||
+                            std::abs(theta - 45) < 0.0001);
+            }
+        }
+    });
+}
 
-TEST(RXMesh, Queries)
+template <rxmesh::Op op,
+          typename InputHandleT,
+          typename OutputHandleT,
+          typename InputAttributeT,
+          typename OutputAttributeT>
+void launcher(const std::vector<std::vector<uint32_t>>& Faces,
+              rxmesh::RXMeshStatic&                     rxmesh,
+              InputAttributeT&                          input,
+              OutputAttributeT&                         output,
+              RXMeshTest&                               tester,
+              rxmesh::Report&                           report,
+              bool                                      oriented)
 {
-    if (rxmesh_args.shuffle) {
-        ASSERT_FALSE(rxmesh_args.sort)
-            << " cannot shuffle and sort at the same time!";
+    using namespace rxmesh;
+
+    // launch box
+    constexpr uint32_t      blockThreads = 256;
+    LaunchBox<blockThreads> launch_box;
+    rxmesh.prepare_launch_box(op,
+                              launch_box,
+                              (void*)query_kernel<blockThreads,
+                                                  op,
+                                                  InputHandleT,
+                                                  OutputHandleT,
+                                                  InputAttributeT,
+                                                  OutputAttributeT>,
+                              oriented);
+
+    // test data
+    TestData td;
+    td.test_name   = op_to_string(op);
+    td.num_threads = launch_box.num_threads;
+    td.num_blocks  = launch_box.blocks;
+    td.dyn_smem    = launch_box.smem_bytes_dyn;
+    td.static_smem = launch_box.smem_bytes_static;
+    td.num_reg     = launch_box.num_registers_per_thread;
+
+    float total_time = 0;
+
+
+    for (uint32_t itr = 0; itr < rxmesh_args.num_run; itr++) {
+        // Reset input/output
+        input.reset(InputHandleT(), rxmesh::DEVICE);
+        output.reset(OutputHandleT(), rxmesh::DEVICE);
+        CUDA_ERROR(cudaDeviceSynchronize());
+
+        CUDA_ERROR(cudaProfilerStart());
+        GPUTimer timer;
+        timer.start();
+        query_kernel<blockThreads, op, InputHandleT, OutputHandleT>
+            <<<launch_box.blocks, blockThreads, launch_box.smem_bytes_dyn>>>(
+                rxmesh.get_context(), input, output, oriented);
+
+        timer.stop();
+        CUDA_ERROR(cudaDeviceSynchronize());
+        CUDA_ERROR(cudaGetLastError());
+        CUDA_ERROR(cudaProfilerStop());
+
+        total_time += timer.elapsed_millis();
+        td.time_ms.push_back(timer.elapsed_millis());
     }
-    if (rxmesh_args.sort) {
-        ASSERT_FALSE(rxmesh_args.shuffle)
-            << " cannot shuffle and sort at the same time!";
+
+    // move containers to the CPU for testing
+    output.move(rxmesh::DEVICE, rxmesh::HOST);
+    input.move(rxmesh::DEVICE, rxmesh::HOST);
+
+    // verify
+    bool passed = tester.run_test(rxmesh, Faces, input, output);
+
+    td.passed.push_back(passed);
+    EXPECT_TRUE(passed) << "Testing: " << td.test_name;
+
+    report.add_test(td);
+    if (!rxmesh_args.quite) {
+        RXMESH_TRACE(" {} {} time = {} (ms)",
+                     td.test_name.c_str(),
+                     (passed ? " passed " : " failed "),
+                     total_time / float(rxmesh_args.num_run));
     }
+}
 
+TEST(RXMeshStatic, Queries)
+{
+    using namespace rxmesh;
 
     bool oriented = false;
 
     // Select device
     cuda_query(rxmesh_args.device_id, rxmesh_args.quite);
 
+    std::vector<std::vector<dataT>>    Verts;
     std::vector<std::vector<uint32_t>> Faces;
 
-    ASSERT_TRUE(import_obj(rxmesh_args.obj_file_name, Verts, Faces,
-                    rxmesh_args.quite));
-
-    if (rxmesh_args.shuffle) {
-        shuffle_obj(Faces, Verts);
-    }
+    ASSERT_TRUE(
+        import_obj(rxmesh_args.obj_file_name, Verts, Faces, rxmesh_args.quite));
 
     // RXMesh
-    RXMeshStatic<PATCH_SIZE> rxmesh_static(Faces, Verts, rxmesh_args.sort,
-                               rxmesh_args.quite);
+    RXMeshStatic rxmesh(Faces, rxmesh_args.quite);
 
 
     // Report
@@ -227,108 +219,104 @@ TEST(RXMesh, Queries)
     report.command_line(rxmesh_args.argc, rxmesh_args.argv);
     report.device();
     report.system();
-    report.model_data(rxmesh_args.obj_file_name, rxmesh_static);
+    report.model_data(rxmesh_args.obj_file_name, rxmesh);
     report.add_member("method", std::string("RXMesh"));
 
-    std::string order = "default";
-    if (rxmesh_args.shuffle) {
-        order = "shuffle";
-    } else if (rxmesh_args.sort) {
-        order = "sorted";
-    }
-    report.add_member("input_order", order);
 
     // Tester to verify all queries
-    ::RXMeshTest tester(true);
-    EXPECT_TRUE(tester.run_ltog_mapping_test(rxmesh_static))
+    ::RXMeshTest tester(rxmesh, Faces, rxmesh_args.quite);
+    EXPECT_TRUE(tester.run_ltog_mapping_test(rxmesh, Faces))
         << "Local-to-global mapping test failed";
 
-    // adding query that we want to test
-    std::vector<Op> ops = {Op::VV, Op::VE, Op::VF,  //
-                           Op::FV, Op::FE, Op::FF,  //
-                           Op::EV, Op::EF};
-
-
-    for (auto& ops_it : ops) {
-
-        // Input and output element type
-        ELEMENT source_ele(ELEMENT::VERTEX), output_ele(ELEMENT::VERTEX);
-        io_elements(ops_it, source_ele, output_ele);
-
-        // Input size
-        uint32_t input_size =
-            (source_ele == ELEMENT::VERTEX) ?
-                rxmesh_static.get_num_vertices() :
-                ((source_ele == ELEMENT::EDGE) ? rxmesh_static.get_num_edges() :
-                                                 rxmesh_static.get_num_faces());
-
-        // input/output container
-        RXMeshAttribute<uint32_t> input_container;
-        input_container.init(input_size, 1u, RXMESH::DEVICE, RXMESH::AoS, false,
-                             false);
-
-        // allocate output container
-        // for each mesh element, we reserve the maximum possible output based
-        // on the operation (ops_it). The +1 is used to store the size of the
-        // output for operations that output variable outputs per elements
-        // (e.g., VV)
-        RXMeshAttribute<uint32_t> output_container;
-        output_container.init(input_size,
-                              max_output_per_element(rxmesh_static, ops_it) + 1,
-                              RXMESH::DEVICE, RXMESH::SoA, false, false);
-
-        // launch box
-        LaunchBox<256> launch_box;
-        rxmesh_static.prepare_launch_box(ops_it, launch_box, false, oriented);
-
-        // test data
-        TestData td;
-        td.test_name = op_to_string(ops_it);
-        td.num_threads = launch_box.num_threads;
-        td.num_blocks = launch_box.blocks;
-        td.dyn_smem = launch_box.smem_bytes_dyn;
-        td.static_smem = launch_box.smem_bytes_static;
-
-
-        float total_time = 0;
-        for (uint32_t itr = 0; itr < rxmesh_args.num_run; itr++) {
-
-            output_container.reset(INVALID32, RXMESH::DEVICE);
-            input_container.reset(INVALID32, RXMESH::DEVICE);
-
-            // launch query
-            float tt =
-                launcher(rxmesh_static.get_context(), ops_it, input_container,
-                         output_container, launch_box, oriented);
-            total_time += tt;
-            td.time_ms.push_back(tt);
-        }
+    {
+        // VV
+        auto input  = rxmesh.add_vertex_attribute<VertexHandle>("input", 1);
+        auto output = rxmesh.add_vertex_attribute<VertexHandle>(
+            "output", rxmesh.get_max_valence());
+        launcher<Op::VV, VertexHandle, VertexHandle>(
+            Faces, rxmesh, *input, *output, tester, report, oriented);
+        rxmesh.remove_attribute("input");
+        rxmesh.remove_attribute("output");
+    }
 
-        // move containers to the CPU for testing
-        output_container.move(RXMESH::DEVICE, RXMESH::HOST);
-        input_container.move(RXMESH::DEVICE, RXMESH::HOST);
 
+    {
+        // VE
+        auto input  = rxmesh.add_vertex_attribute<VertexHandle>("input", 1);
+        auto output = rxmesh.add_vertex_attribute<EdgeHandle>(
+            "output", rxmesh.get_max_valence());
+        launcher<Op::VE, VertexHandle, EdgeHandle>(
+            Faces, rxmesh, *input, *output, tester, report, oriented);
+        rxmesh.remove_attribute("input");
+        rxmesh.remove_attribute("output");
+    }
 
-        // verify
-        bool passed = tester.run_query_verifier(
-            rxmesh_static, ops_it, input_container, output_container);
+    {
+        // VF
+        auto input  = rxmesh.add_vertex_attribute<VertexHandle>("input", 1);
+        auto output = rxmesh.add_vertex_attribute<FaceHandle>(
+            "output", rxmesh.get_max_valence());
+        launcher<Op::VF, VertexHandle, FaceHandle>(
+            Faces, rxmesh, *input, *output, tester, report, oriented);
+        rxmesh.remove_attribute("input");
+        rxmesh.remove_attribute("output");
+    }
 
-        td.passed.push_back(passed);
-        EXPECT_TRUE(passed) << "Testing: " << td.test_name;
 
-        report.add_test(td);
-        if (!rxmesh_args.quite) {
-            RXMESH_TRACE(" {} {} time = {} (ms)", td.test_name.c_str(),
-                         (passed ? " passed " : " failed "),
-                         total_time / float(rxmesh_args.num_run));
-        }
+    {
+        // EV
+        auto input  = rxmesh.add_edge_attribute<EdgeHandle>("input", 1);
+        auto output = rxmesh.add_edge_attribute<VertexHandle>("output", 2);
+        launcher<Op::EV, EdgeHandle, VertexHandle>(
+            Faces, rxmesh, *input, *output, tester, report, oriented);
+        rxmesh.remove_attribute("input");
+        rxmesh.remove_attribute("output");
+    }
 
-        input_container.release();
-        output_container.release();
+    {
+        // EF
+        auto input  = rxmesh.add_edge_attribute<EdgeHandle>("input", 1);
+        auto output = rxmesh.add_edge_attribute<FaceHandle>(
+            "output", rxmesh.get_max_edge_incident_faces());
+        launcher<Op::EF, EdgeHandle, FaceHandle>(
+            Faces, rxmesh, *input, *output, tester, report, oriented);
+        rxmesh.remove_attribute("input");
+        rxmesh.remove_attribute("output");
     }
 
+    {
+        // FV
+        auto input  = rxmesh.add_face_attribute<FaceHandle>("input", 1);
+        auto output = rxmesh.add_face_attribute<VertexHandle>("output", 3);
+        launcher<Op::FV, FaceHandle, VertexHandle>(
+            Faces, rxmesh, *input, *output, tester, report, oriented);
+        rxmesh.remove_attribute("input");
+        rxmesh.remove_attribute("output");
+    }
+
+    {
+        // FE
+        auto input  = rxmesh.add_face_attribute<FaceHandle>("input", 1);
+        auto output = rxmesh.add_face_attribute<EdgeHandle>("output", 3);
+        launcher<Op::FE, FaceHandle, EdgeHandle>(
+            Faces, rxmesh, *input, *output, tester, report, oriented);
+        rxmesh.remove_attribute("input");
+        rxmesh.remove_attribute("output");
+    }
+
+    {
+        // FF
+        auto input  = rxmesh.add_face_attribute<FaceHandle>("input", 1);
+        auto output = rxmesh.add_face_attribute<FaceHandle>(
+            "output", rxmesh.get_max_face_adjacent_faces() + 2);
+        launcher<Op::FF, FaceHandle, FaceHandle>(
+            Faces, rxmesh, *input, *output, tester, report, oriented);
+        rxmesh.remove_attribute("input");
+        rxmesh.remove_attribute("output");
+    }
 
+    // Write the report
     report.write(
-        rxmesh_args.output_folder + "/rxmesh/" + order,
+        rxmesh_args.output_folder + "/rxmesh",
         "QueryTest_RXMesh_" + extract_file_name(rxmesh_args.obj_file_name));
 }
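The orientation check at the end of Oriented_VV (earlier in this file) relies on the fact that, for a cube, the angle between two consecutive edges around a vertex is either 45 or 90 degrees; the angle is acos of the normalized dot product, converted to degrees. A minimal host-side sketch of just that angle computation (plain C++, with two example edge vectors, not data read from cube.obj):

#include <cmath>
#include <cstdio>
#include <vector>

// Angle in degrees between two 3D vectors, as computed by the test above.
static double angle_deg(const std::vector<double>& p1, const std::vector<double>& p2)
{
    auto len = [](const std::vector<double>& v) {
        return std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
    };
    const double dot = p1[0] * p2[0] + p1[1] * p2[1] + p1[2] * p2[2];
    return std::acos(dot / (len(p1) * len(p2))) * 180.0 / 3.14159265;
}

int main()
{
    // Example edge vectors leaving a cube corner: an axis-aligned edge and a
    // face diagonal give 45 degrees; two axis-aligned edges give 90 degrees.
    std::printf("%g\n", angle_deg({1, 0, 0}, {1, 1, 0}));  // ~45
    std::printf("%g\n", angle_deg({1, 0, 0}, {0, 1, 0}));  // 90
    return 0;
}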
diff --git a/tests/RXMesh_test/test_util.cu b/tests/RXMesh_test/test_util.cu
index 036f0453..b3a143c1 100644
--- a/tests/RXMesh_test/test_util.cu
+++ b/tests/RXMesh_test/test_util.cu
@@ -5,7 +5,6 @@
 #include "rxmesh/util/macros.h"
 #include "rxmesh/util/util.h"
 
-//********************** Mat transpose kernel
 template <uint32_t rowOffset, uint32_t blockThreads, uint32_t itemPerThread>
 __global__ static void k_test_block_mat_transpose(uint16_t*      d_src,
                                                   const uint32_t num_rows,
@@ -13,25 +12,20 @@ __global__ static void k_test_block_mat_transpose(uint16_t*      d_src,
                                                   uint16_t*      d_output)
 {
 
-    RXMESH::block_mat_transpose<rowOffset, blockThreads, itemPerThread>(
+    rxmesh::block_mat_transpose<rowOffset, blockThreads, itemPerThread>(
         num_rows, num_cols, d_src, d_output);
 }
-//**************************************************************************
 
-//********************** block scan inplace kernel
 template <typename T, uint32_t blockThreads>
 __global__ static void k_test_block_exclusive_sum(T* d_src, const uint32_t size)
 {
-    RXMESH::cub_block_exclusive_sum<T, blockThreads>(d_src, size);
+    rxmesh::cub_block_exclusive_sum<T, blockThreads>(d_src, size);
 }
-//**************************************************************************
 
-
-//********************** atomicAdd kernel
 template <typename T>
 __global__ static void k_test_atomicAdd(T* d_val)
 {
-    RXMESH::atomicAdd(d_val, 1);
+    rxmesh::atomicAdd(d_val, 1);
     /*__half* as_half = (__half*)(d_val);
     ::atomicAdd(as_half,1);
     __syncthreads();
@@ -40,238 +34,198 @@ __global__ static void k_test_atomicAdd(T* d_val)
         d_val[0] = val;
     }*/
 }
-//**************************************************************************
 
-class TestUtil
+TEST(Util, Scan)
 {
+    using namespace rxmesh;
 
-   public:
-    TestUtil(){};
+    constexpr uint32_t    blockThreads = 128;
+    uint32_t              size         = 8144;
+    std::vector<uint32_t> h_src(size, 1);
+    uint32_t*             d_src = nullptr;
+    CUDA_ERROR(cudaMalloc((void**)&d_src, size * sizeof(uint32_t)));
+    CUDA_ERROR(cudaMemcpy(
+        d_src, h_src.data(), size * sizeof(uint32_t), cudaMemcpyHostToDevice));
 
-    void test_all()
-    {
-        test_scan();
+    k_test_block_exclusive_sum<uint32_t, blockThreads>
+        <<<1, blockThreads>>>(d_src, size);
 
-        test_block_mat_transpose<542, 847, 3>();
+    CUDA_ERROR(cudaDeviceSynchronize());
+    CUDA_ERROR(cudaGetLastError());
 
-        test_atomicAdd<uint16_t>();
+    CUDA_ERROR(cudaMemcpy(
+        h_src.data(), d_src, size * sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
-        test_atomicAdd<uint8_t>();
+    for (uint32_t i = 0; i < h_src.size(); ++i) {
+        EXPECT_EQ(h_src[i], i);
     }
 
+    GPU_FREE(d_src);
+}
 
-    bool test_scan()
-    {
-        using namespace RXMESH;
+template <typename T>
+bool test_atomicAdd(const uint32_t threads = 1024)
+{
+    using namespace rxmesh;
 
-        constexpr uint32_t    blockThreads = 128;
-        uint32_t              size = 8144;
-        std::vector<uint32_t> h_src(size, 1);
-        uint32_t*             d_src = nullptr;
-        CUDA_ERROR(cudaMalloc((void**)&d_src, size * sizeof(uint32_t)));
-        CUDA_ERROR(cudaMemcpy(d_src, h_src.data(), size * sizeof(uint32_t),
-                              cudaMemcpyHostToDevice));
+    T  h_val = 0;
+    T* d_val;
 
-        k_test_block_exclusive_sum<uint32_t, blockThreads>
-            <<<1, blockThreads>>>(d_src, size);
+    CUDA_ERROR(cudaMalloc((void**)&d_val, sizeof(T)));
+    CUDA_ERROR(cudaMemcpy(d_val, &h_val, sizeof(T), cudaMemcpyHostToDevice));
 
-        CUDA_ERROR(cudaDeviceSynchronize());
-        CUDA_ERROR(cudaGetLastError());
 
-        CUDA_ERROR(cudaMemcpy(h_src.data(), d_src, size * sizeof(uint32_t),
-                              cudaMemcpyDeviceToHost));
-        bool passed = true;
-        for (uint32_t i = 0; i < h_src.size(); ++i) {
-            if (h_src[i] != i) {
-                passed = false;
-                break;
-            }
-        }
+    k_test_atomicAdd<T><<<1, threads>>>(d_val);
 
-        GPU_FREE(d_src);
+    CUDA_ERROR(cudaDeviceSynchronize());
+    CUDA_ERROR(cudaGetLastError());
 
-        return passed;
-    }
-    //**************************************************************************
-
-
-    //********************** Test matrix transpose
-    template <uint32_t numRows, uint32_t numCols, uint32_t rowOffset>
-    bool test_block_mat_transpose()
-    {
-        using namespace RXMESH;
-        // The matrix is numRows X numCols where every rows has rowOffset
-        // non-zero elements. The matrix passed to the kernel contains the
-        // column ids only and we also pass the rowOffset. The transposed matrix
-        // is stored in the source as row ids and the offset is stored in the
-        // h_res_offset.
-
-        const uint32_t        arr_size = numRows * rowOffset;
-        std::vector<uint16_t> h_src(arr_size);
-        std::vector<uint16_t> row(numCols);
-        fill_with_sequential_numbers(row.data(),
-                                     static_cast<uint32_t>(row.size()));
-        random_shuffle(row.data(), static_cast<uint32_t>(row.size()));
+    CUDA_ERROR(cudaMemcpy(&h_val, d_val, sizeof(T), cudaMemcpyDeviceToHost));
 
-        for (uint32_t s = 0; s < h_src.size(); s += rowOffset) {
-            // prevent duplication in the same row
-            for (uint32_t i = 0; i < rowOffset; ++i) {
-                h_src[s + i] = row[i];
-            }
-            random_shuffle(row.data(), static_cast<uint32_t>(row.size()));
-        }
 
+    // check
+    bool passed = true;
+    if (h_val != static_cast<T>(threads)) {
+        passed = false;
+    }
+    GPU_FREE(d_val);
 
-        // const uint32_t threads = numRows*rowOffset;
-        // We try to divide the number of non-zero elements equally between
-        // threads. However, it may not aligned perfectly. So we need to pad
-        // h_src with INVALID32 since this will be part of the sorting in
-        // the transpose kernel. Also, d_offset should be large enough to
-        // align with the padding.
-
-        const uint32_t threads = 256;
-        const uint32_t item_per_thread =
-            DIVIDE_UP(numRows * rowOffset, threads);
-        const uint32_t blocks = 1;
+    return passed;
+}
 
+TEST(Util, AtomicAdd)
+{
+    EXPECT_TRUE(test_atomicAdd<uint16_t>()) << "uint16_t failed";
+    EXPECT_TRUE(test_atomicAdd<uint8_t>()) << "uint8_t failed";
+}
 
-        if (item_per_thread * threads > numRows * rowOffset) {
-            for (uint32_t i = numRows * rowOffset;
-                 i < item_per_thread * threads; ++i) {
-                h_src.push_back(INVALID16);
-            }
+TEST(Util, BlockMatrixTranspose)
+{
+    constexpr uint32_t numRows   = 542;
+    constexpr uint32_t numCols   = 847;
+    constexpr uint32_t rowOffset = 3;
+
+    using namespace rxmesh;
+    // The matrix is numRows X numCols where every row has rowOffset
+    // non-zero elements. The matrix passed to the kernel contains the
+    // column ids only and we also pass the rowOffset. The transposed matrix
+    // is stored in the source as row ids and the offset is stored in the
+    // h_res_offset.
+
+    const uint32_t        arr_size = numRows * rowOffset;
+    std::vector<uint16_t> h_src(arr_size);
+    std::vector<uint16_t> row(numCols);
+    fill_with_sequential_numbers(row.data(), static_cast<uint32_t>(row.size()));
+    random_shuffle(row.data(), static_cast<uint32_t>(row.size()));
+
+    for (uint32_t s = 0; s < h_src.size(); s += rowOffset) {
+        // prevent duplication in the same row
+        for (uint32_t i = 0; i < rowOffset; ++i) {
+            h_src[s + i] = row[i];
         }
+        random_shuffle(row.data(), static_cast<uint32_t>(row.size()));
+    }
 
-        uint16_t *d_src, *d_offset;
-        CUDA_ERROR(cudaMalloc((void**)&d_src, h_src.size() * sizeof(uint16_t)));
-        CUDA_ERROR(
-            cudaMalloc((void**)&d_offset, h_src.size() * sizeof(uint16_t)));
-        CUDA_ERROR(cudaMemcpy(d_src, h_src.data(),
-                              h_src.size() * sizeof(uint16_t),
-                              cudaMemcpyHostToDevice));
-
-
-        k_test_block_mat_transpose<rowOffset, threads, item_per_thread>
-            <<<blocks, threads, numRows * rowOffset * sizeof(uint32_t)>>>(
-                d_src, numRows, numCols, d_offset);
-
-        CUDA_ERROR(cudaDeviceSynchronize());
-        CUDA_ERROR(cudaGetLastError());
 
-        std::vector<uint16_t> h_res(arr_size);
-        std::vector<uint16_t> h_res_offset(numCols);
+    // const uint32_t threads = numRows*rowOffset;
+    // We try to divide the number of non-zero elements equally between
+    // threads. However, it may not align perfectly, so we need to pad
+    // h_src with INVALID16 since this will be part of the sorting in
+    // the transpose kernel. Also, d_offset should be large enough to
+    // align with the padding.
 
-        CUDA_ERROR(cudaMemcpy(h_res.data(), d_offset,
-                              arr_size * sizeof(uint16_t),
-                              cudaMemcpyDeviceToHost));
+    const uint32_t threads         = 256;
+    const uint32_t item_per_thread = DIVIDE_UP(numRows * rowOffset, threads);
+    const uint32_t blocks          = 1;
 
-        CUDA_ERROR(cudaMemcpy(h_res_offset.data(), d_src,
-                              numCols * sizeof(uint16_t),
-                              cudaMemcpyDeviceToHost));
 
-        std::vector<uint16_t> gold_res(arr_size);
-        std::vector<uint16_t> gold_res_offset(arr_size);
-        std::fill_n(gold_res_offset.data(), numCols, 0);
-        std::fill_n(gold_res.data(), numRows * rowOffset, INVALID16);
-        // count
-        for (uint32_t i = 0; i < arr_size; ++i) {
-            gold_res_offset[h_src[i]]++;
-        }
-        // offset
-        uint32_t prv = gold_res_offset[0];
-        gold_res_offset[0] = 0;
-        for (uint32_t i = 1; i < numCols; ++i) {
-            uint16_t cur = gold_res_offset[i];
-            gold_res_offset[i] = gold_res_offset[i - 1] + prv;
-            prv = cur;
-        }
-        // fill in
-        for (uint32_t i = 0; i < arr_size; ++i) {
-            uint16_t col = h_src[i];
-            uint32_t row = i / rowOffset;
-            uint16_t start = gold_res_offset[col];
-            uint16_t end = (col == numCols - 1) ? numRows * rowOffset :
-                                                  gold_res_offset[col + 1];
-            for (uint32_t j = start; j < end; ++j) {
-                if (gold_res[j] == INVALID16) {
-                    gold_res[j] = row;
-                    break;
-                }
-            }
+    if (item_per_thread * threads > numRows * rowOffset) {
+        for (uint32_t i = numRows * rowOffset; i < item_per_thread * threads;
+             ++i) {
+            h_src.push_back(INVALID16);
         }
+    }
 
+    uint16_t *d_src, *d_offset;
+    CUDA_ERROR(cudaMalloc((void**)&d_src, h_src.size() * sizeof(uint16_t)));
+    CUDA_ERROR(cudaMalloc((void**)&d_offset, h_src.size() * sizeof(uint16_t)));
+    CUDA_ERROR(cudaMemcpy(d_src,
+                          h_src.data(),
+                          h_src.size() * sizeof(uint16_t),
+                          cudaMemcpyHostToDevice));
 
-        for (uint32_t i = 0; i < numCols; ++i) {
-            uint32_t start = h_res_offset[i];
-            uint32_t end =
-                (i == numCols - 1) ? numRows * rowOffset : h_res_offset[i + 1];
-            std::sort(h_res.data() + start, h_res.data() + end);
-        }
 
+    k_test_block_mat_transpose<rowOffset, threads, item_per_thread>
+        <<<blocks, threads, numRows * rowOffset * sizeof(uint32_t)>>>(
+            d_src, numRows, numCols, d_offset);
 
-        // compare
-        bool passed = true;
-        if (!compare<uint16_t, uint16_t>(h_res.data(), gold_res.data(),
-                                         arr_size, false) ||
-            !compare<uint16_t, uint16_t>(
-                h_res_offset.data(), gold_res_offset.data(), numCols, false)) {
-            passed = false;
+    CUDA_ERROR(cudaDeviceSynchronize());
+    CUDA_ERROR(cudaGetLastError());
+
+    std::vector<uint16_t> h_res(arr_size);
+    std::vector<uint16_t> h_res_offset(numCols);
+
+    CUDA_ERROR(cudaMemcpy(h_res.data(),
+                          d_offset,
+                          arr_size * sizeof(uint16_t),
+                          cudaMemcpyDeviceToHost));
+
+    CUDA_ERROR(cudaMemcpy(h_res_offset.data(),
+                          d_src,
+                          numCols * sizeof(uint16_t),
+                          cudaMemcpyDeviceToHost));
+
+    std::vector<uint16_t> gold_res(arr_size);
+    std::vector<uint16_t> gold_res_offset(arr_size);
+    std::fill_n(gold_res_offset.data(), numCols, 0);
+    std::fill_n(gold_res.data(), numRows * rowOffset, INVALID16);
+    // count
+    for (uint32_t i = 0; i < arr_size; ++i) {
+        gold_res_offset[h_src[i]]++;
+    }
+    // offset
+    uint32_t prv       = gold_res_offset[0];
+    gold_res_offset[0] = 0;
+    for (uint32_t i = 1; i < numCols; ++i) {
+        uint16_t cur       = gold_res_offset[i];
+        gold_res_offset[i] = gold_res_offset[i - 1] + prv;
+        prv                = cur;
+    }
+    // fill in
+    for (uint32_t i = 0; i < arr_size; ++i) {
+        uint16_t col   = h_src[i];
+        uint32_t row   = i / rowOffset;
+        uint16_t start = gold_res_offset[col];
+        uint16_t end   = (col == numCols - 1) ? numRows * rowOffset :
+                                                gold_res_offset[col + 1];
+        for (uint32_t j = start; j < end; ++j) {
+            if (gold_res[j] == INVALID16) {
+                gold_res[j] = row;
+                break;
+            }
         }
-
-        GPU_FREE(d_src);
-        GPU_FREE(d_offset);
-
-        return passed;
     }
 
-    //**************************************************************************
-
-    template <typename T>
-    bool test_atomicAdd(const uint32_t threads = 1024)
-    {
-        using namespace RXMESH;
 
-        T  h_val = 0;
-        T* d_val;
-
-        CUDA_ERROR(cudaMalloc((void**)&d_val, sizeof(T)));
-        CUDA_ERROR(
-            cudaMemcpy(d_val, &h_val, sizeof(T), cudaMemcpyHostToDevice));
-
-
-        k_test_atomicAdd<T><<<1, threads>>>(d_val);
-
-        CUDA_ERROR(cudaDeviceSynchronize());
-        CUDA_ERROR(cudaGetLastError());
-
-        CUDA_ERROR(
-            cudaMemcpy(&h_val, d_val, sizeof(T), cudaMemcpyDeviceToHost));
-
-
-        // check
-        bool passed = true;
-        if (h_val != static_cast<T>(threads)) {
-            passed = false;
-        }
-        GPU_FREE(d_val);
-
-        return passed;
+    for (uint32_t i = 0; i < numCols; ++i) {
+        uint32_t start = h_res_offset[i];
+        uint32_t end =
+            (i == numCols - 1) ? numRows * rowOffset : h_res_offset[i + 1];
+        std::sort(h_res.data() + start, h_res.data() + end);
     }
-    ~TestUtil(){};
-};
 
 
-TEST(RXMesh, Util)
-{
-    using namespace RXMESH;
-    TestUtil tc;
+    // compare
+    bool passed = true;
+    if (!compare<uint16_t, uint16_t>(
+            h_res.data(), gold_res.data(), arr_size, false) ||
+        !compare<uint16_t, uint16_t>(
+            h_res_offset.data(), gold_res_offset.data(), numCols, false)) {
+        passed = false;
+    }
 
-    EXPECT_TRUE(tc.test_scan());
-    bool mat_trans = tc.template test_block_mat_transpose<542, 847, 3>();
-    EXPECT_TRUE(mat_trans);
-    EXPECT_TRUE(tc.test_atomicAdd<uint16_t>());
-    EXPECT_TRUE(tc.test_atomicAdd<uint8_t>());
+    GPU_FREE(d_src);
+    GPU_FREE(d_offset);
 
-    CUDA_ERROR(cudaDeviceSynchronize());
-    CUDA_ERROR(cudaDeviceReset());
+    EXPECT_TRUE(passed);
 }
\ No newline at end of file
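The gold-standard computation in BlockMatrixTranspose is a counting sort: count how many times each column appears, turn the counts into an exclusive prefix sum (the transposed offsets), then scatter the row ids. A minimal host-side sketch of the same three steps on a tiny matrix (independent of the kernel; the small example matrix is made up for illustration):

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    // 3 rows x 4 cols, row_offset = 2 non-zeros per row, stored as column ids.
    const uint32_t num_rows = 3, num_cols = 4, row_offset = 2;
    const std::vector<uint16_t> src = {0, 2, 1, 3, 0, 1};  // rows: {0,2}, {1,3}, {0,1}

    // 1) count occurrences of each column
    std::vector<uint16_t> offset(num_cols, 0);
    for (uint16_t col : src) {
        offset[col]++;
    }

    // 2) exclusive prefix sum -> start offset of each column in the transpose
    uint16_t running = 0;
    for (uint32_t c = 0; c < num_cols; ++c) {
        const uint16_t count = offset[c];
        offset[c]            = running;
        running += count;
    }

    // 3) scatter row ids into the transposed layout
    std::vector<uint16_t> transposed(num_rows * row_offset);
    std::vector<uint16_t> cursor(offset);  // running write position per column
    for (uint32_t i = 0; i < src.size(); ++i) {
        const uint16_t col        = src[i];
        const uint16_t row        = static_cast<uint16_t>(i / row_offset);
        transposed[cursor[col]++] = row;
    }

    for (uint32_t c = 0; c < num_cols; ++c) {
        std::printf("col %u starts at %u\n", c, offset[c]);
    }
    for (uint16_t r : transposed) {
        std::printf("%u ", r);  // expected: 0 2 1 2 0 1
    }
    std::printf("\n");
    return 0;
}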
diff --git a/tests/RXMesh_test/test_vector.cu b/tests/RXMesh_test/test_vector.cu
index c39c6719..42322431 100644
--- a/tests/RXMesh_test/test_vector.cu
+++ b/tests/RXMesh_test/test_vector.cu
@@ -1,9 +1,9 @@
 #include "gtest/gtest.h"
 #include "rxmesh/util/vector.h"
 
-TEST(RXMESH, Vector)
+TEST(RXMesh, Vector)
 {
-    using namespace RXMESH;
+    using namespace rxmesh;
 
     // constrctors
     Vector3f v0(0.5f);