diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9ddff224..e0455c4b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.16)
 project(rmagine 
     LANGUAGES CXX C
-    VERSION 2.2.5)# TODO update this version when merging into main-branch
+    VERSION 2.2.6)# TODO update this version when merging into main-branch
 
 option(BUILD_TOOLS "Build tools" ON)
 option(BUILD_TESTS "Build tests" ON)
@@ -114,6 +114,9 @@ endif(OPENMP_FOUND)
 ########################################
 ## Optional Deps
 
+# for ouster config loading
+find_package(jsoncpp)
+
 ######################################
 ## CUDA: For Optix                  ##
 ######################################
@@ -178,6 +181,11 @@ set(RMAGINE_STATIC_LIBRARIES)
 ### RMAGINE CORE LIB
 add_subdirectory(src/rmagine_core)
 
+### RMAGINE OUSTER LIB
+if(jsoncpp_FOUND)
+    add_subdirectory(src/rmagine_ouster)
+endif(jsoncpp_FOUND)
+
 ### RMAGINE EMBREE LIB
 if(embree_FOUND)
   add_subdirectory(src/rmagine_embree)
@@ -193,7 +201,7 @@ endif(CUDA_FOUND)
 
 message(STATUS "${BoldCyan}Components build:${ColourReset}")
 foreach(LIBRARY ${RMAGINE_LIBRARIES})
-  message(STATUS "- ${BoldGreen}${LIBRARY}${ColourReset}")
+    message(STATUS "- ${BoldGreen}${LIBRARY}${ColourReset}")
 endforeach()
 
 #### TESTS
diff --git a/README.md b/README.md
index 9c89873f..0eb836bc 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ Rmagine
   <a href="https://github.com/uos/rmagine/issues">Issues</a>
   <span>&nbsp;&nbsp;•&nbsp;&nbsp;</span>
   <a href="https://github.com/amock/rmagine_examples">Examples</a>
+  <span>&nbsp;&nbsp;•&nbsp;&nbsp;</span>
+  <a href="https://github.com/amock/rmagine_viewer">Viewer</a>
   <br />
 </div>
 
@@ -116,7 +118,7 @@ More detailed examples explaining each step and how to customize it to your need
 
 Please reference the following papers when using the Rmagine library in your scientific work.
 
-```latex
+```bib
 @inproceedings{mock2023rmagine,
   title={{Rmagine: 3D Range Sensor Simulation in Polygonal Maps via Ray Tracing for Embedded Hardware on Mobile Robots}}, 
   author={Mock, Alexander and Wiemann, Thomas and Hertzberg, Joachim},
@@ -127,6 +129,7 @@ Please reference the following papers when using the Rmagine library in your sci
 ```
 
 ## Rmagine-accelerated Applications
+- [rmagine_viewer](https://github.com/amock/rmagine_viewer)
 - [rmagine_gazebo_plugins](https://github.com/uos/rmagine_gazebo_plugins)
 - [RMCL](https://github.com/uos/rmcl)
 - [radarays_ros](https://github.com/uos/radarays_ros)
diff --git a/apps/rmagine_benchmark/Main.cpp b/apps/rmagine_benchmark/Main.cpp
index 3cfc533b..91e8359f 100644
--- a/apps/rmagine_benchmark/Main.cpp
+++ b/apps/rmagine_benchmark/Main.cpp
@@ -116,7 +116,7 @@ int main(int argc, char** argv)
         // Define what to simulate
         double velos_per_second_mean = 0.0;
 
-        std::cout << "- range of last ray: " << cpu_sim->simulateRanges(Tbm)[Tbm.size() * model->phi.size * model->theta.size - 1] << std::endl;
+        // std::cout << "- range of last ray: " << cpu_sim->simulateRanges(Tbm)[Tbm.size() * model->phi.size * model->theta.size - 1] << std::endl;
         std::cout << "-- Starting Benchmark --" << std::endl;
 
         // predefine result buffer
diff --git a/dat/ouster_config/os0-128x1024x20-meta.json b/dat/ouster_config/os0-128x1024x20-meta.json
new file mode 100644
index 00000000..60026a7e
--- /dev/null
+++ b/dat/ouster_config/os0-128x1024x20-meta.json
@@ -0,0 +1,461 @@
+{
+    "beam_altitude_angles": 
+    [
+        45.31,
+        44.31,
+        43.58,
+        43.16,
+        42.31,
+        41.32,
+        40.61,
+        40.19,
+        39.32,
+        38.35,
+        37.66,
+        37.22,
+        36.36,
+        35.42,
+        34.73,
+        34.28,
+        33.4,
+        32.48,
+        31.81,
+        31.34,
+        30.47,
+        29.58,
+        28.9,
+        28.44,
+        27.55,
+        26.68,
+        26.01,
+        25.53,
+        24.64,
+        23.8,
+        23.15,
+        22.64,
+        21.76,
+        20.93,
+        20.28,
+        19.77,
+        18.88,
+        18.09,
+        17.44,
+        16.91,
+        16.01,
+        15.25,
+        14.61,
+        14.06,
+        13.16,
+        12.42,
+        11.78,
+        11.25,
+        10.32,
+        9.61,
+        8.97,
+        8.4,
+        7.49,
+        6.81,
+        6.18,
+        5.58,
+        4.66,
+        4.01,
+        3.38,
+        2.76,
+        1.84,
+        1.2,
+        0.57,
+        -0.06,
+        -0.98,
+        -1.6,
+        -2.22,
+        -2.89,
+        -3.8,
+        -4.4,
+        -5.03,
+        -5.7,
+        -6.62,
+        -7.2,
+        -7.83,
+        -8.52,
+        -9.43,
+        -10,
+        -10.65,
+        -11.37,
+        -12.27,
+        -12.83,
+        -13.47,
+        -14.21,
+        -15.12,
+        -15.65,
+        -16.3,
+        -17.06,
+        -17.97,
+        -18.47,
+        -19.12,
+        -19.91,
+        -20.83,
+        -21.32,
+        -21.98,
+        -22.78,
+        -23.71,
+        -24.18,
+        -24.83,
+        -25.67,
+        -26.59,
+        -27.05,
+        -27.7,
+        -28.56,
+        -29.48,
+        -29.92,
+        -30.6,
+        -31.48,
+        -32.4,
+        -32.84,
+        -33.52,
+        -34.4,
+        -35.32,
+        -35.76,
+        -36.44,
+        -37.36,
+        -38.28,
+        -38.71,
+        -39.4,
+        -40.36,
+        -41.27,
+        -41.69,
+        -42.38,
+        -43.36,
+        -44.29,
+        -44.69,
+        -45.4,
+        -46.42
+    ],
+    "beam_azimuth_angles": 
+    [
+        11.58,
+        4.26,
+        -2.91,
+        -9.95,
+        11.11,
+        4.08,
+        -2.85,
+        -9.65,
+        10.69,
+        3.9,
+        -2.8,
+        -9.39,
+        10.34,
+        3.75,
+        -2.75,
+        -9.18,
+        10.03,
+        3.62,
+        -2.72,
+        -8.99,
+        9.77,
+        3.5,
+        -2.71,
+        -8.84,
+        9.53,
+        3.4,
+        -2.68,
+        -8.72,
+        9.32,
+        3.3,
+        -2.68,
+        -8.6,
+        9.14,
+        3.21,
+        -2.69,
+        -8.52,
+        8.99,
+        3.13,
+        -2.69,
+        -8.45,
+        8.84,
+        3.06,
+        -2.69,
+        -8.41,
+        8.73,
+        2.99,
+        -2.7,
+        -8.37,
+        8.63,
+        2.94,
+        -2.73,
+        -8.36,
+        8.54,
+        2.9,
+        -2.75,
+        -8.35,
+        8.48,
+        2.85,
+        -2.77,
+        -8.37,
+        8.43,
+        2.83,
+        -2.81,
+        -8.4,
+        8.4,
+        2.78,
+        -2.84,
+        -8.43,
+        8.38,
+        2.76,
+        -2.87,
+        -8.51,
+        8.37,
+        2.72,
+        -2.91,
+        -8.57,
+        8.38,
+        2.72,
+        -2.96,
+        -8.65,
+        8.4,
+        2.7,
+        -3.03,
+        -8.76,
+        8.45,
+        2.7,
+        -3.08,
+        -8.87,
+        8.49,
+        2.69,
+        -3.15,
+        -9.01,
+        8.57,
+        2.69,
+        -3.22,
+        -9.18,
+        8.67,
+        2.7,
+        -3.31,
+        -9.36,
+        8.79,
+        2.71,
+        -3.41,
+        -9.58,
+        8.92,
+        2.73,
+        -3.52,
+        -9.81,
+        9.09,
+        2.77,
+        -3.63,
+        -10.1,
+        9.27,
+        2.79,
+        -3.78,
+        -10.42,
+        9.5,
+        2.84,
+        -3.93,
+        -10.8,
+        9.78,
+        2.9,
+        -4.11,
+        -11.23,
+        10.11,
+        2.95,
+        -4.34,
+        -11.75
+    ],
+    "build_date": "2022-09-21T17:47:45Z",
+    "build_rev": "v2.4.0",
+    "client_version": "ouster_client 0.7.1",
+    "data_format": 
+    {
+        "column_window": 
+        [
+            0,
+            1023
+        ],
+        "columns_per_frame": 1024,
+        "columns_per_packet": 16,
+        "pixel_shift_by_row": 
+        [
+            33,
+            12,
+            -8,
+            -28,
+            32,
+            12,
+            -8,
+            -27,
+            30,
+            11,
+            -8,
+            -27,
+            29,
+            11,
+            -8,
+            -26,
+            29,
+            10,
+            -8,
+            -26,
+            28,
+            10,
+            -8,
+            -25,
+            27,
+            10,
+            -8,
+            -25,
+            27,
+            9,
+            -8,
+            -24,
+            26,
+            9,
+            -8,
+            -24,
+            26,
+            9,
+            -8,
+            -24,
+            25,
+            9,
+            -8,
+            -24,
+            25,
+            9,
+            -8,
+            -24,
+            25,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -25,
+            24,
+            8,
+            -9,
+            -25,
+            24,
+            8,
+            -9,
+            -25,
+            24,
+            8,
+            -9,
+            -26,
+            24,
+            8,
+            -9,
+            -26,
+            25,
+            8,
+            -9,
+            -27,
+            25,
+            8,
+            -10,
+            -27,
+            25,
+            8,
+            -10,
+            -28,
+            26,
+            8,
+            -10,
+            -29,
+            26,
+            8,
+            -11,
+            -30,
+            27,
+            8,
+            -11,
+            -31,
+            28,
+            8,
+            -12,
+            -32,
+            29,
+            8,
+            -12,
+            -33
+        ],
+        "pixels_per_column": 128,
+        "udp_profile_imu": "LEGACY",
+        "udp_profile_lidar": "LEGACY"
+    },
+    "hostname": "",
+    "image_rev": "ousteros-image-prod-aries-v2.4.0+20220921174636",
+    "imu_to_sensor_transform": 
+    [
+        1,
+        0,
+        0,
+        6.253,
+        0,
+        1,
+        0,
+        -11.775,
+        0,
+        0,
+        1,
+        7.645,
+        0,
+        0,
+        0,
+        1
+    ],
+    "initialization_id": 7109749,
+    "json_calibration_version": 4,
+    "lidar_mode": "1024x20",
+    "lidar_origin_to_beam_origin_mm": 27.67,
+    "lidar_to_sensor_transform": 
+    [
+        -1,
+        0,
+        0,
+        0,
+        0,
+        -1,
+        0,
+        0,
+        0,
+        0,
+        1,
+        36.18,
+        0,
+        0,
+        0,
+        1
+    ],
+    "prod_line": "OS-0-128",
+    "prod_pn": "840-103574-06",
+    "prod_sn": "122219001184",
+    "status": "RUNNING",
+    "udp_port_imu": 55235,
+    "udp_port_lidar": 52031
+}
diff --git a/src/rmagine_core/CMakeLists.txt b/src/rmagine_core/CMakeLists.txt
index 9bcfe03d..9ab8b04b 100644
--- a/src/rmagine_core/CMakeLists.txt
+++ b/src/rmagine_core/CMakeLists.txt
@@ -12,6 +12,7 @@ set(RMAGINE_CORE_SRCS
     src/types/Memory.cpp
     src/types/conversions.cpp
     src/types/sensors.cpp
+    src/types/mesh_types.cpp
     # Util
     src/util/synthetic.cpp
     src/util/assimp/helper.cpp
diff --git a/src/rmagine_core/include/rmagine/math/definitions.h b/src/rmagine_core/include/rmagine/math/definitions.h
deleted file mode 100644
index 6b585dc1..00000000
--- a/src/rmagine_core/include/rmagine/math/definitions.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef RMAGINE_MATH_DEFINITIONS_OLD_H
-#define RMAGINE_MATH_DEFINITIONS_OLD_H
-
-#include <math.h>
-#include <float.h>
-#include <stdint.h>
-
-namespace rmagine
-{
-
-#define __UINT_MAX__ (__INT_MAX__ * 2U + 1U)
-
-#define DEG_TO_RAD      0.017453292519943295
-#define DEG_TO_RAD_F    0.017453292519943295f
-#define RAD_TO_DEG      57.29577951308232
-#define RAD_TO_DEG_F    57.29577951308232f
-
-// Forward declarations
-struct Vector2;
-struct Vector3;
-struct EulerAngles;
-struct Quaternion;
-struct Transform;
-struct Matrix3x3;
-struct Matrix4x4;
-struct AABB;
-
-} // namespace rmagine
-
-#endif // RMAGINE_MATH_DEFINITIONS_H
\ No newline at end of file
diff --git a/src/rmagine_core/include/rmagine/math/linalg.h b/src/rmagine_core/include/rmagine/math/linalg.h
index 5a6b60b8..58bd0a35 100644
--- a/src/rmagine_core/include/rmagine/math/linalg.h
+++ b/src/rmagine_core/include/rmagine/math/linalg.h
@@ -41,6 +41,7 @@
 #define RMAGINE_MATH_LINALG_H
 
 #include "types.h"
+#include <rmagine/types/shared_functions.h>
 
 namespace rmagine
 {
@@ -53,8 +54,10 @@ namespace rmagine
  * @param scale   scale vector
  * @return Matrix4x4   composed 4x4 transformation matrix
  */
+RMAGINE_FUNCTION
 Matrix4x4 compose(const Transform& T, const Vector3& scale);
 
+RMAGINE_FUNCTION
 Matrix4x4 compose(const Transform& T, const Matrix3x3& S);
 
 /**
@@ -65,6 +68,7 @@ Matrix4x4 compose(const Transform& T, const Matrix3x3& S);
  * @param T transform object consisting of translational and rotational parts
  * @param S 3x3 scale matrix
  */
+RMAGINE_FUNCTION
 void decompose(const Matrix4x4& M, Transform& T, Matrix3x3& S);
 
 /**
@@ -75,6 +79,7 @@ void decompose(const Matrix4x4& M, Transform& T, Matrix3x3& S);
  * @param T transform object consisting of translational and rotational parts
  * @param scale scale vector
  */
+RMAGINE_FUNCTION
 void decompose(const Matrix4x4& M, Transform& T, Vector3& scale);
 
 /**
@@ -86,6 +91,7 @@ void decompose(const Matrix4x4& M, Transform& T, Vector3& scale);
  * - if fac=1.0 it is exactly B
  * - if fac=2.0 it it goes on a (B-A) length from B (extrapolation)
 */
+RMAGINE_FUNCTION
 Quaternion polate(const Quaternion& A, const Quaternion& B, float fac);
 
 /**
@@ -97,8 +103,69 @@ Quaternion polate(const Quaternion& A, const Quaternion& B, float fac);
  * - if fac=1.0 it is exactly B
  * - if fac=2.0 it it goes on a (B-A) length from B (extrapolation)
 */
+RMAGINE_FUNCTION
 Transform polate(const Transform& A, const Transform& B, float fac);
 
+
+// Numerical Recipes
+// M = MatrixT::rows()
+// N = MatrixT::cols()
+// 
+// Warning: Numerical Recipes has different SVD matrix shapes
+// than Wikipedia
+template<typename MatrixT>
+struct svd_dims {
+    using U = MatrixT; // same as input
+    using w = Matrix_<typename MatrixT::Type, MatrixT::cols(), 1>;
+    using W = Matrix_<typename MatrixT::Type, MatrixT::cols(), MatrixT::cols()>;
+    using V = Matrix_<typename MatrixT::Type, MatrixT::cols(), MatrixT::cols()>;
+};
+
+/**
+ * @brief own SVD implementation. 
+ * Why use it? 
+ * - ~2x faster than Eigen
+ * - SOON: Works insided of CUDA kernels
+ *
+ */
+template<typename DataT, unsigned int Rows, unsigned int Cols>
+void svd(
+    const Matrix_<DataT, Rows, Cols>& A, 
+    Matrix_<DataT, Rows, Cols>& U,
+    Matrix_<DataT, Cols, Cols>& W, // matrix
+    Matrix_<DataT, Cols, Cols>& V
+);
+
+template<typename DataT, unsigned int Rows, unsigned int Cols>
+void svd(
+    const Matrix_<DataT, Rows, Cols>& A,
+    Matrix_<DataT, Rows, Cols>& U,
+    Matrix_<DataT, Cols, 1>& w, // vector version (Cols should be something with max)
+    Matrix_<DataT, Cols, Cols>& V
+);
+
+/**
+ * @brief SVD that can be used for both CPU and GPU (Cuda kernels)
+ *
+ */
+RMAGINE_FUNCTION
+void svd(
+    const Matrix3x3& A,
+    Matrix3x3& U,
+    Matrix3x3& W,
+    Matrix3x3& V
+);
+
+RMAGINE_FUNCTION
+void svd(
+    const Matrix3x3& A, 
+    Matrix3x3& U,
+    Vector3& w,
+    Matrix3x3& V
+);
+
 } // namespace rmagine
 
+#include "linalg.tcc"
+
 #endif // RMAGINE_MATH_MATH_LINALG_H
\ No newline at end of file
diff --git a/src/rmagine_core/include/rmagine/math/linalg.tcc b/src/rmagine_core/include/rmagine/math/linalg.tcc
new file mode 100644
index 00000000..ef7a3762
--- /dev/null
+++ b/src/rmagine_core/include/rmagine/math/linalg.tcc
@@ -0,0 +1,582 @@
+#include <rmagine/math/types.h>
+#include <rmagine/math/math.h>
+
+namespace rmagine
+{
+
+template<typename DataT, unsigned int Rows, unsigned int Cols>
+void svd(
+    const Matrix_<DataT, Rows, Cols>& a, 
+    Matrix_<DataT, Rows, Cols>& u,
+    Matrix_<DataT, Cols, Cols>& w,
+    Matrix_<DataT, Cols, Cols>& v)
+{
+    constexpr unsigned int m = Rows;
+    constexpr unsigned int n = Cols;
+
+    constexpr unsigned int max_iterations = 30;
+
+    // extra memory required
+    bool flag;
+    int i, its, j, jj, k, l, nm;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    DataT rv1[n];
+
+    g = scale = anorm = 0.0;
+    const float eps = std::numeric_limits<float>::epsilon();
+    u = a;
+
+    for(i=0; i < n; i++) 
+    {
+        l = i + 2;
+        rv1[i] = scale * g;
+        g = s = scale = 0.0;
+        if(i < m) 
+        {
+            for(k=i; k<m; k++) 
+            {
+                scale += abs(u(k,i));
+            }
+            if(scale != 0.0) 
+            {
+                for(k=i; k<m; k++) 
+                {
+                    u(k,i) /= scale;
+                    s += u(k,i) * u(k,i);
+                }
+                f = u(i,i);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,i) = f-g;
+                for(j=l-1;j<n;j++) 
+                {
+                    for (s=0.0,k=i;k<m;k++) 
+                    {
+                        s += u(k,i) * u(k,j);
+                    }
+                    f = s/h;
+                    for (k=i;k<m;k++)
+                    {
+                        u(k,j) += f * u(k,i);
+                    }
+                }
+                for (k=i;k<m;k++)
+                {
+                    u(k,i) *= scale;
+                }
+            }
+        }
+        w(i, i) = scale * g;
+        g = s = scale = 0.0;
+        if(i+1 <= m && i+1 != n)
+        {
+            for(k=l-1; k<n; k++)
+            {
+                scale += abs(u(i,k));
+            }
+            if(scale != 0.0)
+            {
+                for(k=l-1;k<n;k++) 
+                {
+                    u(i, k) /= scale;
+                    s += u(i,k) * u(i,k);
+                }
+                f = u(i, l-1);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,l-1) = f-g;
+                for(k=l-1;k<n;k++)
+                {
+                    rv1[k] = u(i,k) / h;
+                }
+                for(j=l-1; j<m; j++)
+                {
+                    for (s=0.0,k=l-1; k<n; k++)
+                    {
+                        s += u(j,k) * u(i,k);
+                    }
+                    for (k=l-1; k<n;k++)
+                    {
+                        u(j,k) += s * rv1[k];
+                    }
+                }
+                for(k=l-1; k<n; k++)
+                {
+                    u(i,k) *= scale;
+                }
+            }
+        }
+        anorm = MAX(anorm, (abs(w(i, i))+abs(rv1[i])));
+    }
+    for(i=n-1; i>=0; i--)
+    {
+        if(i < n-1) 
+        {
+            if(g != 0.0)
+            {
+                for(j=l; j<n; j++)
+                {
+                    v(j,i) = (u(i,j)/u(i,l)) / g;
+                }
+                for(j=l; j<n; j++)
+                {
+                    for (s=0.0,k=l;k<n;k++) 
+                    {
+                        s += u(i,k) * v(k,j);
+                    }
+                    for (k=l; k<n; k++)
+                    {
+                        v(k,j) += s * v(k,i);
+                    }
+                }
+            }
+            for (j=l; j<n; j++) 
+            {
+                v(i,j) = 0.0;
+                v(j,i) = 0.0;
+            }
+        }
+        v(i,i) = 1.0;
+        g = rv1[i];
+        l = i;
+    }
+    for(i=MIN(m,n)-1; i>=0; i--)
+    {
+        l = i+1;
+        g = w(i, i);
+        for(j=l;j<n;j++) 
+        {
+            u(i,j) = 0.0;
+        }
+        if(g != 0.0)
+        {
+            g = 1.0/g;
+            for(j=l; j<n; j++)
+            {
+                for (s=0.0,k=l; k<m; k++) 
+                {
+                    s += u(k,i)*u(k,j);
+                }
+                f = (s/u(i,i)) * g;
+                for (k=i; k<m; k++) 
+                {
+                    u(k,j) += f * u(k,i);
+                }
+            }
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) *= g;
+            }
+        } else {
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) = 0.0;
+            } 
+        }
+        ++u(i,i);
+    }
+    for(k=n-1; k>=0; k--) 
+    {
+        for(its=0; its<max_iterations; its++) 
+        {
+            flag=true;
+            for(l=k; l>=0; l--) 
+            {
+                nm=l-1;
+                if (l == 0 || abs(rv1[l]) <= eps*anorm) {
+                    flag=false;
+                    break;
+                }
+                if (abs(w(nm, nm)) <= eps*anorm) 
+                {
+                    break;
+                }
+            }
+            if(flag) 
+            {
+                c=0.0;
+                s=1.0;
+                for(i=l; i<k+1; i++) 
+                {
+                    f = s*rv1[i];
+                    rv1[i] = c*rv1[i];
+                    if(abs(f) <= eps*anorm) 
+                    {
+                        break;
+                    }
+                    g = w(i, i);
+                    h = PYTHAG(f,g);
+                    w(i, i) = h;
+                    h = 1.0/h;
+                    c = g*h;
+                    s = -f*h;
+                    for(j=0; j<m; j++)
+                    {
+                        y = u(j,nm);
+                        z = u(j,i);
+                        u(j,nm) = y*c+z*s;
+                        u(j,i) = z*c-y*s;
+                    }
+                }
+            }
+            z = w(k, k);
+            if (l == k)
+            {
+                if (z < 0.0)
+                {
+                    w(k,k) = -z;
+                    for (j=0;j<n;j++) 
+                    {
+                        v(j,k) = -v(j,k);
+                    }
+                }
+                break;
+            }
+            if (its == max_iterations - 1) 
+            {
+                throw std::runtime_error("no convergence after max svdcmp iterations");
+            }
+            x = w(l, l);
+            nm = k-1;
+            y = w(nm, nm);
+            g = rv1[nm];
+            h = rv1[k];
+            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
+            g = PYTHAG(f, (DataT)1.0);
+            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+            c = s = 1.0;
+            for (j=l; j<=nm; j++) 
+            {
+                i = j+1;
+                g = rv1[i];
+                y = w(i, i);
+                h = s*g;
+                g = c*g;
+                z = PYTHAG(f,h);
+                rv1[j] = z;
+                c = f/z;
+                s = h/z;
+                f = x*c+g*s;
+                g = g*c-x*s;
+                h = y*s;
+                y *= c;
+                for (jj=0;jj<n;jj++)
+                {
+                    x = v(jj,j);
+                    z = v(jj,i);
+                    v(jj,j) = x*c+z*s;
+                    v(jj,i) = z*c-x*s;
+                }
+                z = PYTHAG(f,h);
+                w(j, j) = z;
+                if (z) 
+                {
+                    z = 1.0/z;
+                    c = f*z;
+                    s = h*z;
+                }
+                f = c*g+s*y;
+                x = c*y-s*g;
+                for (jj=0;jj<m;jj++)
+                {
+                    y = u(jj,j);
+                    z = u(jj,i);
+                    u(jj,j) = y*c+z*s;
+                    u(jj,i) = z*c-y*s;
+                }
+            }
+            rv1[l] = 0.0;
+            rv1[k] = f;
+            w(k, k) = x;
+        }
+    }
+}
+
+template<typename DataT, unsigned int Rows, unsigned int Cols>
+void svd(
+    const Matrix_<DataT, Rows, Cols>& a, 
+    Matrix_<DataT, Rows, Cols>& u,
+    Matrix_<DataT, Cols, 1>& w, // vector version
+    Matrix_<DataT, Cols, Cols>& v)
+{
+    constexpr unsigned int m = Rows;
+    constexpr unsigned int n = Cols;
+    constexpr unsigned int max_iterations = 30;
+    
+    // additional memory required
+    bool flag;
+    int i, its, j, jj, k, l, nm;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    DataT rv1[n];
+    
+    g = scale = anorm = 0.0;
+    float eps = std::numeric_limits<float>::epsilon();
+    u = a;
+
+    for(i=0; i < n; i++) 
+    {
+        l = i+2;
+        rv1[i] = scale*g;
+        g = s = scale = 0.0;
+        if(i < m) 
+        {
+            for(k=i; k<m; k++) 
+            {
+                scale += abs(u(k,i));
+            }
+            if(scale != 0.0) 
+            {
+                for(k=i; k<m; k++) 
+                {
+                    u(k,i) /= scale;
+                    s += u(k,i) * u(k,i);
+                }
+                f = u(i,i);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,i) = f-g;
+                for(j=l-1;j<n;j++) 
+                {
+                    for (s=0.0,k=i;k<m;k++) 
+                    {
+                        s += u(k,i) * u(k,j);
+                    }
+                    f = s/h;
+                    for (k=i;k<m;k++)
+                    {
+                        u(k,j) += f * u(k,i);
+                    }
+                }
+                for (k=i;k<m;k++)
+                {
+                    u(k,i) *= scale;
+                }
+            }
+        }
+        w(i, 0) = scale * g;
+        g = s = scale = 0.0;
+        if(i+1 <= m && i+1 != n)
+        {
+            for(k=l-1; k<n; k++)
+            {
+                scale += abs(u(i,k));
+            }
+            if(scale != 0.0)
+            {
+                for(k=l-1;k<n;k++) 
+                {
+                    u(i, k) /= scale;
+                    s += u(i,k) * u(i,k);
+                }
+                f = u(i, l-1);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,l-1) = f-g;
+                for(k=l-1;k<n;k++)
+                {
+                    rv1[k] = u(i,k) / h;
+                }
+                for(j=l-1; j<m; j++)
+                {
+                    for (s=0.0,k=l-1; k<n; k++)
+                    {
+                        s += u(j,k) * u(i,k);
+                    }
+                    for (k=l-1; k<n;k++)
+                    {
+                        u(j,k) += s * rv1[k];
+                    }
+                }
+                for(k=l-1; k<n; k++)
+                {
+                    u(i,k) *= scale;
+                }
+            }
+        }
+        anorm = MAX(anorm, (abs(w(i, 0))+abs(rv1[i])));
+    }
+    for(i=n-1; i>=0; i--)
+    {
+        if(i < n-1) 
+        {
+            if(g != 0.0)
+            {
+                for(j=l; j<n; j++)
+                {
+                    v(j,i) = (u(i,j)/u(i,l)) / g;
+                }
+                for(j=l; j<n; j++)
+                {
+                    for (s=0.0,k=l;k<n;k++) 
+                    {
+                        s += u(i,k) * v(k,j);
+                    }
+                    for (k=l; k<n; k++)
+                    {
+                        v(k,j) += s * v(k,i);
+                    }
+                }
+            }
+            for (j=l; j<n; j++) 
+            {
+                v(i,j) = 0.0;
+                v(j,i) = 0.0;
+            }
+        }
+        v(i,i) = 1.0;
+        g = rv1[i];
+        l = i;
+    }
+    for(i=MIN(m,n)-1; i>=0; i--)
+    {
+        l = i+1;
+        g = w(i, 0);
+        for(j=l;j<n;j++) 
+        {
+            u(i,j) = 0.0;
+        }
+        if(g != 0.0)
+        {
+            g = 1.0/g;
+            for(j=l; j<n; j++)
+            {
+                for (s=0.0,k=l; k<m; k++) 
+                {
+                    s += u(k,i)*u(k,j);
+                }
+                f = (s/u(i,i)) * g;
+                for (k=i; k<m; k++) 
+                {
+                    u(k,j) += f * u(k,i);
+                }
+            }
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) *= g;
+            }
+        } else {
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) = 0.0;
+            } 
+        }
+        ++u(i,i);
+    }
+    for(k=n-1; k>=0; k--) 
+    {
+        for(its=0; its<max_iterations; its++) 
+        {
+            flag=true;
+            for(l=k; l>=0; l--) 
+            {
+                nm=l-1;
+                if (l == 0 || abs(rv1[l]) <= eps*anorm) {
+                    flag=false;
+                    break;
+                }
+                if (abs(w(nm, 0)) <= eps*anorm) 
+                {
+                    break;
+                }
+            }
+            if(flag) 
+            {
+                c=0.0;
+                s=1.0;
+                for(i=l; i<k+1; i++) 
+                {
+                    f = s*rv1[i];
+                    rv1[i] = c*rv1[i];
+                    if(abs(f) <= eps*anorm) 
+                    {
+                        break;
+                    }
+                    g = w(i, 0);
+                    h = PYTHAG(f,g);
+                    w(i, 0) = h;
+                    h = 1.0/h;
+                    c = g*h;
+                    s = -f*h;
+                    for(j=0; j<m; j++)
+                    {
+                        y = u(j,nm);
+                        z = u(j,i);
+                        u(j,nm) = y*c+z*s;
+                        u(j,i) = z*c-y*s;
+                    }
+                }
+            }
+            z = w(k, 0);
+            if (l == k)
+            {
+                if (z < 0.0)
+                {
+                    w(k,0) = -z;
+                    for (j=0;j<n;j++) 
+                    {
+                        v(j,k) = -v(j,k);
+                    }
+                }
+                break;
+            }
+            if (its == max_iterations - 1) 
+            {
+                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+            }
+            x = w(l, 0);
+            nm = k-1;
+            y = w(nm, 0);
+            g = rv1[nm];
+            h = rv1[k];
+            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
+            g = PYTHAG(f, (DataT)1.0);
+            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+            c = s = 1.0;
+            for (j=l;j<=nm;j++) 
+            {
+                i = j+1;
+                g = rv1[i];
+                y = w(i, 0);
+                h = s*g;
+                g = c*g;
+                z = PYTHAG(f,h);
+                rv1[j] = z;
+                c = f/z;
+                s = h/z;
+                f = x*c+g*s;
+                g = g*c-x*s;
+                h = y*s;
+                y *= c;
+                for (jj=0;jj<n;jj++)
+                {
+                    x = v(jj,j);
+                    z = v(jj,i);
+                    v(jj,j) = x*c+z*s;
+                    v(jj,i) = z*c-x*s;
+                }
+                z = PYTHAG(f,h);
+                w(j, 0) = z;
+                if (z) 
+                {
+                    z = 1.0/z;
+                    c = f*z;
+                    s = h*z;
+                }
+                f = c*g+s*y;
+                x = c*y-s*g;
+                for (jj=0;jj<m;jj++)
+                {
+                    y = u(jj,j);
+                    z = u(jj,i);
+                    u(jj,j) = y*c+z*s;
+                    u(jj,i) = z*c-y*s;
+                }
+            }
+            rv1[l] = 0.0;
+            rv1[k] = f;
+            w(k, 0) = x;
+        }
+    }
+}
+
+} // namespace rmagine
diff --git a/src/rmagine_core/include/rmagine/math/math.h b/src/rmagine_core/include/rmagine/math/math.h
index 39c341fe..eb420793 100644
--- a/src/rmagine_core/include/rmagine/math/math.h
+++ b/src/rmagine_core/include/rmagine/math/math.h
@@ -49,6 +49,67 @@
 namespace rmagine
 {
 
+///////
+// TODO: Integrate the following functions better / somewhere else
+//////
+template<class T>
+RMAGINE_INLINE_FUNCTION
+T SQR(const T a) {return a*a;}
+
+template<class T>
+RMAGINE_INLINE_FUNCTION
+const T &MAX(const T &a, const T &b)
+    {return b > a ? (b) : (a);}
+
+RMAGINE_INLINE_FUNCTION
+float MAX(const double &a, const float &b)
+    {return b > a ? (b) : float(a);}
+
+RMAGINE_INLINE_FUNCTION
+float MAX(const float &a, const double &b)
+    {return b > a ? float(b) : (a);}
+
+template<class T>
+RMAGINE_INLINE_FUNCTION
+const T &MIN(const T &a, const T &b)
+    {return b < a ? (b) : (a);}
+
+RMAGINE_INLINE_FUNCTION
+float MIN(const double &a, const float &b)
+    {return b < a ? (b) : float(a);}
+
+RMAGINE_INLINE_FUNCTION
+float MIN(const float &a, const double &b)
+    {return b < a ? float(b) : (a);}
+
+template<class T>
+RMAGINE_INLINE_FUNCTION
+T SIGN(const T &a, const T &b)
+    {return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);}
+
+RMAGINE_INLINE_FUNCTION
+float SIGN(const float &a, const double &b)
+    {return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);}
+
+RMAGINE_INLINE_FUNCTION
+float SIGN(const double &a, const float &b)
+    {return (float)(b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a));}
+
+template<class T>
+RMAGINE_INLINE_FUNCTION
+void SWAP(T &a, T &b)
+    {T dum=a; a=b; b=dum;}
+
+template<typename T>
+RMAGINE_INLINE_FUNCTION
+T PYTHAG(const T a, const T b)
+{
+    T absa = abs(a);
+    T absb = abs(b);
+    return (absa > absb ? absa * sqrt(1.0+SQR(absb/absa)) :
+        (absb == 0.0 ? 0.0 : absb * sqrt(1.0+SQR(absa/absb))));
+}
+
 
 template<typename DataT>
 Vector3_<DataT> min(const Vector3_<DataT>& a, const Vector3_<DataT>& b);
@@ -56,7 +117,6 @@ Vector3_<DataT> min(const Vector3_<DataT>& a, const Vector3_<DataT>& b);
 template<typename DataT>
 Vector3_<DataT> max(const Vector3_<DataT>& a, const Vector3_<DataT>& b);
 
-
 /////////////
 // #multNxN
 ////////
@@ -371,6 +431,29 @@ Memory<Matrix3x3, RAM> cov(
     const MemoryView<Vector, RAM>& v2
 );
 
+/**
+ * @brief decompose A = UWV* using singular value decomposition
+ */
+void svd(
+    const MemoryView<Matrix3x3, RAM>& As,
+    MemoryView<Matrix3x3, RAM>& Us,
+    MemoryView<Matrix3x3, RAM>& Ws,
+    MemoryView<Matrix3x3, RAM>& Vs
+);
+
+/**
+ * @brief decompose A = UWV* using singular value decomposition
+ * 
+ * w is a vector which is the diagonal of matrix W
+ */
+void svd(
+    const MemoryView<Matrix3x3, RAM>& As,
+    MemoryView<Matrix3x3, RAM>& Us,
+    MemoryView<Vector3, RAM>& ws,
+    MemoryView<Matrix3x3, RAM>& Vs
+);
+
+
 } // namespace rmagine
 
 #include "math.tcc"
diff --git a/src/rmagine_core/include/rmagine/math/types/Matrix.hpp b/src/rmagine_core/include/rmagine/math/types/Matrix.hpp
index 46cdc561..22e824b5 100644
--- a/src/rmagine_core/include/rmagine/math/types/Matrix.hpp
+++ b/src/rmagine_core/include/rmagine/math/types/Matrix.hpp
@@ -333,6 +333,17 @@ struct Matrix_ {
     RMAGINE_INLINE_FUNCTION
     operator EulerAngles_<DataT>() const;
 
+    /**
+     * @brief Transformation Matrix -> Transform
+     * WARNING: The matrix has to be isometric, i.e. composed only of
+     * rotational and translational components. If it has e.g. scalar
+     * components use the "decompose" function instead
+     * 
+     * @return Transform_<DataT>
+     */
+    RMAGINE_INLINE_FUNCTION
+    operator Transform_<DataT>() const;
+
     /**
      * @brief Data Type Cast to ConvT
      * 
diff --git a/src/rmagine_core/include/rmagine/math/types/Matrix.tcc b/src/rmagine_core/include/rmagine/math/types/Matrix.tcc
index 5973ce91..5fe5271e 100644
--- a/src/rmagine_core/include/rmagine/math/types/Matrix.tcc
+++ b/src/rmagine_core/include/rmagine/math/types/Matrix.tcc
@@ -1256,7 +1256,7 @@ Matrix_<DataT, Rows, Cols>::operator EulerAngles_<DataT>() const
     // pitch (y-axis)
     if (fabs(sB) >= 1.0)
     {
-        e.pitch = copysignf(M_PI / 2, sB); // use 90 degrees if out of range
+        e.pitch = copysignf(M_PI_2, sB); // use 90 degrees if out of range
     } else {
         e.pitch = asinf(sB);
     }
@@ -1267,6 +1267,18 @@ Matrix_<DataT, Rows, Cols>::operator EulerAngles_<DataT>() const
     return e;
 }
 
+template<typename DataT, unsigned int Rows, unsigned int Cols> 
+RMAGINE_INLINE_FUNCTION
+Matrix_<DataT, Rows, Cols>::operator Transform_<DataT>() const 
+{
+    static_assert(Rows == 4 && Cols == 4);
+    
+    Transform_<DataT> T;
+    T.set(*this);
+    return T;
+}
+
+
 
 template<typename DataT, unsigned int Rows, unsigned int Cols> 
 template<typename ConvT>
diff --git a/src/rmagine_core/include/rmagine/math/types/Quaternion.tcc b/src/rmagine_core/include/rmagine/math/types/Quaternion.tcc
index a3742652..b4dac7da 100644
--- a/src/rmagine_core/include/rmagine/math/types/Quaternion.tcc
+++ b/src/rmagine_core/include/rmagine/math/types/Quaternion.tcc
@@ -182,8 +182,6 @@ template<typename DataT>
 RMAGINE_INLINE_FUNCTION
 Quaternion_<DataT>::operator EulerAngles_<DataT>() const 
 {
-    constexpr DataT PI_HALF = M_PI / 2.0;
-
     // TODO: check
     // https://en.wikipedia.org/wiki/Conversion_between_quaternions_and_Euler_angles
     // checked once
@@ -215,7 +213,7 @@ Quaternion_<DataT>::operator EulerAngles_<DataT>() const
     // pitch (y-axis)
     if (fabs(sinp) >= 1.0f)
     {
-        e.pitch = copysign(PI_HALF, sinp); // use 90 degrees if out of range
+        e.pitch = copysign(M_PI_2, sinp); // use 90 degrees if out of range
     } else {
         e.pitch = asin(sinp);
     }
diff --git a/src/rmagine_core/include/rmagine/math/types/Transform.hpp b/src/rmagine_core/include/rmagine/math/types/Transform.hpp
index 2ad90aa9..baef4389 100644
--- a/src/rmagine_core/include/rmagine/math/types/Transform.hpp
+++ b/src/rmagine_core/include/rmagine/math/types/Transform.hpp
@@ -38,6 +38,12 @@ struct Transform_
     RMAGINE_INLINE_FUNCTION
     void setIdentity();
 
+    /**
+     * @brief Setting the transform from an 4x4 transformation matrix
+     * WARNING matrix must be isometric, i.e. must only contain
+     * rotational and translational parts (not scale). Otherwise,
+     * use "decompose" function
+     */
     RMAGINE_INLINE_FUNCTION
     void set(const Matrix_<DataT, 4, 4>& M);
 
@@ -76,13 +82,6 @@ struct Transform_
     RMAGINE_INLINE_FUNCTION
     Transform_<DataT> pow(const DataT& exp) const;
 
-    // OPERATORS
-    RMAGINE_INLINE_FUNCTION
-    void operator=(const Matrix_<DataT, 4, 4>& M)
-    {
-        set(M);
-    }
-
     RMAGINE_INLINE_FUNCTION
     Transform_<DataT> operator~() const
     {
@@ -108,6 +107,20 @@ struct Transform_
         return mult(v);
     }
 
+    /////////////////////
+    // CASTING
+
+    /**
+     * @brief Transform -> Matrix4x4
+     * 
+     * @return Matrix_<DataT> 
+     */
+    RMAGINE_INLINE_FUNCTION
+    operator Matrix_<DataT, 4, 4>() const;
+
+    /**
+     * @brief Internal data type cast
+     */
     template<typename ConvT>
     Transform_<ConvT> cast() const
     {
diff --git a/src/rmagine_core/include/rmagine/math/types/Transform.tcc b/src/rmagine_core/include/rmagine/math/types/Transform.tcc
index 02fd878a..8e8d6b5a 100644
--- a/src/rmagine_core/include/rmagine/math/types/Transform.tcc
+++ b/src/rmagine_core/include/rmagine/math/types/Transform.tcc
@@ -78,4 +78,13 @@ Transform_<DataT> Transform_<DataT>::pow(const DataT& exp) const
     return res;
 }
 
+template<typename DataT>
+RMAGINE_INLINE_FUNCTION
+Transform_<DataT>::operator Matrix_<DataT, 4, 4>() const 
+{
+    Matrix_<DataT, 4, 4> M;
+    M.set(*this);
+    return M;
+}
+
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_core/include/rmagine/math/types/Vector3.hpp b/src/rmagine_core/include/rmagine/math/types/Vector3.hpp
index e4a0b669..6a249511 100644
--- a/src/rmagine_core/include/rmagine/math/types/Vector3.hpp
+++ b/src/rmagine_core/include/rmagine/math/types/Vector3.hpp
@@ -245,6 +245,31 @@ struct Vector3_
     {   
         return multEwise(b);
     }
+
+    // Use with care!
+    RMAGINE_INLINE_FUNCTION
+    DataT& operator[](const size_t& idx)
+    {
+        return *(&x + idx);
+    }
+
+    RMAGINE_INLINE_FUNCTION
+    const DataT& operator[](const size_t& idx) const
+    {
+        return *(&x + idx);
+    }
+
+    RMAGINE_INLINE_FUNCTION
+    DataT& operator()(const size_t& idx)
+    {
+        return *(&x + idx);
+    }
+
+    RMAGINE_INLINE_FUNCTION
+    const DataT& operator()(const size_t& idx) const
+    {
+        return *(&x + idx);
+    }
 };
 
 } // namespace rmagine
diff --git a/src/rmagine_core/include/rmagine/math/types/definitions.h b/src/rmagine_core/include/rmagine/math/types/definitions.h
index 39de2857..39913160 100644
--- a/src/rmagine_core/include/rmagine/math/types/definitions.h
+++ b/src/rmagine_core/include/rmagine/math/types/definitions.h
@@ -17,7 +17,6 @@ namespace rmagine
 
 // Forward declarations
 
-
 template<typename DataT>
 struct Vector2_;
 
@@ -41,6 +40,8 @@ struct AABB_;
 
 
 using Vector2f = Vector2_<float>;
+using Vector2u = Vector2_<uint32_t>;
+using Vector2i = Vector2_<int32_t>;
 using Vector3f = Vector3_<float>;
 using Matrix2x2f = Matrix_<float, 2, 2>;
 using Matrix3x3f = Matrix_<float, 3, 3>;
@@ -79,17 +80,12 @@ using AABB = AABB_<DefaultFloatType>;
 using Vector = Vector3;
 using Point = Vector;
 
+// @amock TODO: how to define a pixel? unsigned or signed?
+// - projection operations can result in negative pixels
+// using Pixel = Vector2u;
+// using Pixel = Vector2i;
 
 
-// struct Vector2;
-// struct Vector3;
-// struct EulerAngles;
-// struct Quaternion;
-// struct Transform;
-// struct Matrix3x3;
-// struct Matrix4x4;
-// struct AABB;
-
 } // namespace rmagine
 
 #endif // RMAGINE_MATH_DEFINITIONS_H
\ No newline at end of file
diff --git a/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp b/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp
index aa8fe2cd..9942a1e1 100644
--- a/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp
+++ b/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp
@@ -97,7 +97,6 @@ struct FaceIds {
     Memory<unsigned int, MemT> face_ids;
 };
 
-
 /**
  * @brief GeomIds computed by the simulators
  * 
@@ -127,6 +126,13 @@ struct ObjectIds {
 };
 
 
+/**
+ * @brief Convenience object if we want to access all attributes at intersection
+ * 
+ * WARNING: use with care; It causes slower runtime in contrast to a more specific 
+ *          choice of attributes
+ * 
+ */
 template<typename MemT>
 using IntAttrAll = Bundle<
     Hits<MemT>,
@@ -156,7 +162,7 @@ template<typename MemT, typename BundleT>
 static void resize_memory_bundle(BundleT& res, 
     unsigned int W,
     unsigned int H,
-    unsigned int N )
+    unsigned int N = 1 )
 {
     if constexpr(BundleT::template has<Hits<MemT> >())
     {
@@ -194,24 +200,6 @@ static void resize_memory_bundle(BundleT& res,
     }
 }
 
-// template<typename BundleT>
-// static void resize_memory_bundle(BundleT& res, 
-//     unsigned int W,
-//     unsigned int H,
-//     unsigned int N )
-// {
-//     resize_memory_bundle_<RAM>(res, W, H, N);
-// }
-
-template<typename MemT, typename BundleT>
-[[deprecated("Use resize_memory_bundle() instead.")]]
-void resizeMemoryBundle(BundleT& res, 
-    unsigned int W,
-    unsigned int H,
-    unsigned int N )
-{
-    resize_memory_bundle<BundleT>(res, W, H, N);
-}
 
 } // namespace rmagine
 
diff --git a/src/rmagine_core/include/rmagine/types/Bundle.hpp b/src/rmagine_core/include/rmagine/types/Bundle.hpp
index cd015811..24952635 100644
--- a/src/rmagine_core/include/rmagine/types/Bundle.hpp
+++ b/src/rmagine_core/include/rmagine/types/Bundle.hpp
@@ -93,8 +93,6 @@ struct Bundle : public Tp...
         static constexpr bool value = has_type<F, elems>::type::value;
     };
 
-
-
     template <typename T, typename Tuple>
     struct Index;
 
diff --git a/src/rmagine_core/include/rmagine/types/Memory.hpp b/src/rmagine_core/include/rmagine/types/Memory.hpp
index 3b0279c5..ed7b1702 100644
--- a/src/rmagine_core/include/rmagine/types/Memory.hpp
+++ b/src/rmagine_core/include/rmagine/types/Memory.hpp
@@ -39,6 +39,7 @@
 #include <iostream>
 #include <cstring>
 #include <type_traits>
+#include <exception>
 
 #include <rmagine/types/shared_functions.h>
 
@@ -46,6 +47,15 @@ namespace rmagine {
 
 struct RAM;
 
+class MemoryResizeError : public std::runtime_error {
+public:
+  MemoryResizeError()
+  :std::runtime_error("rmagine: cannot resize memory view!")
+  {
+
+  }
+};
+
 template<typename DataT, typename MemT = RAM>
 class MemoryView {
 public:
@@ -126,6 +136,11 @@ class MemoryView {
         return m_size;
     }
 
+    // Shall we introduce this? 
+    // virtual void resize(size_t N) {
+    //     throw MemoryResizeError();
+    // }
+
     MemoryView<DataT, MemT> slice(size_t idx_start, size_t idx_end)
     {
         return MemoryView<DataT, MemT>(m_mem + idx_start, idx_end - idx_start);
@@ -173,6 +188,7 @@ class Memory : public MemoryView<DataT, MemT> {
 
     ~Memory();
 
+    // virtual void resize(size_t N);
     void resize(size_t N);
 
     // Copy for assignment of same MemT
diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.h b/src/rmagine_core/include/rmagine/types/mesh_types.h
index 210c7fc1..4e094a4b 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.h
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.h
@@ -43,6 +43,8 @@
 #define RMAGINE_TYPES_MESH_TYPES_H
 
 #include <rmagine/math/types.h>
+#include <tuple>
+#include <type_traits>
 
 namespace rmagine
 {
@@ -53,8 +55,24 @@ struct Face {
     unsigned int v0;
     unsigned int v1;
     unsigned int v2;
+
+    // Other access functions
+    // use with care! No out of range checks
+    RMAGINE_INLINE_FUNCTION
+    const unsigned int& operator[](const size_t& idx) const;
+
+    RMAGINE_INLINE_FUNCTION
+    unsigned int& operator[](const size_t& idx);
+
+    RMAGINE_INLINE_FUNCTION
+    constexpr size_t size() const
+    {
+      return 3;
+    }
 };
 
 } // namespace rmagine
 
+#include "mesh_types.tcc"
+
 #endif // RMAGINE_TYPES_MESH_TYPES_H
\ No newline at end of file
diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.tcc b/src/rmagine_core/include/rmagine/types/mesh_types.tcc
new file mode 100644
index 00000000..fa9d07c8
--- /dev/null
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.tcc
@@ -0,0 +1,18 @@
+#include "mesh_types.h"
+
+namespace rmagine
+{
+
+RMAGINE_INLINE_FUNCTION
+const unsigned int& Face::operator[](const size_t& idx) const
+{
+    return *((&v0)+idx);
+}
+
+RMAGINE_INLINE_FUNCTION
+unsigned int& Face::operator[](const size_t& idx)
+{
+    return *((&v0)+idx);
+}
+
+} // namespace rmagine
diff --git a/src/rmagine_core/include/rmagine/types/sensor_models.h b/src/rmagine_core/include/rmagine/types/sensor_models.h
index 4c69ee3c..d54a15bd 100644
--- a/src/rmagine_core/include/rmagine/types/sensor_models.h
+++ b/src/rmagine_core/include/rmagine/types/sensor_models.h
@@ -148,6 +148,12 @@ struct SphericalModel
         return getWidth() * getHeight();
     }
 
+    RMAGINE_INLINE_FUNCTION
+    uint32_t getSize() const 
+    {
+        return getWidth() * getHeight();
+    }
+
     RMAGINE_INLINE_FUNCTION
     float getPhi(uint32_t phi_id) const
     {
@@ -179,6 +185,32 @@ struct SphericalModel
     {
         return phi_id * theta.size + theta_id;
     }
+
+    RMAGINE_INLINE_FUNCTION
+    Vector2u getPixelCoord(uint32_t buffer_id) const
+    {
+        return {buffer_id % theta.size, buffer_id / theta.size};
+    }
+
+    // slice horizontal line. vertical is not currently not possible because of memory layout
+    template<typename DataT, typename MemT>
+    MemoryView<DataT, MemT> getRow(const MemoryView<DataT, MemT>& mem, uint32_t vid) const 
+    {
+        return mem.slice(vid * getWidth(), (vid+1) * getWidth());
+    }
+
+    // for RAM we can access single elements of a buffer
+    template<typename DataT>
+    DataT& getPixelValue(MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
+
+    template<typename DataT>
+    DataT getPixelValue(const MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
 };
 
 using LiDARModel = SphericalModel;
@@ -215,6 +247,12 @@ struct PinholeModel {
         return getWidth() * getHeight();
     }
 
+    RMAGINE_INLINE_FUNCTION
+    uint32_t getSize() const 
+    {
+        return getWidth() * getHeight();
+    }
+
     RMAGINE_INLINE_FUNCTION
     Vector getDirectionOptical(uint32_t vid, uint32_t hid) const
     {
@@ -250,6 +288,31 @@ struct PinholeModel {
         return vid * width + hid;
     }
 
+    RMAGINE_INLINE_FUNCTION
+    Vector2u getPixelCoord(uint32_t buffer_id) const
+    {
+        return {buffer_id % width, buffer_id / width};
+    }
+
+    // slice horizontal line. vertical is not currently not possible because of memory layout
+    template<typename DataT, typename MemT>
+    MemoryView<DataT, MemT> getRow(const MemoryView<DataT, MemT>& mem, uint32_t vid) const 
+    {
+        return mem.slice(vid * getWidth(), (vid+1) * getWidth());
+    }
+
+    // for RAM we can access single elements of a buffer
+    template<typename DataT>
+    DataT& getPixelValue(MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
+
+    template<typename DataT>
+    DataT getPixelValue(const MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
 };
 // Distortion? Fisheye / radial-tangential ? 
 
@@ -257,20 +320,20 @@ using CameraModel = PinholeModel;
 using DepthCameraModel = PinholeModel;
 
 // TODO: distortion
-struct RadialTangentialDistortion {
-    // TODO
-};
+// struct RadialTangentialDistortion {
+//     // TODO
+// };
 
-struct FisheyeDistortion {
+// struct FisheyeDistortion {
 
-};
+// };
 
 
-struct CylindricModel {
-    static constexpr char name[] = "Cylinder";
-    // TODO
+// struct CylindricModel {
+//     static constexpr char name[] = "Cylinder";
+//     // TODO
     
-};
+// };
 
 template<typename MemT>
 struct O1DnModel_ {
@@ -298,15 +361,15 @@ struct O1DnModel_ {
     }
 
     RMAGINE_INLINE_FUNCTION
-    uint32_t size() const 
+    uint32_t getSize() const 
     {
         return getWidth() * getHeight();
     }
 
     RMAGINE_INLINE_FUNCTION
-    uint32_t getBufferId(uint32_t vid, uint32_t hid) const 
+    uint32_t size() const 
     {
-        return vid * getWidth() + hid;
+        return getWidth() * getHeight();
     }
 
     RMAGINE_INLINE_FUNCTION
@@ -320,6 +383,38 @@ struct O1DnModel_ {
     {
         return dirs[getBufferId(vid, hid)];
     }
+
+    RMAGINE_INLINE_FUNCTION
+    uint32_t getBufferId(uint32_t vid, uint32_t hid) const 
+    {
+        return vid * getWidth() + hid;
+    }
+
+    RMAGINE_INLINE_FUNCTION
+    Vector2u getPixelCoord(uint32_t buffer_id) const
+    {
+        return {buffer_id % width, buffer_id / width};
+    }
+
+    // slice horizontal line. vertical is not currently not possible because of memory layout
+    template<typename DataT, typename MemT_>
+    MemoryView<DataT, MemT_> getRow(const MemoryView<DataT, MemT_>& mem, uint32_t vid) const 
+    {
+        return mem.slice(vid * getWidth(), (vid+1) * getWidth());
+    }
+
+    // for RAM we can access single elements of a buffer
+    template<typename DataT>
+    DataT& getPixelValue(MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
+
+    template<typename DataT>
+    DataT getPixelValue(const MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
 };
 
 using O1DnModel = O1DnModel_<RAM>;
@@ -351,16 +446,15 @@ struct OnDnModel_ {
     }
 
     RMAGINE_INLINE_FUNCTION
-    uint32_t size() const 
+    uint32_t getSize() const 
     {
         return getWidth() * getHeight();
     }
 
-
     RMAGINE_INLINE_FUNCTION
-    uint32_t getBufferId(uint32_t vid, uint32_t hid) const 
+    uint32_t size() const 
     {
-        return vid * getWidth() + hid;
+        return getWidth() * getHeight();
     }
 
     RMAGINE_INLINE_FUNCTION
@@ -375,11 +469,51 @@ struct OnDnModel_ {
         return dirs[getBufferId(vid, hid)];
     }
 
+    RMAGINE_INLINE_FUNCTION
+    uint32_t getBufferId(uint32_t vid, uint32_t hid) const 
+    {
+        return vid * getWidth() + hid;
+    }
+
+    RMAGINE_INLINE_FUNCTION
+    Vector2u getPixelCoord(uint32_t buffer_id) const
+    {
+        return {buffer_id % width, buffer_id / width};
+    }
+
+    // slice horizontal line. vertical is not currently not possible because of memory layout
+    template<typename DataT, typename MemT_>
+    MemoryView<DataT, MemT_> getRow(MemoryView<DataT, MemT_>& mem, uint32_t vid) const 
+    {
+        return mem.slice(vid * getWidth(), (vid+1) * getWidth());
+    }
+
+    // for CPU we can access single elements of a buffer
+    template<typename DataT>
+    DataT& getPixelValue(MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
+
+    template<typename DataT>
+    DataT getPixelValue(const MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
 };
 
 using OnDnModel = OnDnModel_<RAM>;
 
 
+template<typename ModelT, typename DataT, typename MemT>
+MemoryView<DataT, MemT> slice( 
+    const MemoryView<DataT, MemT>& mem,
+    const ModelT& model,
+    const uint32_t pose_id)
+{
+    return mem.slice(model.getSize() * pose_id, model.getSize() * (pose_id + 1));
+}
+
 } // namespace rmagine
 
 #endif // RMAGINE_TYPES_SENSOR_MODELS_H
\ No newline at end of file
diff --git a/src/rmagine_core/include/rmagine/types/shared_functions.h b/src/rmagine_core/include/rmagine/types/shared_functions.h
index a8e11785..c4bdb96d 100644
--- a/src/rmagine_core/include/rmagine/types/shared_functions.h
+++ b/src/rmagine_core/include/rmagine/types/shared_functions.h
@@ -44,9 +44,17 @@
 #ifdef __CUDA_ARCH__
 #define RMAGINE_FUNCTION __host__ __device__
 #define RMAGINE_INLINE_FUNCTION __inline__ __host__ __device__ 
+#define RMAGINE_HOST_FUNCTION __host__
+#define RMAGINE_INLINE_HOST_FUNCTION __inline__ __host__
+#define RMAGINE_DEVICE_FUNCTION __device__
+#define RMAGINE_INLINE_DEVICE_FUNCTION __inline__ __device__
 #else
 #define RMAGINE_FUNCTION
 #define RMAGINE_INLINE_FUNCTION inline
+#define RMAGINE_HOST_FUNCTION 
+#define RMAGINE_INLINE_HOST_FUNCTION inline
+#define RMAGINE_DEVICE_FUNCTION 
+#define RMAGINE_INLINE_DEVICE_FUNCTION inline
 #endif
 
 #endif // RMAGINE_TYPES_SHARED_FUNCTIONS_H
\ No newline at end of file
diff --git a/src/rmagine_core/src/math/linalg.cpp b/src/rmagine_core/src/math/linalg.cpp
index 739b24fb..148d1996 100644
--- a/src/rmagine_core/src/math/linalg.cpp
+++ b/src/rmagine_core/src/math/linalg.cpp
@@ -1,10 +1,15 @@
 #include "rmagine/math/linalg.h"
+#include "rmagine/types/Memory.hpp"
+#include <assert.h>
 
 #include <Eigen/Dense>
 
+#include "rmagine/math/math.h"
+
 namespace rmagine
 {
 
+RMAGINE_HOST_FUNCTION
 Matrix4x4 compose(const Transform& T, const Vector3& scale)
 {
     Matrix4x4 M;
@@ -19,6 +24,7 @@ Matrix4x4 compose(const Transform& T, const Vector3& scale)
     return M * S;
 }
 
+RMAGINE_HOST_FUNCTION
 Matrix4x4 compose(const Transform& T, const Matrix3x3& S)
 {
     Matrix4x4 M;
@@ -38,6 +44,7 @@ Matrix4x4 compose(const Transform& T, const Matrix3x3& S)
     return M * S_;
 }
 
+RMAGINE_HOST_FUNCTION
 void decompose(const Matrix4x4& M, Transform& T, Matrix3x3& S)
 {
     Eigen::Matrix4f Meig;
@@ -73,6 +80,7 @@ void decompose(const Matrix4x4& M, Transform& T, Matrix3x3& S)
     T.R.set(R);
 }
 
+RMAGINE_HOST_FUNCTION
 void decompose(const Matrix4x4& M, Transform& T, Vector3& scale)
 {
     Matrix3x3 S;
@@ -85,15 +93,1088 @@ void decompose(const Matrix4x4& M, Transform& T, Vector3& scale)
     scale.z = S(2,2);
 }
 
+RMAGINE_HOST_FUNCTION
 Quaternion polate(const Quaternion& A, const Quaternion& B, float fac)
 {
     return A * A.to(B).pow(fac);
 }
 
+RMAGINE_HOST_FUNCTION
 Transform polate(const Transform& A, const Transform& B, float fac)
 {
     return A * A.to(B).pow(fac);
 }
 
 
+RMAGINE_HOST_FUNCTION
+void svd(
+    const Matrix3x3& a, 
+    Matrix3x3& u,
+    Matrix3x3& w,
+    Matrix3x3& v)
+{
+    // TODO: test
+    constexpr unsigned int max_iterations = 20;
+    
+    // additional memory required
+    bool flag;
+    int its, j, jj;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    
+    Vector3 rv1 = Vector3::Zeros();
+    
+    g = s = scale = anorm = 0.0;
+    float eps = __FLT_EPSILON__;
+    u = a;
+
+    // FIRST PART
+
+    // i = 0;
+    // l = 2;
+    scale = fabs(u(0,0)) + fabs(u(1,0)) + fabs(u(2,0));
+    if(scale > 0.0)
+    {
+        u(0, 0) /= scale;
+        u(1, 0) /= scale;
+        u(2, 0) /= scale;
+
+        s = u(0,0) * u(0,0) + u(1,0) * u(1,0) + u(2,0) * u(2,0);
+        f = u(0,0);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+
+        u(0, 0) = f - g;
+
+        f = (u(0, 0) * u(0, 1) + u(1, 0) * u(1, 1) + u(2, 0) * u(2, 1)) / h;
+        u(0, 1) += f * u(0, 0);
+        u(1, 1) += f * u(1, 0);
+        u(2, 1) += f * u(2, 0);
+
+        f = (u(0, 0) * u(0, 2) + u(1, 0) * u(1, 2) + u(2, 0) * u(2, 2)) / h;
+        u(0, 2) += f * u(0, 0);
+        u(1, 2) += f * u(1, 0);
+        u(2, 2) += f * u(2, 0);
+
+        u(0, 0) *= scale;
+        u(1, 0) *= scale;
+        u(2, 0) *= scale;
+    }
+    
+    w(0, 0) = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(0,0)) + abs(u(0,1)) + abs(u(0,2));
+    
+    if(scale > 0.0)
+    {
+        u(0, 1) /= scale;
+        u(0, 2) /= scale;
+        s = u(0,1) * u(0,1) + u(0,2) * u(0,2);
+
+        f = u(0, 1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g-s;
+        u(0, 1) = f - g;
+
+        rv1.y = u(0, 1) / h;
+        rv1.z = u(0, 2) / h;
+
+        s = u(1,1) * u(0,1) + u(1,2) * u(0,2);
+        u(1, 1) += s * rv1.y;
+        u(1, 2) += s * rv1.z;
+    
+        s = u(2,1) * u(0,1) + u(2,2) * u(0,2);
+        u(2, 1) += s * rv1.y;
+        u(2, 2) += s * rv1.z;            
+
+        u(0, 1) *= scale;
+        u(0, 2) *= scale;
+    }
+    
+    anorm = fabs(w(0, 0));
+    // anorm = MAX(anorm, (fabs(w(0, 0)) + fabs(rv1.x))); // rv1.x is always 0 here, anorm too. fabs(X) >= 0
+    
+    // i = 1;
+    // l = 3;
+    rv1.y = scale * g;
+    g = 0.0;
+    scale = fabs(u(1, 1)) + fabs(u(2, 1));
+    
+    if(scale > 0.0)
+    {
+        u(1,1) /= scale;
+        u(2,1) /= scale;
+
+        s = u(1,1) * u(1,1) + u(2,1) * u(2,1);
+        f = u(1,1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+        u(1,1) = f-g;
+        
+        f = (u(1,1) * u(1,2) + u(2,1) * u(2,2)) / h;
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+        
+        u(1,1) *= scale;
+        u(2,1) *= scale;
+    }
+    
+    w(1, 1) = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(1,2));
+    if(scale > 0.0)
+    {
+        u(1,2) /= scale;
+        s = u(1,2) * u(1,2);
+        
+        f = u(1, 2);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+        u(1,2) = f - g;
+
+        rv1.z = u(1,2) / h;
+        s = u(2,2) * u(1,2);
+
+        u(2,2) += s * rv1.z;
+        u(1,2) *= scale;
+    }
+
+    anorm = MAX(anorm, (abs(w(1, 1)) + abs(rv1.y)));
+    
+    rv1.z = scale * g;
+
+    scale = abs(u(2, 2));
+    if(scale > 0.0) 
+    {
+        u(2, 2) /= scale;
+        s = u(2, 2) * u(2, 2);
+        f = u(2, 2);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+
+        u(2, 2) = f - g;
+        u(2, 2) *= scale;
+    }
+    
+    w(2, 2) = scale * g;
+    g = s = scale = 0.0;
+    
+    anorm = MAX(anorm, (abs(w(2, 2))+abs(rv1.z)));
+
+    // SECOND PART    
+    v(2, 2) = 1.0;
+    g = rv1.z;
+
+    // i = 1;
+    // l = 2;
+    if(fabs(g) > 0.0)
+    {
+        v(2,1) = (u(1,2) / u(1,2)) / g;
+        s = u(1,2) * v(2,2);
+        v(2,2) += s * v(2,1);
+    }
+    v(1,2) = 0.0;
+    v(2,1) = 0.0;
+    v(1,1) = 1.0;
+
+    g = rv1.y;
+
+    // l = 1;
+    // i = 0;
+    if(fabs(g) > 0.0)
+    {
+        v(1,0) = (u(0,1) / u(0,1)) / g;
+        v(2,0) = (u(0,2) / u(0,1)) / g;
+
+        s = u(0,1) * v(1,1) + u(0,2) * v(2,1);
+        v(1,1) += s * v(1,0);
+        v(2,1) += s * v(2,0);
+
+        s = u(0,1) * v(1,2) + u(0,2) * v(2,2);
+        v(1,2) += s * v(1,0);
+        v(2,2) += s * v(2,0);
+    }
+    v(0,1) = 0.0;
+    v(1,0) = 0.0;
+    v(0,2) = 0.0;
+    v(2,0) = 0.0;
+    v(0,0) = 1.0;
+    g = rv1.x;
+
+
+    // THIRD PART
+    
+    // i = 2;
+    // l = 3;
+    g = w(2, 2);
+    if(fabs(g) > 0.0)
+    {
+        u(2,2) /= g;
+    } else { 
+        // TODO(amock): shouldnt this be a large number?
+        u(2,2) = 0.0;   
+    }
+    u(2,2) += 1.0;
+    
+    // i = 1;
+    // l = 2;
+
+    g = w(1, 1);
+    u(1,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        g = 1.0/g;
+        s = u(2,1) * u(2,2);
+        f = (s/u(1,1)) * g;
+
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+    
+        u(1,1) *= g;
+        u(2,1) *= g;
+    } else {
+        u(1,1) = 0.0;
+        u(2,1) = 0.0;
+    }
+    u(1,1) += 1.0;
+    
+    // i = 0;
+    // l = 1;
+    g = w(0, 0);
+    u(0,1) = 0.0;
+    u(0,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        f = (u(1,0) * u(1,1) + u(2,0) * u(2,1)) / (g * u(0,0));
+        u(0,1) += f * u(0,0);
+        u(1,1) += f * u(1,0);
+        u(2,1) += f * u(2,0);
+
+        f = (u(1,0) * u(1,2) + u(2,0) * u(2,2)) / (g * u(0,0));
+        u(0,2) += f * u(0,0);
+        u(1,2) += f * u(1,0);
+        u(2,2) += f * u(2,0);
+
+        u(0,0) /= g;
+        u(1,0) /= g;
+        u(2,0) /= g;
+    } else {
+        u(0,0) = 0.0;
+        u(1,0) = 0.0;
+        u(2,0) = 0.0;
+    }
+    u(0,0) += 1.0;
+
+    int i, l;
+
+    // PART 4: Opti
+
+    // k = 2;
+    for(its=0; its<max_iterations; its++) 
+    {
+        // flag=true;
+        // l = 2;
+        // if(MIN(fabs(rv1.z), fabs(w(1,1))) > eps*anorm)
+        // {
+        //     l = 1;
+        //     if(MIN(fabs(rv1.y),abs(w(0,0))) > eps*anorm)
+        //     {
+        //         l = 0;
+        //     }
+        // }
+
+        flag=true;
+        l=2;
+        if(abs(rv1.z) <= eps*anorm)
+        {
+            flag=false;
+        }
+        else if(abs(w(1,1)) > eps*anorm)
+        {
+            l=1;
+            if(abs(rv1.y) <= eps*anorm) 
+            {
+                flag=false;
+            }
+            else if(abs(w(0,0)) > eps*anorm) 
+            {
+                l=0;
+                flag = false;
+            }
+        }
+        
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<3; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i, i);
+                h = PYTHAG(f,g);
+                w(i, i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<3; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w(2,2);
+        if(l == 2)
+        {
+            if(z < 0.0)
+            {
+                w(2, 2) = -z;
+                for (j=0; j<3; j++) 
+                {
+                    v(j,2) = -v(j,2);
+                }
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            // throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+        x = w(l,l);
+        y = w(1,1);
+        g = rv1.y;
+        h = rv1.z;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+        for (j=l; j<2; j++) 
+        {
+            i = j+1;
+            g = rv1[i];
+            y = w(i, i);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1[j] = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+            for(jj=0;jj<3;jj++)
+            {
+                x = v(jj,j);
+                z = v(jj,i);
+                v(jj,j) = x*c+z*s;
+                v(jj,i) = z*c-x*s;
+            }
+            z = PYTHAG(f,h);
+            w(j,j) = z;
+            if(z>0.0) 
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for (jj=0;jj<3;jj++)
+            {
+                y = u(jj,j);
+                z = u(jj,i);
+                u(jj,j) = y*c+z*s;
+                u(jj,i) = z*c-y*s;
+            }
+        }
+        rv1[l] = 0.f;
+        rv1.z = f;
+        w(2,2) = x;
+    }
+
+
+    for(its=0; its<max_iterations; its++) 
+    {
+        flag=true;
+        l=1;
+        if(abs(rv1.y) <= eps*anorm) 
+        {
+            flag=false;
+        }
+        else if(abs(w(0,0)) > eps*anorm) 
+        {
+            l=0;
+            flag = false;
+        }
+
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<2; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i,i);
+                h = PYTHAG(f,g);
+                w(i, i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<3; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w(1,1);
+        if(l == 1)
+        {
+            if(z < 0.0)
+            {
+                w(1,1) = -z;
+                for (j=0; j<3; j++) 
+                {
+                    v(j,1) = -v(j,1);
+                }
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            // throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+
+        x = w(l, l);
+        y = w(0, 0);
+        g = rv1.x;
+        h = rv1.y;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+
+        if(l == 0)
+        {
+            g = rv1.y;
+            y = w(1,1);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1.x = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+
+
+            x = v(0,0);
+            z = v(0,1);
+            v(0,0) = x*c+z*s;
+            v(0,1) = z*c-x*s;
+
+            x = v(1,0);
+            z = v(1,1);
+            v(1,0) = x*c+z*s;
+            v(1,1) = z*c-x*s;
+
+            x = v(2,0);
+            z = v(2,1);
+            v(2,0) = x*c+z*s;
+            v(2,1) = z*c-x*s;
+
+
+            z = PYTHAG(f,h);
+            w(0,0) = z;
+            if(z>0.f)
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for(jj=0; jj<3; jj++)
+            {
+                y = u(jj,0);
+                z = u(jj,1);
+                u(jj,0) = y*c+z*s;
+                u(jj,1) = z*c-y*s;
+            }
+        }
+        
+        rv1[l] = 0.f;
+        rv1.y = f;
+        w(1,1) = x;
+    }
+
+
+    z = w(0,0);
+    if (z < 0.0)
+    {
+        w(0,0) = -z;
+        v(0,0) = -v(0,0);
+        v(1,0) = -v(1,0);
+        v(2,0) = -v(2,0);
+    }
+}
+
+RMAGINE_HOST_FUNCTION
+void svd(
+    const Matrix3x3& a, 
+    Matrix3x3& u,
+    Vector3& w,
+    Matrix3x3& v)
+{
+    // TODO: test
+    constexpr unsigned int m = 3;
+    constexpr unsigned int n = 3;
+    constexpr unsigned int max_iterations = 20;
+    
+    // additional memory required
+    bool flag;
+    int its, j, jj, k, nm;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    
+    Vector3 rv1 = Vector3::Zeros();
+    
+    g = s = scale = anorm = 0.0;
+    float eps = std::numeric_limits<float>::epsilon();
+    u = a;
+
+    // FIRST PART
+
+    // i = 0;
+    // l = 2;
+    scale = fabs(u(0,0)) + fabs(u(1,0)) + fabs(u(2,0));
+    if(scale > 0.0)
+    {
+        u(0, 0) /= scale;
+        u(1, 0) /= scale;
+        u(2, 0) /= scale;
+
+        s = u(0,0) * u(0,0) + u(1,0) * u(1,0) + u(2,0) * u(2,0);
+        f = u(0,0);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+
+        u(0, 0) = f - g;
+
+        f = (u(0, 0) * u(0, 1) + u(1, 0) * u(1, 1) + u(2, 0) * u(2, 1)) / h;
+        u(0, 1) += f * u(0, 0);
+        u(1, 1) += f * u(1, 0);
+        u(2, 1) += f * u(2, 0);
+
+        f = (u(0, 0) * u(0, 2) + u(1, 0) * u(1, 2) + u(2, 0) * u(2, 2)) / h;
+        u(0, 2) += f * u(0, 0);
+        u(1, 2) += f * u(1, 0);
+        u(2, 2) += f * u(2, 0);
+
+        u(0, 0) *= scale;
+        u(1, 0) *= scale;
+        u(2, 0) *= scale;
+    }
+    
+    w.x = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(0,0)) + abs(u(0,1)) + abs(u(0,2));
+    
+    if(scale > 0.0)
+    {
+        u(0, 1) /= scale;
+        u(0, 2) /= scale;
+        s = u(0,1) * u(0,1) + u(0,2) * u(0,2);
+
+        f = u(0, 1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g-s;
+        u(0, 1) = f - g;
+
+        rv1.y = u(0, 1) / h;
+        rv1.z = u(0, 2) / h;
+
+        s = u(1,1) * u(0,1) + u(1,2) * u(0,2);
+        u(1, 1) += s * rv1.y;
+        u(1, 2) += s * rv1.z;
+    
+        s = u(2,1) * u(0,1) + u(2,2) * u(0,2);
+        u(2, 1) += s * rv1.y;
+        u(2, 2) += s * rv1.z;            
+
+        u(0, 1) *= scale;
+        u(0, 2) *= scale;
+    }
+    
+    anorm = fabs(w.x);
+    // anorm = MAX(anorm, (fabs(w(0, 0)) + fabs(rv1.x))); // rv1.x is always 0 here, anorm too. fabs(X) >= 0
+    
+    // i = 1;
+    // l = 3;
+    rv1.y = scale * g;
+    g = 0.0;
+    scale = fabs(u(1, 1)) + fabs(u(2, 1));
+    
+    if(scale > 0.0)
+    {
+        u(1,1) /= scale;
+        u(2,1) /= scale;
+
+        s = u(1,1) * u(1,1) + u(2,1) * u(2,1);
+        f = u(1,1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+        u(1,1) = f-g;
+        
+        f = (u(1,1) * u(1,2) + u(2,1) * u(2,2)) / h;
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+        
+        u(1,1) *= scale;
+        u(2,1) *= scale;
+    }
+    
+    w.y = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(1,2));
+    if(scale > 0.0)
+    {
+        u(1,2) /= scale;
+        s = u(1,2) * u(1,2);
+        
+        f = u(1, 2);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+        u(1,2) = f - g;
+
+        rv1.z = u(1,2) / h;
+        s = u(2,2) * u(1,2);
+
+        u(2,2) += s * rv1.z;
+        u(1,2) *= scale;
+    }
+
+    anorm = MAX(anorm, (abs(w.y) + abs(rv1.y)));
+    
+    rv1.z = scale * g;
+
+    scale = abs(u(2, 2));
+    if(scale > 0.0) 
+    {
+        u(2, 2) /= scale;
+        s = u(2, 2) * u(2, 2);
+        f = u(2, 2);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+
+        u(2, 2) = f - g;
+        u(2, 2) *= scale;
+    }
+    
+    w.z = scale * g;
+    g = s = scale = 0.0;
+    
+    anorm = MAX(anorm, (abs(w.z)+abs(rv1.z)));
+
+    // SECOND PART    
+    v(2, 2) = 1.0;
+    g = rv1.z;
+
+    // i = 1;
+    // l = 2;
+    if(fabs(g) > 0.0)
+    {
+        v(2,1) = (u(1,2) / u(1,2)) / g;
+        s = u(1,2) * v(2,2);
+        v(2,2) += s * v(2,1);
+    }
+    v(1,2) = 0.0;
+    v(2,1) = 0.0;
+    v(1,1) = 1.0;
+
+    g = rv1.y;
+
+    // l = 1;
+    // i = 0;
+    if(fabs(g) > 0.0)
+    {
+        v(1,0) = (u(0,1) / u(0,1)) / g;
+        v(2,0) = (u(0,2) / u(0,1)) / g;
+
+        s = u(0,1) * v(1,1) + u(0,2) * v(2,1);
+        v(1,1) += s * v(1,0);
+        v(2,1) += s * v(2,0);
+
+        s = u(0,1) * v(1,2) + u(0,2) * v(2,2);
+        v(1,2) += s * v(1,0);
+        v(2,2) += s * v(2,0);
+    }
+    v(0,1) = 0.0;
+    v(1,0) = 0.0;
+    v(0,2) = 0.0;
+    v(2,0) = 0.0;
+    v(0,0) = 1.0;
+    g = rv1.x;
+
+
+    // THIRD PART
+    
+    // i = 2;
+    // l = 3;
+    g = w.z;
+    if(fabs(g) > 0.0)
+    {
+        u(2,2) /= g;
+    } else { 
+        // TODO(amock): shouldnt this be a large number?
+        u(2,2) = 0.0;   
+    }
+    u(2,2) += 1.0;
+    
+    // i = 1;
+    // l = 2;
+
+    g = w.y;
+    u(1,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        g = 1.0/g;
+        s = u(2,1) * u(2,2);
+        f = (s/u(1,1)) * g;
+
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+    
+        u(1,1) *= g;
+        u(2,1) *= g;
+    } else {
+        u(1,1) = 0.0;
+        u(2,1) = 0.0;
+    }
+    u(1,1) += 1.0;
+    
+    // i = 0;
+    // l = 1;
+    g = w.x;
+    u(0,1) = 0.0;
+    u(0,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        f = (u(1,0) * u(1,1) + u(2,0) * u(2,1)) / (g * u(0,0));
+        u(0,1) += f * u(0,0);
+        u(1,1) += f * u(1,0);
+        u(2,1) += f * u(2,0);
+
+        f = (u(1,0) * u(1,2) + u(2,0) * u(2,2)) / (g * u(0,0));
+        u(0,2) += f * u(0,0);
+        u(1,2) += f * u(1,0);
+        u(2,2) += f * u(2,0);
+
+        u(0,0) /= g;
+        u(1,0) /= g;
+        u(2,0) /= g;
+    } else {
+        u(0,0) = 0.0;
+        u(1,0) = 0.0;
+        u(2,0) = 0.0;
+    }
+    u(0,0) += 1.0;
+
+    int i, l;
+
+    // PART 4: Opti
+
+    // k = 2;
+    for(its=0; its<max_iterations; its++) 
+    {
+        // flag=true;
+        // l = 2;
+        // if(MIN(fabs(rv1.z), fabs(w(1,1))) > eps*anorm)
+        // {
+        //     l = 1;
+        //     if(MIN(fabs(rv1.y),abs(w(0,0))) > eps*anorm)
+        //     {
+        //         l = 0;
+        //     }
+        // }
+
+        flag=true;
+        l=2;
+        if(abs(rv1.z) <= eps*anorm)
+        {
+            flag=false;
+        }
+        else if(abs(w.y) > eps*anorm)
+        {
+            l=1;
+            if(abs(rv1.y) <= eps*anorm) 
+            {
+                flag=false;
+            }
+            else if(abs(w.x) > eps*anorm) 
+            {
+                l=0;
+                flag = false;
+            }
+        }
+        
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<3; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i);
+                h = PYTHAG(f,g);
+                w(i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<m; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w.z;
+        if(l == 2)
+        {
+            if(z < 0.0)
+            {
+                w.z = -z;
+                v(0,2) = -v(0,2);
+                v(1,2) = -v(1,2);
+                v(2,2) = -v(2,2);
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+        x = w(l);
+        y = w.y;
+        g = rv1.y;
+        h = rv1.z;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+        for (j=l; j<2; j++)
+        {
+            i = j+1;
+            g = rv1[i];
+            y = w(i);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1[j] = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+            for(jj=0;jj<3;jj++)
+            {
+                x = v(jj,j);
+                z = v(jj,i);
+                v(jj,j) = x*c+z*s;
+                v(jj,i) = z*c-x*s;
+            }
+            z = PYTHAG(f,h);
+            w(j) = z;
+            if(z>0.0) 
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for (jj=0;jj<m;jj++)
+            {
+                y = u(jj,j);
+                z = u(jj,i);
+                u(jj,j) = y*c+z*s;
+                u(jj,i) = z*c-y*s;
+            }
+        }
+        rv1[l] = 0.f;
+        rv1.z = f;
+        w.z = x;
+    }
+
+
+    for(its=0; its<max_iterations; its++) 
+    {
+        flag=true;
+        l=1;
+        if(abs(rv1.y) <= eps*anorm) 
+        {
+            flag=false;
+        }
+        else if(abs(w.x) > eps*anorm) 
+        {
+            l=0;
+            flag = false;
+        }
+
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<2; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i);
+                h = PYTHAG(f,g);
+                w(i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<m; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w.y;
+        if(l == 1)
+        {
+            if(z < 0.0)
+            {
+                w.y = -z;
+                for (j=0; j<3; j++) 
+                {
+                    v(j,1) = -v(j,1);
+                }
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+
+        x = w(l);
+        y = w.x;
+        g = rv1.x;
+        h = rv1.y;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+
+        if(l == 0)
+        {
+            g = rv1.y;
+            y = w.y;
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1.x = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+
+
+            x = v(0,0);
+            z = v(0,1);
+            v(0,0) = x*c+z*s;
+            v(0,1) = z*c-x*s;
+
+            x = v(1,0);
+            z = v(1,1);
+            v(1,0) = x*c+z*s;
+            v(1,1) = z*c-x*s;
+
+            x = v(2,0);
+            z = v(2,1);
+            v(2,0) = x*c+z*s;
+            v(2,1) = z*c-x*s;
+
+
+            z = PYTHAG(f,h);
+            w.x = z;
+            if(z>0.f)
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for(jj=0; jj<3; jj++)
+            {
+                y = u(jj,0);
+                z = u(jj,1);
+                u(jj,0) = y*c+z*s;
+                u(jj,1) = z*c-y*s;
+            }
+        }
+        
+        rv1[l] = 0.f;
+        rv1.y = f;
+        w.y = x;
+    }
+
+
+    z = w.x;
+    if (z < 0.0)
+    {
+        w.x = -z;
+        v(0,0) = -v(0,0);
+        v(1,0) = -v(1,0);
+        v(2,0) = -v(2,0);
+    }
+}
+
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_core/src/math/math.cpp b/src/rmagine_core/src/math/math.cpp
index 9323175d..42de8ae9 100644
--- a/src/rmagine_core/src/math/math.cpp
+++ b/src/rmagine_core/src/math/math.cpp
@@ -4,6 +4,8 @@
 
 #include <cassert>
 
+#include "rmagine/math/linalg.h"
+
 namespace rmagine {
 
 
@@ -697,5 +699,39 @@ Memory<Matrix3x3, RAM> cov(
     return C;
 }
 
+/**
+ * @brief decompose A = UWV* using singular value decomposition
+ */
+void svd(
+    const MemoryView<Matrix3x3, RAM>& As,
+    MemoryView<Matrix3x3, RAM>& Us,
+    MemoryView<Matrix3x3, RAM>& Ws,
+    MemoryView<Matrix3x3, RAM>& Vs)
+{
+    #pragma omp parallel for
+    for(size_t i=0; i<As.size(); i++)
+    {
+        svd(As[i], Us[i], Ws[i], Vs[i]);
+    }
+}
+
+/**
+ * @brief decompose A = UWV* using singular value decomposition
+ * 
+ * w is a vector which is the diagonal of matrix W
+ */
+void svd(
+    const MemoryView<Matrix3x3, RAM>& As,
+    MemoryView<Matrix3x3, RAM>& Us,
+    MemoryView<Vector3, RAM>& ws,
+    MemoryView<Matrix3x3, RAM>& Vs)
+{
+    #pragma omp parallel for
+    for(size_t i=0; i<As.size(); i++)
+    {
+        svd(As[i], Us[i], ws[i], Vs[i]);
+    }
+}
+
 
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_core/src/types/mesh_types.cpp b/src/rmagine_core/src/types/mesh_types.cpp
new file mode 100644
index 00000000..dbc2e299
--- /dev/null
+++ b/src/rmagine_core/src/types/mesh_types.cpp
@@ -0,0 +1,6 @@
+#include "rmagine/types/mesh_types.h"
+
+namespace rmagine
+{
+
+} // namespace rmagine
diff --git a/src/rmagine_cuda/CMakeLists.txt b/src/rmagine_cuda/CMakeLists.txt
index 6e8beb34..115abcbb 100644
--- a/src/rmagine_cuda/CMakeLists.txt
+++ b/src/rmagine_cuda/CMakeLists.txt
@@ -1,10 +1,13 @@
 message(STATUS "Building Cuda dependend code. Library: rmagine-cuda")
 
+# add_library(rmagine-cuda SHARED
+
 set(RMAGINE_CUDA_SRCS
     src/types/MemoryCuda.cu
     # math
     src/math/math.cu
     src/math/math_batched.cu
+    src/math/linalg.cu
     src/math/SVDCuda.cpp
     # util
     src/util/cuda/CudaContext.cpp
@@ -58,6 +61,7 @@ set_target_properties(rmagine-cuda
         EXPORT_NAME cuda
         SOVERSION ${rmagine_VERSION_MAJOR}
         VERSION ${rmagine_VERSION}
+        CUDA_SEPARABLE_COMPILATION ON
         # CUDA_STANDARD 17 -> only works for 
         # CXX_STANDARD 17
 )
diff --git a/src/rmagine_cuda/include/rmagine/math/math.cuh b/src/rmagine_cuda/include/rmagine/math/math.cuh
index 7439487f..645085cb 100644
--- a/src/rmagine_cuda/include/rmagine/math/math.cuh
+++ b/src/rmagine_cuda/include/rmagine/math/math.cuh
@@ -543,6 +543,28 @@ Memory<Matrix3x3, VRAM_CUDA> cov(
     const MemoryView<Vector, VRAM_CUDA>& v2
 );
 
+/**
+ * @brief decompose A = UWV* using singular value decomposition
+ */
+void svd(
+    const MemoryView<Matrix3x3, VRAM_CUDA>& As,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Us,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Ws,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Vs
+);
+
+/**
+ * @brief decompose A = UWV* using singular value decomposition
+ * 
+ * w is a vector which is the diagonal of matrix W
+ */
+void svd(
+    const MemoryView<Matrix3x3, VRAM_CUDA>& As,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Us,
+    MemoryView<Vector3, VRAM_CUDA>& ws,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Vs
+);
+
 
 } // namespace rmagine
 
diff --git a/src/rmagine_cuda/include/rmagine/util/cuda/CudaDebug.hpp b/src/rmagine_cuda/include/rmagine/util/cuda/CudaDebug.hpp
index 2bd29b81..faa48553 100644
--- a/src/rmagine_cuda/include/rmagine/util/cuda/CudaDebug.hpp
+++ b/src/rmagine_cuda/include/rmagine/util/cuda/CudaDebug.hpp
@@ -52,4 +52,16 @@ void cudaAssert(
    const char* func,
    int line);
 
+#ifdef NDEBUG
+    #define RM_CUDA_DEBUG() 
+#else  // NDEBUG
+    #define RM_CUDA_DEBUG() \
+    cudaError_t err = cudaGetLastError(); \
+    if (err != cudaSuccess)  \
+    { \
+        printf("Error: %s\n", cudaGetErrorString(err)); \
+        throw std::runtime_error(cudaGetErrorString(err)); \
+    }
+#endif // defined NDEBUG
+
 #endif // RMAGINE_UTIL_CUDA_DEBUG_HPP
\ No newline at end of file
diff --git a/src/rmagine_cuda/src/math/linalg.cu b/src/rmagine_cuda/src/math/linalg.cu
new file mode 100644
index 00000000..17c846da
--- /dev/null
+++ b/src/rmagine_cuda/src/math/linalg.cu
@@ -0,0 +1,1085 @@
+#include <rmagine/math/types.h>
+#include <rmagine/math/linalg.h>
+#include <cuda_runtime.h>
+
+namespace rmagine
+{
+
+RMAGINE_DEVICE_FUNCTION
+void svd(
+    const Matrix3x3& a, 
+    Matrix3x3& u,
+    Matrix3x3& w,
+    Matrix3x3& v)
+{
+    // printf("SVDD\n");
+
+    
+    // TODO: test
+    const unsigned int max_iterations = 20;
+    
+    // additional memory required
+    bool flag;
+    int its, j, jj;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    
+    Vector3 rv1;
+    rv1.x = 0.0;
+    rv1.y = 0.0;
+    rv1.z = 0.0;
+
+    
+    
+    g = s = scale = anorm = 0.0;
+    const float eps = __FLT_EPSILON__;
+    u = a;
+
+    // FIRST PART
+
+    // i = 0;
+    // l = 2;
+    scale = fabs(u(0,0)) + fabs(u(1,0)) + fabs(u(2,0));
+    if(scale > 0.0)
+    {
+        u(0, 0) /= scale;
+        u(1, 0) /= scale;
+        u(2, 0) /= scale;
+
+        s = u(0,0) * u(0,0) + u(1,0) * u(1,0) + u(2,0) * u(2,0);
+        f = u(0,0);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+
+        u(0, 0) = f - g;
+
+        f = (u(0, 0) * u(0, 1) + u(1, 0) * u(1, 1) + u(2, 0) * u(2, 1)) / h;
+        u(0, 1) += f * u(0, 0);
+        u(1, 1) += f * u(1, 0);
+        u(2, 1) += f * u(2, 0);
+
+        f = (u(0, 0) * u(0, 2) + u(1, 0) * u(1, 2) + u(2, 0) * u(2, 2)) / h;
+        u(0, 2) += f * u(0, 0);
+        u(1, 2) += f * u(1, 0);
+        u(2, 2) += f * u(2, 0);
+
+        u(0, 0) *= scale;
+        u(1, 0) *= scale;
+        u(2, 0) *= scale;
+    }
+    
+    w(0, 0) = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(0,0)) + abs(u(0,1)) + abs(u(0,2));
+    
+    if(scale > 0.0)
+    {
+        u(0, 1) /= scale;
+        u(0, 2) /= scale;
+        s = u(0,1) * u(0,1) + u(0,2) * u(0,2);
+
+        f = u(0, 1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g-s;
+        u(0, 1) = f - g;
+
+        rv1.y = u(0, 1) / h;
+        rv1.z = u(0, 2) / h;
+
+        s = u(1,1) * u(0,1) + u(1,2) * u(0,2);
+        u(1, 1) += s * rv1.y;
+        u(1, 2) += s * rv1.z;
+    
+        s = u(2,1) * u(0,1) + u(2,2) * u(0,2);
+        u(2, 1) += s * rv1.y;
+        u(2, 2) += s * rv1.z;            
+
+        u(0, 1) *= scale;
+        u(0, 2) *= scale;
+    }
+    
+    anorm = fabs(w(0, 0));
+    // anorm = MAX(anorm, (fabs(w(0, 0)) + fabs(rv1.x))); // rv1.x is always 0 here, anorm too. fabs(X) >= 0
+    
+    // i = 1;
+    // l = 3;
+    rv1.y = scale * g;
+    g = 0.0;
+    scale = fabs(u(1, 1)) + fabs(u(2, 1));
+    
+    if(scale > 0.0)
+    {
+        u(1,1) /= scale;
+        u(2,1) /= scale;
+
+        s = u(1,1) * u(1,1) + u(2,1) * u(2,1);
+        f = u(1,1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+        u(1,1) = f-g;
+        
+        f = (u(1,1) * u(1,2) + u(2,1) * u(2,2)) / h;
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+        
+        u(1,1) *= scale;
+        u(2,1) *= scale;
+    }
+    
+    w(1, 1) = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(1,2));
+    if(scale > 0.0)
+    {
+        u(1,2) /= scale;
+        s = u(1,2) * u(1,2);
+        
+        f = u(1, 2);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+        u(1,2) = f - g;
+
+        rv1.z = u(1,2) / h;
+        s = u(2,2) * u(1,2);
+
+        u(2,2) += s * rv1.z;
+        u(1,2) *= scale;
+    }
+
+    anorm = MAX(anorm, (abs(w(1, 1)) + abs(rv1.y)));
+    
+    
+
+    rv1.z = scale * g;
+
+    scale = abs(u(2, 2));
+    if(scale > 0.0) 
+    {
+        u(2, 2) /= scale;
+        s = u(2, 2) * u(2, 2);
+        f = u(2, 2);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+
+        u(2, 2) = f - g;
+        u(2, 2) *= scale;
+    }
+    
+    w(2, 2) = scale * g;
+    g = s = scale = 0.0;
+    
+    anorm = MAX(anorm, (abs(w(2, 2))+abs(rv1.z)));
+
+    // SECOND PART    
+    v(2, 2) = 1.0;
+    g = rv1.z;
+
+    // i = 1;
+    // l = 2;
+    if(fabs(g) > 0.0)
+    {
+        v(2,1) = (u(1,2) / u(1,2)) / g;
+        s = u(1,2) * v(2,2);
+        v(2,2) += s * v(2,1);
+    }
+    v(1,2) = 0.0;
+    v(2,1) = 0.0;
+    v(1,1) = 1.0;
+
+    g = rv1.y;
+
+    // l = 1;
+    // i = 0;
+    if(fabs(g) > 0.0)
+    {
+        v(1,0) = (u(0,1) / u(0,1)) / g;
+        v(2,0) = (u(0,2) / u(0,1)) / g;
+
+        s = u(0,1) * v(1,1) + u(0,2) * v(2,1);
+        v(1,1) += s * v(1,0);
+        v(2,1) += s * v(2,0);
+
+        s = u(0,1) * v(1,2) + u(0,2) * v(2,2);
+        v(1,2) += s * v(1,0);
+        v(2,2) += s * v(2,0);
+    }
+    v(0,1) = 0.0;
+    v(1,0) = 0.0;
+    v(0,2) = 0.0;
+    v(2,0) = 0.0;
+    v(0,0) = 1.0;
+    g = rv1.x;
+
+
+    // THIRD PART
+    
+    // i = 2;
+    // l = 3;
+    g = w(2, 2);
+    if(fabs(g) > 0.0)
+    {
+        u(2,2) /= g;
+    } else { 
+        // TODO(amock): shouldnt this be a large number?
+        u(2,2) = 0.0;   
+    }
+    u(2,2) += 1.0;
+    
+    // i = 1;
+    // l = 2;
+
+    g = w(1, 1);
+    u(1,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        g = 1.0/g;
+        s = u(2,1) * u(2,2);
+        f = (s/u(1,1)) * g;
+
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+    
+        u(1,1) *= g;
+        u(2,1) *= g;
+    } else {
+        u(1,1) = 0.0;
+        u(2,1) = 0.0;
+    }
+    u(1,1) += 1.0;
+    
+    // i = 0;
+    // l = 1;
+    g = w(0, 0);
+    u(0,1) = 0.0;
+    u(0,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        f = (u(1,0) * u(1,1) + u(2,0) * u(2,1)) / (g * u(0,0));
+        u(0,1) += f * u(0,0);
+        u(1,1) += f * u(1,0);
+        u(2,1) += f * u(2,0);
+
+        f = (u(1,0) * u(1,2) + u(2,0) * u(2,2)) / (g * u(0,0));
+        u(0,2) += f * u(0,0);
+        u(1,2) += f * u(1,0);
+        u(2,2) += f * u(2,0);
+
+        u(0,0) /= g;
+        u(1,0) /= g;
+        u(2,0) /= g;
+    } else {
+        u(0,0) = 0.0;
+        u(1,0) = 0.0;
+        u(2,0) = 0.0;
+    }
+    u(0,0) += 1.0;
+
+    int i, l;
+
+    
+
+    // PART 4: Opti
+
+    // k = 2;
+    for(int its=0; its<max_iterations; its++) 
+    {
+        // printf("First Iter %d/%d\n", its, max_iterations);
+        // return;
+
+        flag=true;
+        l=2;
+        if(fabs(rv1.z) <= eps*anorm)
+        {
+            flag=false;
+        }
+        else if(fabs(w(1,1)) > eps*anorm)
+        {
+            l=1;
+            if(fabs(rv1.y) <= eps*anorm) 
+            {
+                flag=false;
+            }
+            else if(fabs(w(0,0)) > eps*anorm) 
+            {
+                l=0;
+                flag = false;
+            }
+        }
+        
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<3; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(fabs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i, i);
+                h = PYTHAG(f,g);
+                w(i, i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<3; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w(2,2);
+        if(l == 2)
+        {
+            if(z < 0.0)
+            {
+                w(2, 2) = -z;
+                for (j=0; j<3; j++) 
+                {
+                    v(j,2) = -v(j,2);
+                }
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            // throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+        x = w(l,l);
+        y = w(1,1);
+        g = rv1.y;
+        h = rv1.z;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+        for (j=l; j<2; j++) 
+        {
+            i = j+1;
+            g = rv1[i];
+            y = w(i, i);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1[j] = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+            for(jj=0;jj<3;jj++)
+            {
+                x = v(jj,j);
+                z = v(jj,i);
+                v(jj,j) = x*c+z*s;
+                v(jj,i) = z*c-x*s;
+            }
+            z = PYTHAG(f,h);
+            w(j,j) = z;
+            if(z>0.0) 
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for (jj=0;jj<3;jj++)
+            {
+                y = u(jj,j);
+                z = u(jj,i);
+                u(jj,j) = y*c+z*s;
+                u(jj,i) = z*c-y*s;
+            }
+        }
+        rv1[l] = 0.f;
+        rv1.z = f;
+        w(2,2) = x;
+    }
+
+    
+
+    for(its=0; its<max_iterations; its++) 
+    {
+        flag=true;
+        l=1;
+        if(abs(rv1.y) <= eps*anorm) 
+        {
+            flag=false;
+        }
+        else if(abs(w(0,0)) > eps*anorm) 
+        {
+            l=0;
+            flag = false;
+        }
+
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<2; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i,i);
+                h = PYTHAG(f,g);
+                w(i, i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<3; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w(1,1);
+        if(l == 1)
+        {
+            if(z < 0.0)
+            {
+                w(1,1) = -z;
+                for (j=0; j<3; j++) 
+                {
+                    v(j,1) = -v(j,1);
+                }
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            // throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+
+        x = w(l, l);
+        y = w(0, 0);
+        g = rv1.x;
+        h = rv1.y;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+
+        if(l == 0)
+        {
+            g = rv1.y;
+            y = w(1,1);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1.x = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+
+
+            x = v(0,0);
+            z = v(0,1);
+            v(0,0) = x*c+z*s;
+            v(0,1) = z*c-x*s;
+
+            x = v(1,0);
+            z = v(1,1);
+            v(1,0) = x*c+z*s;
+            v(1,1) = z*c-x*s;
+
+            x = v(2,0);
+            z = v(2,1);
+            v(2,0) = x*c+z*s;
+            v(2,1) = z*c-x*s;
+
+
+            z = PYTHAG(f,h);
+            w(0,0) = z;
+            if(z>0.f)
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for(jj=0; jj<3; jj++)
+            {
+                y = u(jj,0);
+                z = u(jj,1);
+                u(jj,0) = y*c+z*s;
+                u(jj,1) = z*c-y*s;
+            }
+        }
+        
+        rv1[l] = 0.f;
+        rv1.y = f;
+        w(1,1) = x;
+    }
+
+
+    z = w(0,0);
+    if (z < 0.0)
+    {
+        w(0,0) = -z;
+        v(0,0) = -v(0,0);
+        v(1,0) = -v(1,0);
+        v(2,0) = -v(2,0);
+    }
+}
+
+
+RMAGINE_DEVICE_FUNCTION
+void svd(
+    const Matrix3x3& a, 
+    Matrix3x3& u,
+    Vector3& w,
+    Matrix3x3& v)
+{
+    // TODO: test
+    // constexpr unsigned int m = 3;
+    // constexpr unsigned int n = 3;
+    constexpr unsigned int max_iterations = 20;
+    
+    // additional memory required
+    bool flag;
+    int its, j, jj;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    
+    Vector3 rv1 = Vector3::Zeros();
+    
+    g = s = scale = anorm = 0.0;
+    float eps = __FLT_EPSILON__;
+    u = a;
+
+    // FIRST PART
+
+    // i = 0;
+    // l = 2;
+    scale = fabs(u(0,0)) + fabs(u(1,0)) + fabs(u(2,0));
+    if(scale > 0.0)
+    {
+        u(0, 0) /= scale;
+        u(1, 0) /= scale;
+        u(2, 0) /= scale;
+
+        s = u(0,0) * u(0,0) + u(1,0) * u(1,0) + u(2,0) * u(2,0);
+        f = u(0,0);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+
+        u(0, 0) = f - g;
+
+        f = (u(0, 0) * u(0, 1) + u(1, 0) * u(1, 1) + u(2, 0) * u(2, 1)) / h;
+        u(0, 1) += f * u(0, 0);
+        u(1, 1) += f * u(1, 0);
+        u(2, 1) += f * u(2, 0);
+
+        f = (u(0, 0) * u(0, 2) + u(1, 0) * u(1, 2) + u(2, 0) * u(2, 2)) / h;
+        u(0, 2) += f * u(0, 0);
+        u(1, 2) += f * u(1, 0);
+        u(2, 2) += f * u(2, 0);
+
+        u(0, 0) *= scale;
+        u(1, 0) *= scale;
+        u(2, 0) *= scale;
+    }
+    
+    w.x = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(0,0)) + abs(u(0,1)) + abs(u(0,2));
+    
+    if(scale > 0.0)
+    {
+        u(0, 1) /= scale;
+        u(0, 2) /= scale;
+        s = u(0,1) * u(0,1) + u(0,2) * u(0,2);
+
+        f = u(0, 1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g-s;
+        u(0, 1) = f - g;
+
+        rv1.y = u(0, 1) / h;
+        rv1.z = u(0, 2) / h;
+
+        s = u(1,1) * u(0,1) + u(1,2) * u(0,2);
+        u(1, 1) += s * rv1.y;
+        u(1, 2) += s * rv1.z;
+    
+        s = u(2,1) * u(0,1) + u(2,2) * u(0,2);
+        u(2, 1) += s * rv1.y;
+        u(2, 2) += s * rv1.z;            
+
+        u(0, 1) *= scale;
+        u(0, 2) *= scale;
+    }
+    
+    anorm = fabs(w.x);
+    // anorm = MAX(anorm, (fabs(w(0, 0)) + fabs(rv1.x))); // rv1.x is always 0 here, anorm too. fabs(X) >= 0
+    
+    // i = 1;
+    // l = 3;
+    rv1.y = scale * g;
+    g = 0.0;
+    scale = fabs(u(1, 1)) + fabs(u(2, 1));
+    
+    if(scale > 0.0)
+    {
+        u(1,1) /= scale;
+        u(2,1) /= scale;
+
+        s = u(1,1) * u(1,1) + u(2,1) * u(2,1);
+        f = u(1,1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+        u(1,1) = f-g;
+        
+        f = (u(1,1) * u(1,2) + u(2,1) * u(2,2)) / h;
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+        
+        u(1,1) *= scale;
+        u(2,1) *= scale;
+    }
+    
+    w.y = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(1,2));
+    if(scale > 0.0)
+    {
+        u(1,2) /= scale;
+        s = u(1,2) * u(1,2);
+        
+        f = u(1, 2);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+        u(1,2) = f - g;
+
+        rv1.z = u(1,2) / h;
+        s = u(2,2) * u(1,2);
+
+        u(2,2) += s * rv1.z;
+        u(1,2) *= scale;
+    }
+
+    anorm = MAX(anorm, (abs(w.y) + abs(rv1.y)));
+    
+    rv1.z = scale * g;
+
+    scale = abs(u(2, 2));
+    if(scale > 0.0) 
+    {
+        u(2, 2) /= scale;
+        s = u(2, 2) * u(2, 2);
+        f = u(2, 2);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+
+        u(2, 2) = f - g;
+        u(2, 2) *= scale;
+    }
+    
+    w.z = scale * g;
+    g = s = scale = 0.0;
+    
+    anorm = MAX(anorm, (abs(w.z)+abs(rv1.z)));
+
+    // SECOND PART    
+    v(2, 2) = 1.0;
+    g = rv1.z;
+
+    // i = 1;
+    // l = 2;
+    if(fabs(g) > 0.0)
+    {
+        v(2,1) = (u(1,2) / u(1,2)) / g;
+        s = u(1,2) * v(2,2);
+        v(2,2) += s * v(2,1);
+    }
+    v(1,2) = 0.0;
+    v(2,1) = 0.0;
+    v(1,1) = 1.0;
+
+    g = rv1.y;
+
+    // l = 1;
+    // i = 0;
+    if(fabs(g) > 0.0)
+    {
+        v(1,0) = (u(0,1) / u(0,1)) / g;
+        v(2,0) = (u(0,2) / u(0,1)) / g;
+
+        s = u(0,1) * v(1,1) + u(0,2) * v(2,1);
+        v(1,1) += s * v(1,0);
+        v(2,1) += s * v(2,0);
+
+        s = u(0,1) * v(1,2) + u(0,2) * v(2,2);
+        v(1,2) += s * v(1,0);
+        v(2,2) += s * v(2,0);
+    }
+    v(0,1) = 0.0;
+    v(1,0) = 0.0;
+    v(0,2) = 0.0;
+    v(2,0) = 0.0;
+    v(0,0) = 1.0;
+    g = rv1.x;
+
+
+    // THIRD PART
+    
+    // i = 2;
+    // l = 3;
+    g = w.z;
+    if(fabs(g) > 0.0)
+    {
+        u(2,2) /= g;
+    } else { 
+        // TODO(amock): shouldnt this be a large number?
+        u(2,2) = 0.0;   
+    }
+    u(2,2) += 1.0;
+    
+    // i = 1;
+    // l = 2;
+
+    g = w.y;
+    u(1,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        g = 1.0/g;
+        s = u(2,1) * u(2,2);
+        f = (s/u(1,1)) * g;
+
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+    
+        u(1,1) *= g;
+        u(2,1) *= g;
+    } else {
+        u(1,1) = 0.0;
+        u(2,1) = 0.0;
+    }
+    u(1,1) += 1.0;
+    
+    // i = 0;
+    // l = 1;
+    g = w.x;
+    u(0,1) = 0.0;
+    u(0,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        f = (u(1,0) * u(1,1) + u(2,0) * u(2,1)) / (g * u(0,0));
+        u(0,1) += f * u(0,0);
+        u(1,1) += f * u(1,0);
+        u(2,1) += f * u(2,0);
+
+        f = (u(1,0) * u(1,2) + u(2,0) * u(2,2)) / (g * u(0,0));
+        u(0,2) += f * u(0,0);
+        u(1,2) += f * u(1,0);
+        u(2,2) += f * u(2,0);
+
+        u(0,0) /= g;
+        u(1,0) /= g;
+        u(2,0) /= g;
+    } else {
+        u(0,0) = 0.0;
+        u(1,0) = 0.0;
+        u(2,0) = 0.0;
+    }
+    u(0,0) += 1.0;
+
+    int i, l;
+
+    // PART 4: Opti
+
+    // k = 2;
+    for(its=0; its<max_iterations; its++) 
+    {
+        // flag=true;
+        // l = 2;
+        // if(MIN(fabs(rv1.z), fabs(w(1,1))) > eps*anorm)
+        // {
+        //     l = 1;
+        //     if(MIN(fabs(rv1.y),abs(w(0,0))) > eps*anorm)
+        //     {
+        //         l = 0;
+        //     }
+        // }
+
+        flag=true;
+        l=2;
+        if(abs(rv1.z) <= eps*anorm)
+        {
+            flag=false;
+        }
+        else if(abs(w.y) > eps*anorm)
+        {
+            l=1;
+            if(abs(rv1.y) <= eps*anorm) 
+            {
+                flag=false;
+            }
+            else if(abs(w.x) > eps*anorm) 
+            {
+                l=0;
+                flag = false;
+            }
+        }
+        
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<3; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i);
+                h = PYTHAG(f,g);
+                w(i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<3; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w.z;
+        if(l == 2)
+        {
+            if(z < 0.0)
+            {
+                w.z = -z;
+                v(0,2) = -v(0,2);
+                v(1,2) = -v(1,2);
+                v(2,2) = -v(2,2);
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            // throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+        x = w(l);
+        y = w.y;
+        g = rv1.y;
+        h = rv1.z;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+        for (j=l; j<2; j++)
+        {
+            i = j+1;
+            g = rv1[i];
+            y = w(i);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1[j] = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+            for(jj=0;jj<3;jj++)
+            {
+                x = v(jj,j);
+                z = v(jj,i);
+                v(jj,j) = x*c+z*s;
+                v(jj,i) = z*c-x*s;
+            }
+            z = PYTHAG(f,h);
+            w(j) = z;
+            if(z>0.0) 
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for (jj=0;jj<3;jj++)
+            {
+                y = u(jj,j);
+                z = u(jj,i);
+                u(jj,j) = y*c+z*s;
+                u(jj,i) = z*c-y*s;
+            }
+        }
+        rv1[l] = 0.f;
+        rv1.z = f;
+        w.z = x;
+    }
+
+
+    for(its=0; its<max_iterations; its++) 
+    {
+        flag=true;
+        l=1;
+        if(abs(rv1.y) <= eps*anorm) 
+        {
+            flag=false;
+        }
+        else if(abs(w.x) > eps*anorm) 
+        {
+            l=0;
+            flag = false;
+        }
+
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<2; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i);
+                h = PYTHAG(f,g);
+                w(i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<3; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w.y;
+        if(l == 1)
+        {
+            if(z < 0.0)
+            {
+                w.y = -z;
+                for (j=0; j<3; j++) 
+                {
+                    v(j,1) = -v(j,1);
+                }
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+        //     std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+        //     throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+
+        x = w(l);
+        y = w.x;
+        g = rv1.x;
+        h = rv1.y;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+
+        if(l == 0)
+        {
+            g = rv1.y;
+            y = w.y;
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1.x = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+
+
+            x = v(0,0);
+            z = v(0,1);
+            v(0,0) = x*c+z*s;
+            v(0,1) = z*c-x*s;
+
+            x = v(1,0);
+            z = v(1,1);
+            v(1,0) = x*c+z*s;
+            v(1,1) = z*c-x*s;
+
+            x = v(2,0);
+            z = v(2,1);
+            v(2,0) = x*c+z*s;
+            v(2,1) = z*c-x*s;
+
+
+            z = PYTHAG(f,h);
+            w.x = z;
+            if(z>0.f)
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for(jj=0; jj<3; jj++)
+            {
+                y = u(jj,0);
+                z = u(jj,1);
+                u(jj,0) = y*c+z*s;
+                u(jj,1) = z*c-y*s;
+            }
+        }
+        
+        rv1[l] = 0.f;
+        rv1.y = f;
+        w.y = x;
+    }
+
+
+    z = w.x;
+    if (z < 0.0)
+    {
+        w.x = -z;
+        v(0,0) = -v(0,0);
+        v(1,0) = -v(1,0);
+        v(2,0) = -v(2,0);
+    }
+}
+
+} // namespace rmagine
diff --git a/src/rmagine_cuda/src/math/math.cu b/src/rmagine_cuda/src/math/math.cu
index e06d3baa..ec02282f 100644
--- a/src/rmagine_cuda/src/math/math.cu
+++ b/src/rmagine_cuda/src/math/math.cu
@@ -1,10 +1,13 @@
 #include "rmagine/math/math.cuh"
 #include "rmagine/math/math.h"
 #include "rmagine/math/types.h"
+#include "rmagine/math/linalg.h"
+#include "rmagine/util/cuda/CudaDebug.hpp"
 
 namespace rmagine 
 {
 
+
 ////////
 // Generic Kernel
 ///
@@ -508,6 +511,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Quaternion, VRAM_CUDA> multNxN(
@@ -528,6 +532,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(A.raw(), b.raw(), c.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNxN(
@@ -548,6 +553,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (T1.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(T1.raw(), T2.raw(), Tr.raw(), T1.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Transform, VRAM_CUDA> multNxN(
@@ -567,6 +573,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (T.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(T.raw(), x.raw(), c.raw(), T.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNxN(
@@ -586,6 +593,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M1.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(M1.raw(), M2.raw(), Mr.raw(), M1.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> multNxN(
@@ -605,6 +613,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M1.size() + blockSize - 1) / blockSize;
     multNxN_conv_kernel<<<gridSize, blockSize>>>(M1.raw(), M2.raw(), Qres.raw(), M1.size());
+    RM_CUDA_DEBUG();
 }
 
 void multNxN(
@@ -615,6 +624,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(M.raw(), x.raw(), c.raw(), M.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNxN(
@@ -637,6 +647,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(A.raw(), b.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Quaternion, VRAM_CUDA> multNx1(
@@ -657,6 +668,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(A.raw(), b.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNx1(
@@ -676,6 +688,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (T1.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(T1.raw(), t2.raw(), Tr.raw(), T1.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Transform, VRAM_CUDA> multNx1(
@@ -695,6 +708,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (T.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(T.raw(), x.raw(), c.raw(), T.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNx1(
@@ -714,6 +728,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M1.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(M1.raw(), m2.raw(), Mr.raw(), M1.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> multNx1(
@@ -733,6 +748,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(M.raw(), x.raw(), C.raw(), M.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNx1(
@@ -752,6 +768,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(M.raw(), x.raw(), C.raw(), M.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNx1(
@@ -774,6 +791,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (B.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(a.raw(), B.raw(), C.raw(), B.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Quaternion, VRAM_CUDA> mult1xN(
@@ -793,6 +811,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (B.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(a.raw(), B.raw(), C.raw(), B.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> mult1xN(
@@ -812,6 +831,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (T2.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(t1.raw(), T2.raw(), Tr.raw(), T2.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Transform, VRAM_CUDA> mult1xN(
@@ -831,6 +851,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (X.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(t.raw(), X.raw(), C.raw(), X.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> mult1xN(
@@ -850,6 +871,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M2.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(m1.raw(), M2.raw(), Mr.raw(), M2.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> mult1xN(
@@ -869,6 +891,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (X.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(m.raw(), X.raw(), C.raw(), X.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> mult1xN(
@@ -888,6 +911,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (X.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(m.raw(), X.raw(), C.raw(), X.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> mult1xN(
@@ -909,6 +933,7 @@ void addNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     addNxN_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> addNxN(
@@ -928,6 +953,7 @@ void addNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     addNxN_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<float, VRAM_CUDA> addNxN(
@@ -950,6 +976,7 @@ void subNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     subNxN_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> subNxN(
@@ -969,6 +996,7 @@ void subNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     subNx1_kernel<<<gridSize, blockSize>>>(A.raw(), b.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> subNx1(
@@ -989,6 +1017,7 @@ void transpose(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     transpose_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> transpose(
@@ -1006,6 +1035,7 @@ void transpose(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     transpose_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix4x4, VRAM_CUDA> transpose(
@@ -1024,6 +1054,7 @@ void transposeInplace(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     transposeInplace_kernel<<<gridSize, blockSize>>>(A.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 //////
@@ -1035,6 +1066,7 @@ void invert(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     invert_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> invert(
@@ -1052,6 +1084,7 @@ void invert(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     invert_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix4x4, VRAM_CUDA> invert(
@@ -1069,6 +1102,7 @@ void invert(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     invert_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Transform, VRAM_CUDA> invert(
@@ -1089,6 +1123,7 @@ void divNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxN_conv_kernel<float><<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> divNxN(
@@ -1108,6 +1143,7 @@ void divNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxN_conv_kernel<float><<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> divNxN(
@@ -1129,6 +1165,7 @@ void divNxNIgnoreZeros(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxNIgnoreZeros_conv_kernel<float><<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> divNxNIgnoreZeros(
@@ -1148,6 +1185,7 @@ void divNxNIgnoreZeros(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxNIgnoreZeros_conv_kernel<float><<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> divNxNIgnoreZeros(
@@ -1167,6 +1205,7 @@ void divNxNIgnoreZeros(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxNIgnoreZeros_conv_kernel<float><<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<float, VRAM_CUDA> divNxNIgnoreZeros(
@@ -1187,6 +1226,7 @@ void divNxNInplace(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxNInplace_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 void divNxNInplace(
@@ -1207,6 +1247,7 @@ void divNx1Inplace(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNx1Inplace_kernel<<<gridSize, blockSize>>>(A.raw(), B, A.size());
+    RM_CUDA_DEBUG();
 }
 
 void divNx1Inplace(
@@ -1228,6 +1269,7 @@ void convert(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (from.size() + blockSize - 1) / blockSize;
     convert_kernel<<<gridSize, blockSize>>>(from.raw(), to.raw(), from.size());
+    RM_CUDA_DEBUG();
 }
 
 void convert(
@@ -1245,6 +1287,7 @@ void copy(const MemoryView<unsigned int, VRAM_CUDA>& from,
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (from.size() + blockSize - 1) / blockSize;
     convert_kernel<<<gridSize, blockSize>>>(from.raw(), to.raw(), from.size());
+    RM_CUDA_DEBUG();
 }
 
 ////////
@@ -1267,6 +1310,7 @@ void pack(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (R.size() + blockSize - 1) / blockSize;
     pack_kernel<<<gridSize, blockSize>>>(R.raw(), t.raw(), T.raw(), R.size());
+    RM_CUDA_DEBUG();
 }
 
 ////////
@@ -1279,6 +1323,7 @@ void multNxNTransposed(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (m1.size() + blockSize - 1) / blockSize;
     covParts_kernel<<<gridSize, blockSize>>>(m1.raw(), m2.raw(), Cs.raw(), m1.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> multNxNTransposed(
@@ -1299,6 +1344,7 @@ void multNxNTransposed(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (m1.size() + blockSize - 1) / blockSize;
     covParts_kernel<<<gridSize, blockSize>>>(m1.raw(), m2.raw(), mask.raw(), Cs.raw(), m1.size());
+    RM_CUDA_DEBUG();
 }
     
 Memory<Matrix3x3, VRAM_CUDA> multNxNTransposed(
@@ -1318,6 +1364,7 @@ void normalizeInplace(MemoryView<Quaternion, VRAM_CUDA>& q)
     constexpr unsigned int blockSize = 1024;
     const unsigned int gridSize = (q.size() + blockSize - 1) / blockSize;
     normalizeInplace_kernel<<<gridSize, blockSize>>>(q.raw(), q.size());
+    RM_CUDA_DEBUG();
 }
 
 ///////
@@ -1340,6 +1387,7 @@ void setIdentity(MemoryView<Quaternion, VRAM_CUDA>& qs)
     constexpr unsigned int blockSize = 1024;
     const unsigned int gridSize = (qs.size() + blockSize - 1) / blockSize;
     setIdentity_kernel<<<gridSize, blockSize>>>(qs.raw(), qs.size());
+    RM_CUDA_DEBUG();
 }
 
 void setIdentity(MemoryView<Transform, VRAM_CUDA>& Ts)
@@ -1347,6 +1395,7 @@ void setIdentity(MemoryView<Transform, VRAM_CUDA>& Ts)
     constexpr unsigned int blockSize = 1024;
     const unsigned int gridSize = (Ts.size() + blockSize - 1) / blockSize;
     setIdentity_kernel<<<gridSize, blockSize>>>(Ts.raw(), Ts.size());
+    RM_CUDA_DEBUG();
 }
 
 void setIdentity(MemoryView<Matrix3x3, VRAM_CUDA>& Ms)
@@ -1354,6 +1403,7 @@ void setIdentity(MemoryView<Matrix3x3, VRAM_CUDA>& Ms)
     constexpr unsigned int blockSize = 1024;
     const unsigned int gridSize = (Ms.size() + blockSize - 1) / blockSize;
     setIdentity_kernel<<<gridSize, blockSize>>>(Ms.raw(), Ms.size());
+    RM_CUDA_DEBUG();
 }
 
 void setIdentity(MemoryView<Matrix4x4, VRAM_CUDA>& Ms)
@@ -1361,6 +1411,7 @@ void setIdentity(MemoryView<Matrix4x4, VRAM_CUDA>& Ms)
     constexpr unsigned int blockSize = 1024;
     const unsigned int gridSize = (Ms.size() + blockSize - 1) / blockSize;
     setIdentity_kernel<<<gridSize, blockSize>>>(Ms.raw(), Ms.size());
+    RM_CUDA_DEBUG();
 }
 
 void setZeros(MemoryView<Matrix3x3, VRAM_CUDA>& Ms)
@@ -1381,6 +1432,7 @@ void sum(
     MemoryView<Vector, VRAM_CUDA>& s)
 {
     sum_kernel<1024> <<<1, 1024>>>(data.raw(), s.raw(), data.size() );
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> sum(
@@ -1417,6 +1469,7 @@ void cov(
     MemoryView<Matrix3x3, VRAM_CUDA>& C)
 {
     cov_kernel<1024> <<<1, 1024>>>(v1.raw(), v2.raw(), C.raw(), v1.size() );
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> cov(
@@ -1430,4 +1483,61 @@ Memory<Matrix3x3, VRAM_CUDA> cov(
 }
 
 
+__global__ void svd_kernel(
+    const Matrix3x3* As,
+    Matrix3x3* Us,
+    Matrix3x3* Ws,
+    Matrix3x3* Vs, 
+    unsigned int N)
+{
+    const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
+    // printf("hello\n");
+    if(id < N)
+    {
+        svd(As[id], Us[id], Ws[id], Vs[id]);
+    }
+}
+
+void svd(
+    const MemoryView<Matrix3x3, VRAM_CUDA>& As,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Us,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Ws,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Vs
+)
+{
+    // std::cout << "SVD CUDA!" << std::endl;
+    constexpr unsigned int blockSize = 512;
+    const unsigned int gridSize = (As.size() + blockSize - 1) / blockSize;
+    svd_kernel<<<gridSize, blockSize>>>(As.raw(), Us.raw(), Ws.raw(), Vs.raw(), As.size());
+    RM_CUDA_DEBUG();
+}
+
+__global__ void svd_kernel(
+    const Matrix3x3* As,
+    Matrix3x3* Us,
+    Vector3* ws,
+    Matrix3x3* Vs,
+    unsigned int N)
+{
+    const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
+    // printf("hello\n");
+    if(id < N)
+    {
+        svd(As[id], Us[id], ws[id], Vs[id]);
+    }
+}
+
+void svd(
+    const MemoryView<Matrix3x3, VRAM_CUDA>& As,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Us,
+    MemoryView<Vector3, VRAM_CUDA>& ws,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Vs
+)
+{
+    constexpr unsigned int blockSize = 512;
+    const unsigned int gridSize = (As.size() + blockSize - 1) / blockSize;
+    svd_kernel<<<gridSize, blockSize>>>(As.raw(), Us.raw(), ws.raw(), Vs.raw(), As.size());
+    RM_CUDA_DEBUG();
+}
+
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/include/rmagine/map/embree/EmbreeMesh.hpp b/src/rmagine_embree/include/rmagine/map/embree/EmbreeMesh.hpp
index ea0a580f..fe7925c4 100644
--- a/src/rmagine_embree/include/rmagine/map/embree/EmbreeMesh.hpp
+++ b/src/rmagine_embree/include/rmagine/map/embree/EmbreeMesh.hpp
@@ -92,8 +92,6 @@ class EmbreeMesh
 
     void initVertexNormals();
 
-    bool closestPointFunc2(RTCPointQueryFunctionArguments* args);
-
     // PUBLIC ATTRIBUTES
     MemoryView<Face, RAM> faces() const;
     MemoryView<Vertex, RAM> vertices() const;
@@ -103,9 +101,6 @@ class EmbreeMesh
     MemoryView<const Vertex, RAM> verticesTransformed() const;
     MemoryView<const Vector, RAM> faceNormalsTransformed() const;
     
-    
-    
-
     void computeFaceNormals();
 
     /**
@@ -137,10 +132,6 @@ class EmbreeMesh
     Vertex* m_vertices_transformed;
     Memory<Vector, RAM> m_face_normals_transformed;
     Memory<Vector, RAM> m_vertex_normals_transformed;
-
-    // boost::function<bool (RTCPointQueryFunctionArguments*)> m_closest_point_func;
-    // RTCPointQueryFunction* m_closest_point_func_raw;
-
 };
 
 using EmbreeMeshPtr = std::shared_ptr<EmbreeMesh>;
diff --git a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp
index 714942ec..9892808e 100644
--- a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp
@@ -112,34 +112,6 @@ class O1DnSimulatorEmbree {
     void setModel(const MemoryView<O1DnModel_<RAM>, RAM>& model);
     void setModel(const O1DnModel_<RAM>& model);
 
-    // Generic Version
-    template<typename BundleT>
-    void simulate(const MemoryView<Transform, RAM>& Tbm,
-        BundleT& ret) const;
-
-    template<typename BundleT>
-    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
-
-
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<float, RAM>& ranges) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<float, RAM> simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateHits(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<uint8_t, RAM>& hits) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<uint8_t, RAM> simulateHits(
-        const MemoryView<Transform, RAM>& Tbm) const;
-
     inline Memory<O1DnModel_<RAM>, RAM> model() const
     {
         return m_model;
@@ -150,6 +122,41 @@ class O1DnSimulatorEmbree {
         return m_map;
     }
 
+
+    /**
+     * @brief Simulate from one pose
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const Transform& Tbm, BundleT& ret) const;
+
+    template<typename BundleT>
+    BundleT simulate(const Transform& Tbm) const;
+
+    /**
+     * @brief Simulate for multiple poses at once
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const MemoryView<Transform, RAM>& Tbm,
+        BundleT& ret) const;
+
+    template<typename BundleT>
+    void simulate(const MemoryView<const Transform, RAM>& Tbm,
+        BundleT& ret) const;
+
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
+
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<const Transform, RAM>& Tbm) const;
+
 protected:
     EmbreeMapPtr m_map;
     
diff --git a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
index 7e1598c3..63ccd223 100644
--- a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
@@ -12,15 +12,46 @@
 namespace rmagine
 {
 
+template<typename BundleT>
+void O1DnSimulatorEmbree::simulate(
+    const Transform& Tbm,
+    BundleT& ret) const
+{
+    // TODO: change parallelization scheme for single simulations?
+    const MemoryView<const Transform, RAM> Tbm_mem(&Tbm, 1);
+    simulate(Tbm_mem, ret);
+}
+
+template<typename BundleT>
+BundleT O1DnSimulatorEmbree::simulate(
+    const Transform& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), 1);
+    simulate(Tbm, res);
+    return res;
+}
 
 template<typename BundleT>
 void O1DnSimulatorEmbree::simulate(
     const MemoryView<Transform, RAM>& Tbm,
     BundleT& ret) const
+{
+    const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
+    simulate(Tbm_const, ret);
+}
+
+template<typename BundleT>
+void O1DnSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm,
+    BundleT& ret) const
 {
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
 
+    const float range_min = m_model->range.min;
+    const float range_max = m_model->range.max;
+
     #pragma omp parallel for
     for(size_t pid = 0; pid < Tbm.size(); pid++)
     {
@@ -52,8 +83,8 @@ void O1DnSimulatorEmbree::simulate(
                 rayhit.ray.dir_x = ray_dir_m.x;
                 rayhit.ray.dir_y = ray_dir_m.y;
                 rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
+                rayhit.ray.tnear = 0; // if set to m_model->range.min we would scan through near occlusions
+                rayhit.ray.tfar = range_max;
                 rayhit.ray.mask = -1;
                 rayhit.ray.flags = 0;
                 rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
@@ -67,7 +98,12 @@ void O1DnSimulatorEmbree::simulate(
                     {
                         if(flags.hits)
                         {
-                            ret.Hits<RAM>::hits[glob_id] = 1;
+                            if(rayhit.ray.tfar >= range_min)
+                            {
+                                ret.Hits<RAM>::hits[glob_id] = 1;
+                            } else {
+                                ret.Hits<RAM>::hits[glob_id] = 0;
+                            }
                         }
                     }
 
@@ -83,7 +119,7 @@ void O1DnSimulatorEmbree::simulate(
                     {
                         if(flags.points)
                         {
-                            Vector pint = ray_dir_s * rayhit.ray.tfar;
+                            Vector pint = ray_dir_s * rayhit.ray.tfar + ray_orig_s;
                             ret.Points<RAM>::points[glob_id] = pint;
                         }
                     }
@@ -217,4 +253,14 @@ BundleT O1DnSimulatorEmbree::simulate(
     return res;
 }
 
+template<typename BundleT>
+BundleT O1DnSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), Tbm.size());
+    simulate(Tbm, res);
+    return res;
+}
+
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp
index ccfe7ffb..af2015c6 100644
--- a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp
@@ -112,33 +112,6 @@ class OnDnSimulatorEmbree {
     void setModel(const OnDnModel_<RAM>& model);
     void setModel(const MemoryView<OnDnModel_<RAM>, RAM>& model);
 
-
-    // Generic Version
-    template<typename BundleT>
-    void simulate(const MemoryView<Transform, RAM>& Tbm,
-        BundleT& ret) const;
-
-    template<typename BundleT>
-    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<float, RAM>& ranges) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<float, RAM> simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateHits(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<uint8_t, RAM>& hits) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<uint8_t, RAM> simulateHits(
-        const MemoryView<Transform, RAM>& Tbm) const;
-
     inline Memory<OnDnModel_<RAM>, RAM> model() const
     {
       return m_model;
@@ -149,6 +122,41 @@ class OnDnSimulatorEmbree {
         return m_map;
     }
 
+    /**
+     * @brief Simulate from one pose
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const Transform& Tbm, BundleT& ret) const;
+
+    template<typename BundleT>
+    BundleT simulate(const Transform& Tbm) const;
+
+    /**
+     * @brief Simulate for multiple poses at once
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const MemoryView<Transform, RAM>& Tbm,
+        BundleT& ret) const;
+
+    template<typename BundleT>
+    void simulate(const MemoryView<const Transform, RAM>& Tbm,
+        BundleT& ret) const;
+
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
+
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<const Transform, RAM>& Tbm) const;
+
+    
 protected:
     EmbreeMapPtr m_map;
     
diff --git a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
index e003ff6b..9a875be0 100644
--- a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
@@ -13,13 +13,45 @@
 namespace rmagine
 {
 
+template<typename BundleT>
+void OnDnSimulatorEmbree::simulate(
+    const Transform& Tbm,
+    BundleT& ret) const
+{
+    // TODO: change parallelization scheme for single simulations?
+    const MemoryView<const Transform, RAM> Tbm_mem(&Tbm, 1);
+    simulate(Tbm_mem, ret);
+}
+
+template<typename BundleT>
+BundleT OnDnSimulatorEmbree::simulate(
+    const Transform& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), 1);
+    simulate(Tbm, res);
+    return res;
+}
+
 template<typename BundleT>
 void OnDnSimulatorEmbree::simulate(
     const MemoryView<Transform, RAM>& Tbm,
     BundleT& ret) const
+{
+    const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
+    simulate(Tbm_const, ret);
+}
+
+template<typename BundleT>
+void OnDnSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm,
+    BundleT& ret) const
 {
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
+    
+    const float range_min = m_model->range.min;
+    const float range_max = m_model->range.max;
 
     #pragma omp parallel for
     for(size_t pid = 0; pid < Tbm.size(); pid++)
@@ -31,7 +63,7 @@ void OnDnSimulatorEmbree::simulate(
         const Transform Tms_ = Tsm_.inv();
 
         const unsigned int glob_shift = pid * m_model->size();
-
+        
         for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
         {
             for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
@@ -53,7 +85,7 @@ void OnDnSimulatorEmbree::simulate(
                 rayhit.ray.dir_y = ray_dir_m.y;
                 rayhit.ray.dir_z = ray_dir_m.z;
                 rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
+                rayhit.ray.tfar = range_max;
                 rayhit.ray.mask = -1;
                 rayhit.ray.flags = 0;
                 rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
@@ -67,7 +99,12 @@ void OnDnSimulatorEmbree::simulate(
                     {
                         if(flags.hits)
                         {
-                            ret.Hits<RAM>::hits[glob_id] = 1;
+                            if(rayhit.ray.tfar >= range_min)
+                            {
+                                ret.Hits<RAM>::hits[glob_id] = 1;
+                            } else {
+                                ret.Hits<RAM>::hits[glob_id] = 0;
+                            }
                         }
                     }
 
@@ -83,7 +120,7 @@ void OnDnSimulatorEmbree::simulate(
                     {
                         if(flags.points)
                         {
-                            Vector pint = ray_dir_s * rayhit.ray.tfar;
+                            Vector pint = ray_dir_s * rayhit.ray.tfar + ray_orig_s;
                             ret.Points<RAM>::points[glob_id] = pint;
                         }
                     }
@@ -217,4 +254,14 @@ BundleT OnDnSimulatorEmbree::simulate(
     return res;
 }
 
+template<typename BundleT>
+BundleT OnDnSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), Tbm.size());
+    simulate(Tbm, res);
+    return res;
+}
+
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp
index d940fb5e..04e7aab8 100644
--- a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp
@@ -116,32 +116,6 @@ class PinholeSimulatorEmbree {
     void setModel(const MemoryView<PinholeModel, RAM>& model);
     void setModel(const PinholeModel& model);
 
-    // Generic Version
-    template<typename BundleT>
-    void simulate(const MemoryView<Transform, RAM>& Tbm,
-        BundleT& ret) const;
-
-    template<typename BundleT>
-    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<float, RAM>& ranges) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<float, RAM> simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateHits(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<uint8_t, RAM>& hits) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<uint8_t, RAM> simulateHits(
-        const MemoryView<Transform, RAM>& Tbm) const;
-
     inline Memory<PinholeModel, RAM> model() const
     {
       return m_model;
@@ -152,6 +126,40 @@ class PinholeSimulatorEmbree {
         return m_map;
     }
 
+    /**
+     * @brief Simulate from one pose
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const Transform& Tbm, BundleT& ret) const;
+
+    template<typename BundleT>
+    BundleT simulate(const Transform& Tbm) const;
+
+    /**
+     * @brief Simulate for multiple poses at once
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const MemoryView<Transform, RAM>& Tbm,
+        BundleT& ret) const;
+
+    template<typename BundleT>
+    void simulate(const MemoryView<const Transform, RAM>& Tbm,
+        BundleT& ret) const;
+
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
+
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<const Transform, RAM>& Tbm) const;
+
 protected:
     EmbreeMapPtr m_map;
 
diff --git a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
index 956953c9..0e563b2b 100644
--- a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
@@ -13,14 +13,46 @@
 namespace rmagine
 {
 
+template<typename BundleT>
+void PinholeSimulatorEmbree::simulate(
+    const Transform& Tbm,
+    BundleT& ret) const
+{
+    // TODO: change parallelization scheme for single simulations?
+    const MemoryView<const Transform, RAM> Tbm_mem(&Tbm, 1);
+    simulate(Tbm_mem, ret);
+}
+
+template<typename BundleT>
+BundleT PinholeSimulatorEmbree::simulate(
+    const Transform& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), 1);
+    simulate(Tbm, res);
+    return res;
+}
+
 template<typename BundleT>
 void PinholeSimulatorEmbree::simulate(
     const MemoryView<Transform, RAM>& Tbm,
     BundleT& ret) const
+{
+    const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
+    simulate(Tbm_const, ret);
+}
+
+template<typename BundleT>
+void PinholeSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm,
+    BundleT& ret) const
 {
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
 
+    const float range_min = m_model->range.min;
+    const float range_max = m_model->range.max;
+
     #pragma omp parallel for
     for(size_t pid = 0; pid < Tbm.size(); pid++)
     {
@@ -49,8 +81,8 @@ void PinholeSimulatorEmbree::simulate(
                 rayhit.ray.dir_x = ray_dir_m.x;
                 rayhit.ray.dir_y = ray_dir_m.y;
                 rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
+                rayhit.ray.tnear = 0; // if set to m_model->range.min we would scan through near occlusions
+                rayhit.ray.tfar = range_max;
                 rayhit.ray.mask = -1;
                 rayhit.ray.flags = 0;
                 rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
@@ -64,7 +96,12 @@ void PinholeSimulatorEmbree::simulate(
                     {
                         if(flags.hits)
                         {
-                            ret.Hits<RAM>::hits[glob_id] = 1;
+                            if(rayhit.ray.tfar >= range_min)
+                            {
+                                ret.Hits<RAM>::hits[glob_id] = 1;
+                            } else {
+                                ret.Hits<RAM>::hits[glob_id] = 0;
+                            }
                         }
                     }
 
@@ -94,9 +131,10 @@ void PinholeSimulatorEmbree::simulate(
                                     rayhit.hit.Ng_y,
                                     rayhit.hit.Ng_z
                                 };
-                            
+                            // nint in map frame
                             nint.normalizeInplace();
                             nint = Tms_.R * nint;
+                            // nint in sensor frame
 
                             // flip?
                             if(ray_dir_s.dot(nint) > 0.0)
diff --git a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp
index b60736e3..29025617 100644
--- a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp
@@ -116,33 +116,6 @@ class SphereSimulatorEmbree {
     void setModel(const MemoryView<SphericalModel, RAM>& model);
     void setModel(const SphericalModel& model);
 
-    // Generic Version
-    template<typename BundleT>
-    void simulate(const MemoryView<Transform, RAM>& Tbm,
-        BundleT& ret) const;
-
-    template<typename BundleT>
-    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<float, RAM>& ranges) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<float, RAM> simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateHits(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<uint8_t, RAM>& hits) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<uint8_t, RAM> simulateHits(
-        const MemoryView<Transform, RAM>& Tbm) const;
-    
-
     inline Memory<SphericalModel, RAM> model() const
     {
       return m_model;
@@ -153,6 +126,40 @@ class SphereSimulatorEmbree {
         return m_map;
     }
 
+    /**
+     * @brief Simulate from one pose
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const Transform& Tbm, BundleT& ret) const;
+
+    template<typename BundleT>
+    BundleT simulate(const Transform& Tbm) const;
+
+    /**
+     * @brief Simulate for multiple poses at once
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const MemoryView<Transform, RAM>& Tbm,
+        BundleT& ret) const;
+
+    template<typename BundleT>
+    void simulate(const MemoryView<const Transform, RAM>& Tbm,
+        BundleT& ret) const;
+
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
+
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<const Transform, RAM>& Tbm) const;
+
 protected:
     EmbreeMapPtr m_map;
     
diff --git a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
index ac800af2..3511838d 100644
--- a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
@@ -13,14 +13,47 @@
 namespace rmagine
 {
 
+template<typename BundleT>
+void SphereSimulatorEmbree::simulate(
+    const Transform& Tbm,
+    BundleT& ret) const
+{
+    // TODO: change parallelization scheme for single simulations?
+    const MemoryView<const Transform, RAM> Tbm_mem(&Tbm, 1);
+    simulate(Tbm_mem, ret);
+}
+
+template<typename BundleT>
+BundleT SphereSimulatorEmbree::simulate(
+    const Transform& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), 1);
+    simulate(Tbm, res);
+    return res;
+}
+
 template<typename BundleT>
 void SphereSimulatorEmbree::simulate(
     const MemoryView<Transform, RAM>& Tbm,
     BundleT& ret) const
+{
+    const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
+    simulate(Tbm_const, ret);
+}
+
+template<typename BundleT>
+void SphereSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm,
+    BundleT& ret) const
 {
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
 
+    const float range_min = m_model->range.min;
+    const float range_max = m_model->range.max;
+    
+
     #pragma omp parallel for
     for(size_t pid = 0; pid < Tbm.size(); pid++)
     {
@@ -49,8 +82,8 @@ void SphereSimulatorEmbree::simulate(
                 rayhit.ray.dir_x = ray_dir_m.x;
                 rayhit.ray.dir_y = ray_dir_m.y;
                 rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
+                rayhit.ray.tnear = 0.0; // if set to m_model->range.min we would scan through near occlusions
+                rayhit.ray.tfar = range_max;
                 rayhit.ray.mask = -1;
                 rayhit.ray.flags = 0;
                 rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
@@ -64,7 +97,12 @@ void SphereSimulatorEmbree::simulate(
                     {
                         if(flags.hits)
                         {
-                            ret.Hits<RAM>::hits[glob_id] = 1;
+                            if(rayhit.ray.tfar >= range_min)
+                            {
+                                ret.Hits<RAM>::hits[glob_id] = 1;
+                            } else {
+                                ret.Hits<RAM>::hits[glob_id] = 0;
+                            }
                         }
                     }
 
@@ -214,4 +252,14 @@ BundleT SphereSimulatorEmbree::simulate(
     return res;
 }
 
+template<typename BundleT>
+BundleT SphereSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), Tbm.size());
+    simulate(Tbm, res);
+    return res;
+}
+
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/src/map/embree/EmbreeMesh.cpp b/src/rmagine_embree/src/map/embree/EmbreeMesh.cpp
index cc290672..8a87011e 100644
--- a/src/rmagine_embree/src/map/embree/EmbreeMesh.cpp
+++ b/src/rmagine_embree/src/map/embree/EmbreeMesh.cpp
@@ -322,6 +322,4 @@ void EmbreeMesh::apply()
     }
 }
 
-// pt2ConstMember = &EmbreeMesh::closestPointFunc2; 
-
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/src/simulation/O1DnSimulatorEmbree.cpp b/src/rmagine_embree/src/simulation/O1DnSimulatorEmbree.cpp
index c99741ca..92d5793f 100644
--- a/src/rmagine_embree/src/simulation/O1DnSimulatorEmbree.cpp
+++ b/src/rmagine_embree/src/simulation/O1DnSimulatorEmbree.cpp
@@ -19,7 +19,7 @@ O1DnSimulatorEmbree::O1DnSimulatorEmbree(EmbreeMapPtr map)
 
 O1DnSimulatorEmbree::~O1DnSimulatorEmbree()
 {
-    // std::cout << "O1DnSimulatorEmbree - Destructor " << std::endl;
+    
 }
 
 void O1DnSimulatorEmbree::setMap(EmbreeMapPtr map)
@@ -50,134 +50,6 @@ void O1DnSimulatorEmbree::setModel(
     m_model->range = model->range;
     m_model->orig = model->orig;
     m_model->dirs = model->dirs;
-
-    // this is shallow copy becauce pointer is copied
-    // m_model = model;
-    // copy(model[0].dirs, m_model[0].dirs);
-    // std::cout << "Set Model" << std::endl;
-    // std::cout << model[0].dirs.raw() << " -> " << m_model[0].dirs.raw() << std::endl; 
-}
-
-void O1DnSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm,
-    MemoryView<float, RAM>& ranges) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                const Vector ray_orig_s = m_model->getOrigin(vid, hid);
-                const Vector ray_orig_m = Tsm_ * ray_orig_s;
-
-                // std::cout << ray_dir_s.x << " " << ray_dir_s.y << " " << ray_dir_s.z << std::endl;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = ray_orig_m.x;
-                rayhit.ray.org_y = ray_orig_m.y;
-                rayhit.ray.org_z = ray_orig_m.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    ranges[glob_id] = rayhit.ray.tfar;
-                } else {
-                    ranges[glob_id] = m_model->range.max + 1.0;
-                }
-            }
-        }
-    }
-}
-
-Memory<float, RAM> O1DnSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<float, RAM> res(m_model->size() * Tbm.size());
-    simulateRanges(Tbm, res);
-    return res;
-}
-
-void O1DnSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm, 
-    MemoryView<uint8_t, RAM>& hits) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                const Vector ray_orig_s = m_model->getOrigin(vid, hid);
-                const Vector ray_orig_m = Tsm_ * ray_orig_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = ray_orig_m.x;
-                rayhit.ray.org_y = ray_orig_m.y;
-                rayhit.ray.org_z = ray_orig_m.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    hits[glob_id] = 1;
-                } else {
-                    hits[glob_id] = 0;
-                }
-            }
-        }
-    }
-}
-
-Memory<uint8_t, RAM> O1DnSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<uint8_t, RAM> res(m_model->size() * Tbm.size());
-    simulateHits(Tbm, res);
-    return res;
 }
 
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/src/simulation/OnDnSimulatorEmbree.cpp b/src/rmagine_embree/src/simulation/OnDnSimulatorEmbree.cpp
index 5e4ee7be..702831d6 100644
--- a/src/rmagine_embree/src/simulation/OnDnSimulatorEmbree.cpp
+++ b/src/rmagine_embree/src/simulation/OnDnSimulatorEmbree.cpp
@@ -54,134 +54,6 @@ void OnDnSimulatorEmbree::setModel(
     m_model->range = model->range;
     m_model->origs = model->origs;
     m_model->dirs = model->dirs;
-
-    // this is shallow copy becauce pointer is copied
-    // m_model = model;
-    // copy(model[0].dirs, m_model[0].dirs);
-    // std::cout << "Set Model" << std::endl;
-    // std::cout << model[0].dirs.raw() << " -> " << m_model[0].dirs.raw() << std::endl; 
-}
-
-void OnDnSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm,
-    MemoryView<float, RAM>& ranges) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                const Vector ray_orig_s = m_model->getOrigin(vid, hid);
-                const Vector ray_orig_m = Tsm_ * ray_orig_s;
-
-                // std::cout << ray_dir_s.x << " " << ray_dir_s.y << " " << ray_dir_s.z << std::endl;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = ray_orig_m.x;
-                rayhit.ray.org_y = ray_orig_m.y;
-                rayhit.ray.org_z = ray_orig_m.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    ranges[glob_id] = rayhit.ray.tfar;
-                } else {
-                    ranges[glob_id] = m_model->range.max + 1.0;
-                }
-            }
-        }
-    }
-}
-
-Memory<float, RAM> OnDnSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<float, RAM> res(m_model->size() * Tbm.size());
-    simulateRanges(Tbm, res);
-    return res;
-}
-
-void OnDnSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm, 
-    MemoryView<uint8_t, RAM>& hits) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                const Vector ray_orig_s = m_model->getOrigin(vid, hid);
-                const Vector ray_orig_m = Tsm_ * ray_orig_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = ray_orig_m.x;
-                rayhit.ray.org_y = ray_orig_m.y;
-                rayhit.ray.org_z = ray_orig_m.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    hits[glob_id] = 1;
-                } else {
-                    hits[glob_id] = 0;
-                }
-            }
-        }
-    }
-}
-
-Memory<uint8_t, RAM> OnDnSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<uint8_t, RAM> res(m_model->size() * Tbm.size());
-    simulateHits(Tbm, res);
-    return res;
 }
 
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/src/simulation/PinholeSimulatorEmbree.cpp b/src/rmagine_embree/src/simulation/PinholeSimulatorEmbree.cpp
index 48db4f47..c93b8677 100644
--- a/src/rmagine_embree/src/simulation/PinholeSimulatorEmbree.cpp
+++ b/src/rmagine_embree/src/simulation/PinholeSimulatorEmbree.cpp
@@ -49,118 +49,4 @@ void PinholeSimulatorEmbree::setModel(const PinholeModel& model)
     m_model[0] = model;
 }
 
-void PinholeSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm,
-    MemoryView<float, RAM>& ranges) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = Tsm_.t.x;
-                rayhit.ray.org_y = Tsm_.t.y;
-                rayhit.ray.org_z = Tsm_.t.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    ranges[glob_id] = rayhit.ray.tfar;
-                } else {
-                    ranges[glob_id] = m_model->range.max + 1.0;
-                }
-            }
-        }
-    }
-}
-
-Memory<float, RAM> PinholeSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<float, RAM> res(m_model->size() * Tbm.size());
-    simulateRanges(Tbm, res);
-    return res;
-}
-
-void PinholeSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm, 
-    MemoryView<uint8_t, RAM>& hits) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = Tsm_.t.x;
-                rayhit.ray.org_y = Tsm_.t.y;
-                rayhit.ray.org_z = Tsm_.t.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    hits[glob_id] = 1;
-                } else {
-                    hits[glob_id] = 0;
-                }
-            }
-        }
-    }
-}
-
-Memory<uint8_t, RAM> PinholeSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<uint8_t, RAM> res(m_model->size() * Tbm.size());
-    simulateHits(Tbm, res);
-    return res;
-}
-
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/src/simulation/SphereSimulatorEmbree.cpp b/src/rmagine_embree/src/simulation/SphereSimulatorEmbree.cpp
index eabbf705..a14bcf2e 100644
--- a/src/rmagine_embree/src/simulation/SphereSimulatorEmbree.cpp
+++ b/src/rmagine_embree/src/simulation/SphereSimulatorEmbree.cpp
@@ -56,120 +56,4 @@ void SphereSimulatorEmbree::setModel(
     m_model[0] = model;
 }
 
-void SphereSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm,
-    MemoryView<float, RAM>& ranges) const
-{
-    auto handle = m_map->scene->handle();
-
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = Tsm_.t.x;
-                rayhit.ray.org_y = Tsm_.t.y;
-                rayhit.ray.org_z = Tsm_.t.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    ranges[glob_id] = rayhit.ray.tfar;
-                } else {
-                    ranges[glob_id] = m_model->range.max + 1.0;
-                }
-            }
-        }
-    }
-}
-
-Memory<float, RAM> SphereSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<float, RAM> res(m_model->phi.size * m_model->theta.size * Tbm.size());
-    simulateRanges(Tbm, res);
-    return res;
-}
-
-void SphereSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm, 
-    MemoryView<uint8_t, RAM>& hits) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = Tsm_.t.x;
-                rayhit.ray.org_y = Tsm_.t.y;
-                rayhit.ray.org_z = Tsm_.t.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    hits[glob_id] = 1;
-                } else {
-                    hits[glob_id] = 0;
-                }
-            }
-        }
-    }
-}
-
-Memory<uint8_t, RAM> SphereSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<uint8_t, RAM> res(m_model->phi.size * m_model->theta.size * Tbm.size());
-    simulateHits(Tbm, res);
-    return res;
-}
-
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_ouster/CMakeLists.txt b/src/rmagine_ouster/CMakeLists.txt
new file mode 100644
index 00000000..d90c2cab
--- /dev/null
+++ b/src/rmagine_ouster/CMakeLists.txt
@@ -0,0 +1,94 @@
+
+message(STATUS "Building Core. Library: rmagine")
+
+set(RMAGINE_CORE_SRCS
+    # Types
+    src/types/ouster_sensors.cpp
+)
+
+add_library(rmagine-ouster SHARED
+    ${RMAGINE_CORE_SRCS}
+)
+
+target_include_directories(rmagine-ouster
+  PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/rmagine-${rmagine_VERSION}>
+    ${ASSIMP_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_libraries(rmagine-ouster
+    rmagine-core
+    ${ASSIMP_LIBRARIES}
+    Eigen3::Eigen
+    ${Boost_LIBRARIES}
+    ${OpenMP_CXX_LIBRARIES}
+    jsoncpp
+)
+
+set_target_properties(rmagine-ouster
+  PROPERTIES
+    EXPORT_NAME ouster
+    SOVERSION ${rmagine_VERSION_MAJOR}
+    VERSION ${rmagine_VERSION}
+    COMPONENT ouster
+    CXX_STANDARD 17
+)
+
+# TODO: do this:
+add_library(rmagine ALIAS rmagine-ouster)
+add_library(rmagine::ouster ALIAS rmagine-ouster)
+
+
+list(APPEND RMAGINE_LIBRARIES rmagine-ouster)
+set(RMAGINE_LIBRARIES ${RMAGINE_LIBRARIES} PARENT_SCOPE)
+
+
+# CMAKE FIND SCRIPT
+include(CMakePackageConfigHelpers)
+
+###########
+## CORE
+############
+install(TARGETS rmagine-ouster 
+  EXPORT rmagine-ouster-targets
+  COMPONENT ouster)
+
+install(EXPORT rmagine-ouster-targets
+    FILE rmagine-ouster-targets.cmake
+    COMPONENT ouster
+    NAMESPACE rmagine::
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rmagine-${rmagine_VERSION}
+)
+
+write_basic_package_version_file(
+  ${CMAKE_CURRENT_BINARY_DIR}/rmagine-ouster-config-version.cmake
+  VERSION ${rmagine_VERSION}
+  COMPATIBILITY SameMajorVersion
+)
+
+configure_package_config_file(cmake/rmagine-ouster-config.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/rmagine-ouster-config.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rmagine-${rmagine_VERSION}
+)
+
+install(
+  FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/rmagine-ouster-config.cmake
+    ${CMAKE_CURRENT_BINARY_DIR}/rmagine-ouster-config-version.cmake
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rmagine-${rmagine_VERSION}
+  COMPONENT ouster
+)
+
+install(
+  DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/rmagine
+  COMPONENT ouster
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rmagine-${rmagine_VERSION}
+)
+
+set(CPACK_DEBIAN_CORE_PACKAGE_DEPENDS "libeigen3-dev, libassimp-dev" PARENT_SCOPE)
+
+list(APPEND CPACK_COMPONENTS_ALL ouster)
+set(CPACK_COMPONENTS_ALL ${CPACK_COMPONENTS_ALL} PARENT_SCOPE)
+
diff --git a/src/rmagine_ouster/cmake/rmagine-ouster-config.cmake.in b/src/rmagine_ouster/cmake/rmagine-ouster-config.cmake.in
new file mode 100644
index 00000000..a2d03406
--- /dev/null
+++ b/src/rmagine_ouster/cmake/rmagine-ouster-config.cmake.in
@@ -0,0 +1,12 @@
+@PACKAGE_INIT@
+
+include(${CMAKE_CURRENT_LIST_DIR}/rmagine-ouster-config-version.cmake)
+include(${CMAKE_CURRENT_LIST_DIR}/rmagine-ouster-targets.cmake)
+
+include(CMakeFindDependencyMacro)
+
+find_dependency(jsoncpp)
+
+check_required_components(ouster)
+
+set(rmagine_ouster_FOUND 1)
\ No newline at end of file
diff --git a/src/rmagine_ouster/include/rmagine/types/ouster_sensors.h b/src/rmagine_ouster/include/rmagine/types/ouster_sensors.h
new file mode 100644
index 00000000..9ec65e93
--- /dev/null
+++ b/src/rmagine_ouster/include/rmagine/types/ouster_sensors.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2024, University Osnabrück
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the University Osnabrück nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL University Osnabrück BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * 
+ * @brief Predefined Sensor Models
+ *
+ * @date 10.09.2024
+ * @author Alexander Mock
+ * 
+ * @copyright Copyright (c) 2024, University Osnabrück. All rights reserved.
+ * This project is released under the 3-Clause BSD License.
+ * 
+ */
+#ifndef RMAGINE_TYPES_OUSTER_SENSORS_H
+#define RMAGINE_TYPES_OUSTER_SENSORS_H
+
+#include <rmagine/types/sensor_models.h>
+
+namespace rmagine
+{
+
+O1DnModel o1dn_from_ouster_meta_file(std::string filename);
+
+} // namespace rmagine
+
+#endif // RMAGINE_TYPES_OUSTER_SENSORS_H
diff --git a/src/rmagine_ouster/src/types/ouster_sensors.cpp b/src/rmagine_ouster/src/types/ouster_sensors.cpp
new file mode 100644
index 00000000..b1af6561
--- /dev/null
+++ b/src/rmagine_ouster/src/types/ouster_sensors.cpp
@@ -0,0 +1,82 @@
+#include "rmagine/types/ouster_sensors.h"
+
+#include <jsoncpp/json/json.h>
+#include <fstream>
+
+namespace rmagine {
+
+O1DnModel o1dn_from_ouster_meta_file(std::string filename)
+{
+    O1DnModel model;
+
+    std::ifstream ouster_file(filename, std::ifstream::binary);
+
+    Json::Value ouster_config;
+    ouster_file >> ouster_config;
+
+    std::string lidar_mode = ouster_config["lidar_mode"].asString();
+
+    size_t x_id = lidar_mode.find("x");
+    unsigned int lidar_width = std::stoi(lidar_mode.substr(0, x_id));
+    unsigned int lidar_frequency = std::stoi(lidar_mode.substr(x_id+1));
+
+    std::vector<float> beam_altitude_angles;
+    for(auto val : ouster_config["beam_altitude_angles"])
+    {
+        beam_altitude_angles.push_back(val.asFloat());
+    }
+
+    unsigned int lidar_height = beam_altitude_angles.size();
+
+    std::vector<float> beam_azimuth_angles;
+    for(auto val : ouster_config["beam_azimuth_angles"])
+    {
+        beam_azimuth_angles.push_back(val.asFloat());
+    }
+
+    const float lidar_origin_to_beam_origin_mm = ouster_config["lidar_origin_to_beam_origin_mm"].asFloat();
+
+    // std::cout << "Loading Ouster Lidar: " << lidar_width << "x" << lidar_height << ". " << lidar_frequency << "hz" << std::endl;
+
+    model.width = lidar_width;
+    model.height = lidar_height;
+    model.dirs.resize(lidar_width * lidar_height);
+    
+    const float hor_start_angle = 0.0;
+    const float hor_angle_inc = (2.0 * M_PI) / static_cast<float>(model.getWidth());
+
+    const float ver_start_angle = 0.0;
+
+    
+
+    model.orig = {0.0, 0.0, 0.0};
+
+    model.range.min = 0.1;
+    model.range.max = 80.0;
+
+    for(unsigned int vid = 0; vid < model.getHeight(); vid++)
+    {
+        // vertical angle
+        const float phi = beam_altitude_angles[vid] * DEG_TO_RAD_F;
+
+        for(unsigned int hid = 0; hid < model.getWidth(); hid++)
+        {
+            // horizontal angle
+            const float theta_offset = beam_azimuth_angles[vid] * DEG_TO_RAD_F;
+            const float theta = hor_start_angle + hor_angle_inc * static_cast<float>(hid) + theta_offset;
+
+            // compute direction: polar to cartesian
+            Vector dir{
+                cosf(phi) * cosf(theta), 
+                cosf(phi) * sinf(theta), 
+                sinf(phi)};
+
+            const unsigned int buf_id = model.getBufferId(vid, hid);
+            model.dirs[buf_id] = dir;
+        }
+    }
+
+    return model;
+}
+
+} // namespace rmagine
\ No newline at end of file
diff --git a/tests/core/CMakeLists.txt b/tests/core/CMakeLists.txt
index 2809af9e..2977723f 100644
--- a/tests/core/CMakeLists.txt
+++ b/tests/core/CMakeLists.txt
@@ -35,3 +35,11 @@ target_link_libraries(rmagine_tests_core_quaternion
 )
 
 add_test(NAME core_quaternion COMMAND rmagine_tests_core_quaternion)
+
+# 4. SVD
+add_executable(rmagine_tests_core_math_svd math_svd.cpp)
+target_link_libraries(rmagine_tests_core_math_svd
+    rmagine::core
+)
+
+add_test(NAME core_math_svd COMMAND rmagine_tests_core_math_svd)
diff --git a/tests/core/math_svd.cpp b/tests/core/math_svd.cpp
new file mode 100644
index 00000000..974457cd
--- /dev/null
+++ b/tests/core/math_svd.cpp
@@ -0,0 +1,412 @@
+#include <iostream>
+
+
+#include <rmagine/math/types.h>
+
+#include <rmagine/math/math.h>
+#include <rmagine/util/StopWatch.hpp>
+
+#include <rmagine/util/exceptions.h>
+#include <rmagine/util/prints.h>
+
+#include <rmagine/math/linalg.h>
+
+#include <Eigen/Dense>
+
+#include <stdint.h>
+#include <string.h>
+
+namespace rm = rmagine;
+
+
+float compute_error(Eigen::Matrix3f gt, Eigen::Matrix3f m)
+{
+    float ret = 0.0;
+    for(size_t i=0; i<3; i++)
+    {
+        for(size_t j=0; j<3; j++)
+        {
+            ret += abs(gt(i, j) - m(i, j));
+        }
+    }
+    return ret;
+}
+
+float compute_error(rm::Matrix3x3 gt, rm::Matrix3x3 m)
+{
+    float ret = 0.0;
+    for(size_t i=0; i<3; i++)
+    {
+        for(size_t j=0; j<3; j++)
+        {
+            ret += abs(gt(i, j) - m(i, j));
+        }
+    }
+    return ret;
+}
+
+Eigen::Matrix3f& eigen_view(rm::Matrix3x3& M)
+{
+    return *reinterpret_cast<Eigen::Matrix3f*>( &M );
+}
+
+template<typename DataT, int Rows, int Cols>
+void testSVD(const Eigen::Matrix<DataT, Rows, Cols>& A)
+{
+    Eigen::JacobiSVD<Eigen::Matrix<DataT, Rows, Cols> > svd(A, Eigen::ComputeFullU | Eigen::ComputeFullV);
+}
+
+template<typename DataT, unsigned Rows, unsigned Cols>
+void testSVD(const rm::Matrix_<DataT, Rows, Cols>& A)
+{
+    using AMatT = rm::Matrix_<DataT, Rows, Cols>;
+    using UMatT = typename rm::svd_dims<AMatT>::U;
+    using WMatT = typename rm::svd_dims<AMatT>::W;
+    using VMatT = typename rm::svd_dims<AMatT>::V;
+
+    UMatT U = UMatT::Zeros();
+    WMatT W = WMatT::Zeros();
+    VMatT V = VMatT::Zeros();
+
+    rm::svd(A, U, W, V);
+}
+
+void svdTestWithPrints()
+{
+    rm::Matrix3x3 Arm;
+
+    Eigen::Matrix3f Aeig = Eigen::Matrix3f::Random(3, 3);
+    for(size_t i=0; i<3; i++)
+    {
+        for(size_t j=0; j<3; j++)
+        {
+            Arm(i, j) = Aeig(i, j);
+        }
+    }
+    
+
+    std::cout << "A: " << std::endl << Aeig << std::endl;
+
+    std::cout << std::endl;
+    std::cout << "Eigen: " << std::endl;
+    
+
+    Eigen::JacobiSVD<Eigen::Matrix3f> svdeig(Aeig, Eigen::ComputeFullU | Eigen::ComputeFullV);
+    // std::cout << "U: " << std::endl << svdeig.matrixU() << std::endl;
+    // std::cout << "V: " << std::endl << svdeig.matrixV() << std::endl;
+
+    std::cout << "U * V.T" << std::endl;
+    std::cout << svdeig.matrixU() * svdeig.matrixV().transpose() << std::endl;
+
+    // rm::SVD2 svdrm;
+    // svdrm.decompose(Arm);
+
+    std::cout << std::endl;
+    std::cout << "Rmagine: " << std::endl;
+    // std::cout  << "U: " << svdrm.u << std::endl;
+    // std::cout << "V: " << svdrm.v << std::endl;
+    // std::cout << "w: " << svdrm.w << std::endl;
+
+    std::cout << "U * V.T" << std::endl;
+    // std::cout << svdrm.u * svdrm.v.transpose() << std::endl;
+
+    // std::cout << svdrm.w << std::endl;
+
+    using AMat = rm::Matrix_<float, 3, 3>;
+    using UMat = typename rm::svd_dims<AMat>::U;
+    using WMat = typename rm::svd_dims<AMat>::W;
+    using VMat = typename rm::svd_dims<AMat>::V;
+
+    UMat Urm = UMat::Zeros();
+    WMat Wrm = WMat::Zeros();
+    VMat Vrm = VMat::Zeros();
+
+    rm::svd(Arm, Urm, Wrm, Vrm);
+
+    std::cout << Urm * Vrm.T() << std::endl;
+}
+
+template<int N, int M>
+void runtimeTest()
+{
+    rm::Matrix_<float, N, M> Arm;
+
+    Eigen::Matrix<float, N, M> Aeig = Eigen::Matrix<float, N, M>::Random(N, M);
+    for(size_t i=0; i<N; i++)
+    {
+        for(size_t j=0; j<M; j++)
+        {
+            Arm(i, j) = Aeig(i, j);
+        }
+    }
+    
+    rm::StopWatch sw;
+    double el_eig, el_rm;
+
+    sw();
+    testSVD(Aeig);
+    el_eig = sw();
+
+    sw();
+    testSVD(Arm);
+    el_rm = sw();
+
+    std::cout << "N = " << N << ", M = " << M << std::endl;
+    std::cout << "- eigen: " << el_eig*1000.0 << "ms" << std::endl;
+    std::cout << "- rmagine: " << el_rm*1000.0 << "ms" << std::endl;
+}
+
+
+template<int N, int M>
+void accuracyTest()
+{
+    std::cout << "Accuracy Test" << std::endl;
+    rm::Matrix_<float, N, M> Arm;
+
+    Eigen::Matrix<float, N, M> Aeig = Eigen::Matrix<float, N, M>::Random(N, M);
+    for(size_t i=0; i<N; i++)
+    {
+        for(size_t j=0; j<M; j++)
+        {
+            Arm(i, j) = Aeig(i, j);
+        }
+    }
+    
+    rm::StopWatch sw;
+    double el_eig, el_rm;
+
+    // std::cout << "A: " << std::endl;
+    // std::cout << Aeig << std::endl;
+
+    Eigen::JacobiSVD<Eigen::Matrix<float, N, M> > svdeig(Aeig, Eigen::ComputeFullU | Eigen::ComputeFullV);
+
+    auto s = svdeig.singularValues();
+
+    Eigen::Matrix<float, N, M> Seig = Eigen::Matrix<float, N, M>::Zero();
+    for(size_t i=0; i<s.rows(); i++)
+    {
+        Seig(i, i) = s(i);
+    }
+    // std::cout << "Seig: " << std::endl;
+    // std::cout << Seig << std::endl;
+
+
+    // std::cout << "Eigen: " << std::endl;
+    auto uvt_eig = svdeig.matrixU() * Seig * svdeig.matrixV().transpose();
+
+    // std::cout << uvt_eig << std::endl;
+
+    float error_eig = 0.0;
+    for(size_t i=0; i<N; i++)
+    {
+        for(size_t j=0; j<M; j++)
+        {
+            error_eig += abs(uvt_eig(i, j) - Aeig(i, j));
+        }
+    }
+
+    std::cout << "- Eigen JacobiSVD error: " << error_eig << std::endl;
+
+    using AMat = rm::Matrix_<float, N, M>;
+    using UMat = typename rm::svd_dims<AMat>::U;
+    using WMat = typename rm::svd_dims<AMat>::W;
+    using VMat = typename rm::svd_dims<AMat>::V;
+
+    UMat Urm = UMat::Zeros();
+    WMat Wrm = WMat::Zeros();
+    VMat Vrm = VMat::Zeros();
+
+    rm::svd(Arm, Urm, Wrm, Vrm);
+
+    auto uvt_rm = Urm * Wrm * Vrm.T();
+
+    // std::cout << "Rmagine: " << std::endl;
+    // std::cout << uvt_rm << std::endl;
+
+    float error_rm = 0.0;
+    for(size_t i=0; i<N; i++)
+    {
+        for(size_t j=0; j<M; j++)
+        {
+            error_rm += abs(uvt_rm(i, j) - Arm(i, j));
+        }
+    }
+
+    std::cout << "- Rmagine error: " << error_eig << std::endl;
+}
+
+
+void parallelTest()
+{
+    std::cout << "parallelTest" << std::endl;
+    // correct num_objects objects in parallel
+    size_t num_objects = 1000000;
+
+    std::vector<Eigen::Matrix3f> covs_eigen(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> covs_rm(num_objects);
+
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        rm::Matrix3x3 Arm;
+        Eigen::Matrix3f Aeig = Eigen::Matrix3f::Random(3, 3);
+        for(size_t i=0; i<3; i++)
+        {
+            for(size_t j=0; j<3; j++)
+            {
+                Arm(i, j) = Aeig(i, j);
+            }
+        }
+
+        covs_eigen[obj_id] = Aeig;
+        covs_rm[obj_id] = Arm;
+    }
+
+    std::cout << "First Mat: " << std::endl;
+    std::cout << covs_rm[0] << std::endl;
+
+
+    // C -> SVD -> UWT* -> U * W * T* -> C
+
+    std::cout << "Start computing SVD of " << num_objects << " 3x3 matrices" << std::endl;
+
+    std::vector<Eigen::Matrix3f> res_eigen(num_objects);
+
+    rm::StopWatch sw;
+    double el_eigen, el_rmagine, el_rmagine2;
+
+    sw();
+    #pragma omp parallel for
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        Eigen::JacobiSVD<Eigen::Matrix3f> svdeig(covs_eigen[obj_id], 
+            Eigen::ComputeFullU | Eigen::ComputeFullV);
+        auto s = svdeig.singularValues();
+        Eigen::Matrix3f Seig = Eigen::Matrix3f::Zero();
+        for(size_t i=0; i<s.rows(); i++)
+        {
+            Seig(i, i) = s(i);
+        }
+        auto uvt_eig = svdeig.matrixU() * Seig * svdeig.matrixV().transpose();
+        res_eigen[obj_id] = uvt_eig;
+    }
+    el_eigen = sw();
+
+    float err_eigen = 0.0;
+    for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
+    {
+        err_eigen += compute_error(covs_eigen[obj_id], res_eigen[obj_id]);
+    }
+
+    std::cout << "Eigen:" << std::endl;
+    std::cout << "- run time: " << el_eigen << " s" << std::endl;
+    std::cout << "- summed error: " << err_eigen << std::endl;
+
+
+    rm::Memory<rm::Matrix3x3> res_rm(num_objects);
+
+    sw();
+    #pragma omp parallel for
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        rm::Matrix3x3 Urm = rm::Matrix3x3::Zeros();
+        rm::Matrix3x3 Wrm = rm::Matrix3x3::Zeros();
+        rm::Matrix3x3 Vrm = rm::Matrix3x3::Zeros();
+        rm::svd(covs_rm[obj_id], Urm, Wrm, Vrm);
+        auto uvt_rm = Urm * Wrm * Vrm.T();
+        res_rm[obj_id] = uvt_rm;
+    }
+    el_rmagine = sw();
+    
+    
+    float err_rmagine = 0.0;
+    for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
+    {
+        err_rmagine += compute_error(covs_rm[obj_id], res_rm[obj_id]);
+    }
+    
+    std::cout << "Rmagine:" << std::endl;
+    std::cout << "- run time: " << el_rmagine << " s" << std::endl;
+    std::cout << "- summed error: " << err_rmagine << std::endl;
+}
+
+void parallelTest2()
+{
+    std::cout << "parallelTest2" << std::endl;
+
+    size_t num_objects = 1000000;
+    std::cout << "parallelTest. Computing SVD of " << num_objects << " 3x3 matrices" << std::endl;
+    // correct num_objects objects in parallel
+    
+
+    std::vector<Eigen::Matrix3f> covs_eigen(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> covs_rm(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Us(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Ws(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Vs(num_objects);
+
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        rm::Matrix3x3 Arm;
+        Eigen::Matrix3f Aeig = Eigen::Matrix3f::Random(3, 3);
+        for(size_t i=0; i<3; i++)
+        {
+            for(size_t j=0; j<3; j++)
+            {
+                Arm(i, j) = Aeig(i, j);
+            }
+        }
+
+        covs_eigen[obj_id] = Aeig;
+        covs_rm[obj_id] = Arm;
+        Us[obj_id] = rm::Matrix3x3::Zeros();
+        Ws[obj_id] = rm::Matrix3x3::Zeros();
+        Vs[obj_id] = rm::Matrix3x3::Zeros();
+    }
+
+    std::cout << "First Mat: " << std::endl;
+    std::cout << covs_rm[0] << std::endl;
+
+
+    rm::StopWatch sw;
+    double el_rmagine;
+
+
+    sw();
+    svd(covs_rm, Us, Ws, Vs);
+    el_rmagine = sw();
+    
+
+    rm::Memory<rm::Matrix3x3> res_rm(num_objects);
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        auto uvt_rm = Us[obj_id] * Ws[obj_id] * Vs[obj_id].T();
+        res_rm[obj_id] = uvt_rm;
+    }
+    
+    float err_rmagine = 0.0;
+    for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
+    {
+        err_rmagine += compute_error(covs_rm[obj_id], res_rm[obj_id]);
+    }
+    
+    std::cout << "Rmagine:" << std::endl;
+    std::cout << "- run time: " << el_rmagine << " s" << std::endl;
+    std::cout << "- summed error: " << err_rmagine << std::endl;
+}
+
+int main(int argc, char** argv)
+{
+    srand((unsigned int) time(0));
+
+    // svdTestWithPrints();
+    runtimeTest<20, 30>();
+    accuracyTest<20, 30>();
+
+
+    parallelTest();
+    parallelTest2();
+
+
+    return 0;
+}
+
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index f1bb3c4f..2998f89f 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -26,3 +26,11 @@ target_link_libraries(rmagine_tests_cuda_memory_slicing
 
 add_test(NAME cuda_memory_slicing COMMAND rmagine_tests_cuda_memory_slicing)
 
+
+# 4. SVD
+add_executable(rmagine_tests_cuda_math_svd math_svd.cpp)
+target_link_libraries(rmagine_tests_cuda_math_svd
+    rmagine::cuda
+)
+
+add_test(NAME cuda_math_svd COMMAND rmagine_tests_cuda_math_svd)
diff --git a/tests/cuda/math_svd.cpp b/tests/cuda/math_svd.cpp
new file mode 100644
index 00000000..7cc7289b
--- /dev/null
+++ b/tests/cuda/math_svd.cpp
@@ -0,0 +1,118 @@
+#include <rmagine/math/linalg.h>
+#include <rmagine/util/prints.h>
+#include <rmagine/math/math_batched.cuh>
+#include <rmagine/util/StopWatch.hpp>
+
+#include <Eigen/Dense>
+
+#include <cuda_runtime.h>
+
+namespace rm = rmagine;
+
+float compute_error(Eigen::Matrix3f gt, Eigen::Matrix3f m)
+{
+    float ret = 0.0;
+    for(size_t i=0; i<3; i++)
+    {
+        for(size_t j=0; j<3; j++)
+        {
+            ret += abs(gt(i, j) - m(i, j));
+        }
+    }
+    return ret;
+}
+
+float compute_error(rm::Matrix3x3 gt, rm::Matrix3x3 m)
+{
+    float ret = 0.0;
+    for(size_t i=0; i<3; i++)
+    {
+        for(size_t j=0; j<3; j++)
+        {
+            ret += abs(gt(i, j) - m(i, j));
+        }
+    }
+    return ret;
+}
+
+void parallelTest()
+{
+    size_t num_objects = 1000000;
+    std::cout << "parallelTest. Computing SVD of " << num_objects << " 3x3 matrices" << std::endl;
+    // correct num_objects objects in parallel
+    
+
+    std::vector<Eigen::Matrix3f> covs_eigen(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> covs_rm(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Us(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Ws(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Vs(num_objects);
+
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        rm::Matrix3x3 Arm;
+        Eigen::Matrix3f Aeig = Eigen::Matrix3f::Random(3, 3);
+        for(size_t i=0; i<3; i++)
+        {
+            for(size_t j=0; j<3; j++)
+            {
+                Arm(i, j) = Aeig(i, j);
+            }
+        }
+
+        covs_eigen[obj_id] = Aeig;
+        covs_rm[obj_id] = Arm;
+        Us[obj_id] = rm::Matrix3x3::Zeros();
+        Ws[obj_id] = rm::Matrix3x3::Zeros();
+        Vs[obj_id] = rm::Matrix3x3::Zeros();
+    }
+
+    std::cout << "First Mat: " << std::endl;
+    std::cout << covs_rm[0] << std::endl;
+
+    rm::StopWatch sw;
+    double el_rmagine;
+
+
+    // upload
+    rm::Memory<rm::Matrix3x3, rm::VRAM_CUDA> covs_rm_ = covs_rm;
+    rm::Memory<rm::Matrix3x3, rm::VRAM_CUDA> Us_ = Us;
+    rm::Memory<rm::Matrix3x3, rm::VRAM_CUDA> Ws_ = Ws;
+    rm::Memory<rm::Matrix3x3, rm::VRAM_CUDA> Vs_ = Vs;
+
+    sw();
+    svd(covs_rm_, Us_, Ws_, Vs_);
+    cudaDeviceSynchronize();
+    el_rmagine = sw();
+    
+    // download
+    Us = Us_;
+    Ws = Ws_;
+    Vs = Vs_;
+
+    rm::Memory<rm::Matrix3x3> res_rm(num_objects);
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        auto uvt_rm = Us[obj_id] * Ws[obj_id] * Vs[obj_id].T();
+        res_rm[obj_id] = uvt_rm;
+    }
+    
+    float err_rmagine = 0.0;
+    for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
+    {
+        err_rmagine += compute_error(covs_rm[obj_id], res_rm[obj_id]);
+    }
+    
+    std::cout << "Rmagine:" << std::endl;
+    std::cout << "- run time: " << el_rmagine << " s" << std::endl;
+    std::cout << "- summed error: " << err_rmagine << std::endl;
+}
+
+int main(int argc, char** argv)
+{
+    srand((unsigned int) time(0));
+
+    parallelTest();
+
+    return 0;
+}
\ No newline at end of file