diff --git a/RandBLAS/trig_skops.hh b/RandBLAS/trig_skops.hh
new file mode 100644
index 00000000..804fe89e
--- /dev/null
+++ b/RandBLAS/trig_skops.hh
@@ -0,0 +1,440 @@
+#include "RandBLAS/base.hh"
+#include "RandBLAS/exceptions.hh"
+#include "RandBLAS/random_gen.hh"
+#include <RandBLAS/sparse_skops.hh>
+
+#include <Random123/philox.h>
+#include <blas.hh>
+
+#include <math.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <typeinfo>
+#include <vector>
+
+// Function-like macros yielding the larger (MAX) or smaller (MIN) of two expressions.
+// Each argument can be evaluated twice, so operands must be free of side effects.
+#define MAX(a, b) (((a) < (b)) ? (b) : (a))
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+namespace RandBLAS {
+
+    // =============================================================================
+    /// WARNING: None of the following functions or overloads thereof are part of the
+    /// public API
+    ///
+
+    /// Fills `buff[0..n)` with Rademacher entries (each -1 or +1) produced by a
+    /// counter-based Random123 generator.
+    ///
+    /// @param buff      Output buffer with room for at least `n` entries.
+    /// @param key_seed  Value stored in the first word of the generator key.
+    /// @param ctr_seed  First counter word used for entry 0; entry `i` uses
+    ///                  `ctr_seed + i` in that word.
+    /// @param n         Number of entries to generate.
+    template<SignedInteger sint_t, typename RNG = r123::Philox4x32>
+    void generate_rademacher_vector_r123(sint_t* buff, uint32_t key_seed, uint32_t ctr_seed, int64_t n) {
+        RNG rng;
+
+        // Zero every counter word up front. Only word 0 is advanced below; leaving the
+        // other words default-initialised would feed indeterminate values to the
+        // generator and make the output irreproducible.
+        typename RNG::ctr_type c = {{}};
+        typename RNG::key_type key = {{key_seed}};
+
+        for (int64_t i = 0; i < n; ++i) {
+            // A distinct counter value per entry gives a distinct random block.
+            c[0] = ctr_seed + i;
+
+            const typename RNG::ctr_type r = rng(c, key);
+
+            // Only the first word of the generated block is consumed; it is mapped
+            // to a float and thresholded at one half to pick the sign.
+            const float rand_value = r123::u01fixedpt<float>(r.v[0]);
+            buff[i] = rand_value < 0.5 ? -1 : 1;
+        }
+    }
+
+    /// Fills `buff[0..n)` with Rademacher entries (-1 or +1), starting from the
+    /// counter/key pair held in `seed_state` and advancing the counter once per entry.
+    ///
+    /// @return An RNGState holding the advanced counter together with the same key.
+    template<SignedInteger sint_t, typename RNG = r123::Philox4x32>
+    RandBLAS::RNGState<RNG> generate_rademacher_vector_r123(sint_t* buff, int64_t n, RandBLAS::RNGState<RNG> seed_state) {
+        RNG generator;
+        auto [counter, rng_key] = seed_state;
+
+        int64_t idx = 0;
+        while (idx < n) {
+            // One generator call per entry; only the first output word is used.
+            typename RNG::ctr_type draw = generator(counter, rng_key);
+            float unit_sample = r123::u01fixedpt<float>(draw.v[0]);
+
+            if (unit_sample < 0.5)
+                buff[idx] = -1;
+            else
+                buff[idx] = 1;
+
+            counter.incr();
+            ++idx;
+        }
+
+        // Hand the advanced counter back so the caller can continue the stream.
+        return RandBLAS::RNGState<RNG> {counter, rng_key};
+    }
+
+    /// Applies the sign vector `diag` in place to the `rows x cols` matrix `A`.
+    ///
+    /// With `left == true` entry `diag[j]` multiplies column `j` (so `diag` must hold
+    /// `cols` entries); otherwise `diag[i]` multiplies row `i` (`rows` entries).
+    /// Entries that are strictly positive are skipped, since scaling by +1 is a no-op.
+    // NOTE(review): scaling columns corresponds to A*D rather than D*A; confirm that
+    // this is the intended meaning of `left`.
+    template<typename T, SignedInteger sint_t>
+    void apply_diagonal_rademacher(
+                                bool left,
+                                blas::Layout layout,
+                                int64_t rows,
+                                int64_t cols,
+                                T* A,
+                                sint_t* diag
+                                ) {
+        //TODO: Investigate better schemes for performing the scaling
+        //TODO: Move to `RandBLAS/util.hh`
+        const bool col_major = (layout == blas::Layout::ColMajor);
+
+        // Number of vectors to visit (one per `diag` entry) and the length of each.
+        const int64_t num_vecs = left ? cols : rows;
+        const int64_t vec_len  = left ? rows : cols;
+
+        // Columns of a ColMajor matrix and rows of a RowMajor matrix are contiguous;
+        // the other two combinations are strided by the number of vectors.
+        const bool contiguous    = (left == col_major);
+        const int64_t first_step = contiguous ? vec_len : 1;
+        const int64_t stride     = contiguous ? 1 : num_vecs;
+
+        for (int64_t idx = 0; idx < num_vecs; ++idx) {
+            if (!(diag[idx] > 0))
+                blas::scal(vec_len, diag[idx], &A[idx * first_step], stride);
+        }
+    }
+
+    /// Swaps, in list order, every row named in `selected_rows` with row `top`
+    /// (row 0) of the `rows x cols` matrix `A`. The swaps are done in place.
+    ///
+    /// An index equal to `top` is a no-op, matching `permuteColsToLeft`. The earlier
+    /// version raised a "unique indices" error in that case, which rejected any
+    /// selection containing row 0 even though no index was duplicated.
+    ///
+    /// @param layout         Storage order of `A`.
+    /// @param rows, cols     Dimensions of `A`.
+    /// @param selected_rows  Row indices to swap with row `top`; `d` entries.
+    /// @param d              Number of entries in `selected_rows`.
+    /// @param A              Matrix data, modified in place.
+    template<typename T, SignedInteger sint_t>
+    void permuteRowsToTop(
+                          blas::Layout layout,
+                          int64_t rows,
+                          int64_t cols,
+                          sint_t* selected_rows,
+                          int64_t d, // size of `selected_rows`
+                          T* A
+                          ) {
+        // `top` is never advanced, so every swap targets row 0.
+        //TODO: discuss precise semantics of `selected_rows` in this function
+        const int64_t top = 0;
+
+        // In ColMajor storage consecutive entries of a row are `rows` apart and row r
+        // starts at offset r; in RowMajor storage a row is contiguous and starts at r * cols.
+        const bool col_major   = (layout == blas::Layout::ColMajor);
+        const int64_t elem_inc = col_major ? rows : 1;
+        const int64_t row_step = col_major ? 1 : cols;
+
+        for (int64_t i = 0; i < d; i++) {
+            const int64_t src = selected_rows[i];
+            if (src == top)
+                continue;  // the row is already in place
+            blas::swap(cols, &A[top * row_step], elem_inc, &A[src * row_step], elem_inc);
+        }
+    }
+
+    /// Swaps, in list order, every column named in `selected_cols` with column `left`
+    /// (column 0) of the `rows x cols` matrix `A`, in place. An index equal to `left`
+    /// is skipped.
+    template<typename T, SignedInteger sint_t>
+    void permuteColsToLeft(
+                          blas::Layout layout,
+                          int64_t rows,
+                          int64_t cols,
+                          sint_t* selected_cols,
+                          int64_t d, // size of `selectedRows`
+                          T* A
+                          ) {
+        // Destination column; it is never advanced, so every swap targets column 0.
+        int64_t left = 0;
+        const bool col_major = (layout == blas::Layout::ColMajor);
+
+        int64_t pos = 0;
+        while (pos < d) {
+            const sint_t chosen = selected_cols[pos];
+            ++pos;
+            if (chosen == left)
+                continue;
+
+            if (col_major) {
+                // Columns are contiguous: exchange two unit-stride runs of length `rows`.
+                blas::swap(rows, &A[rows * chosen], 1, &A[rows * left], 1);
+            } else {
+                // RowMajor: consecutive entries of a column are `cols` apart.
+                blas::swap(rows, &A[chosen], cols, &A[left], cols);
+            }
+        }
+    }
+
+    /// Applies the unnormalised fast Hadamard transform of order `2^log_n` to every
+    /// column of a column-major `num_rows x num_cols` matrix, in place.
+    ///
+    /// If `2^log_n` exceeds `num_rows`, each column is treated as zero-padded and only
+    /// its first `num_rows` outputs are written back. If `2^log_n` is smaller than
+    /// `num_rows`, only the first `2^log_n` rows are transformed.
+    ///
+    /// The padded case runs on a scratch vector. Doing it in place would discard the
+    /// intermediate values that belong in the padded slots, and later butterfly stages
+    /// need them.
+    template <typename T>
+    void fht_left_col_major(T *buf, int64_t log_n, int64_t num_rows, int64_t num_cols) {
+        if (log_n < 0 || num_rows <= 0 || num_cols <= 0)
+            return;
+
+        // 64-bit shift: `1 << log_n` would be evaluated in `int`.
+        int64_t n = int64_t{1} << log_n;
+        // The leading block of a Sylvester-Hadamard matrix is the smaller Hadamard
+        // matrix, so orders beyond the next power of two >= num_rows add nothing.
+        while (n > 1 && (n >> 1) >= num_rows)
+            n >>= 1;
+
+        const int64_t live = std::min(n, num_rows);
+        std::vector<T> work(n);
+
+        for (int64_t col = 0; col < num_cols; ++col) {
+            T* col_buf = buf + col * num_rows;
+
+            // Load the column and zero the padded tail.
+            std::copy(col_buf, col_buf + live, work.begin());
+            std::fill(work.begin() + live, work.end(), T(0));
+
+            // Standard butterfly stages with half-widths 1, 2, 4, ...
+            for (int64_t half = 1; half < n; half <<= 1) {
+                for (int64_t j = 0; j < n; j += 2 * half) {
+                    for (int64_t k = j; k < j + half; ++k) {
+                        const T u = work[k];
+                        const T v = work[k + half];
+                        work[k] = u + v;
+                        work[k + half] = u - v;
+                    }
+                }
+            }
+
+            std::copy(work.begin(), work.begin() + live, col_buf);
+        }
+    }
+
+    /// Applies the unnormalised fast Hadamard transform of order `2^log_n` to every
+    /// column of a row-major `num_rows x num_cols` matrix, in place.
+    ///
+    /// If `2^log_n` exceeds `num_rows`, each column is treated as zero-padded and only
+    /// its first `num_rows` outputs are written back. If `2^log_n` is smaller than
+    /// `num_rows`, only the first `2^log_n` rows are transformed.
+    ///
+    /// Each column is gathered into a scratch vector so the padded slots can hold the
+    /// intermediate values that later butterfly stages need.
+    template <typename T>
+    void fht_left_row_major(T *buf, int64_t log_n, int64_t num_rows, int64_t num_cols) {
+        if (log_n < 0 || num_rows <= 0 || num_cols <= 0)
+            return;
+
+        // 64-bit shift: `1 << log_n` would be evaluated in `int`.
+        int64_t n = int64_t{1} << log_n;
+        // Orders beyond the next power of two >= num_rows do not change the first
+        // num_rows outputs of a zero-padded input.
+        while (n > 1 && (n >> 1) >= num_rows)
+            n >>= 1;
+
+        const int64_t live = std::min(n, num_rows);
+        std::vector<T> work(n);
+
+        for (int64_t col = 0; col < num_cols; ++col) {
+            // Gather the strided column (row r lives at r * num_cols + col).
+            for (int64_t r = 0; r < live; ++r)
+                work[r] = buf[r * num_cols + col];
+            std::fill(work.begin() + live, work.end(), T(0));
+
+            for (int64_t half = 1; half < n; half <<= 1) {
+                for (int64_t j = 0; j < n; j += 2 * half) {
+                    for (int64_t k = j; k < j + half; ++k) {
+                        const T u = work[k];
+                        const T v = work[k + half];
+                        work[k] = u + v;
+                        work[k + half] = u - v;
+                    }
+                }
+            }
+
+            // Scatter the surviving outputs back.
+            for (int64_t r = 0; r < live; ++r)
+                buf[r * num_cols + col] = work[r];
+        }
+    }
+
+    /// Applies the unnormalised fast Hadamard transform of order `2^log_n` to every
+    /// row of a row-major `num_rows x num_cols` matrix, in place.
+    ///
+    /// If `2^log_n` exceeds `num_cols`, each row is treated as zero-padded and only
+    /// its first `num_cols` outputs are written back. If `2^log_n` is smaller than
+    /// `num_cols`, only the first `2^log_n` columns are transformed.
+    ///
+    /// The padded case runs on a scratch vector so that values destined for padded
+    /// slots stay available to later butterfly stages.
+    template <typename T>
+    void fht_right_row_major(T *buf, int64_t log_n, int64_t num_rows, int64_t num_cols) {
+        if (log_n < 0 || num_rows <= 0 || num_cols <= 0)
+            return;
+
+        // 64-bit shift: `1 << log_n` would be evaluated in `int`.
+        int64_t n = int64_t{1} << log_n;
+        // Orders beyond the next power of two >= num_cols do not change the first
+        // num_cols outputs of a zero-padded input.
+        while (n > 1 && (n >> 1) >= num_cols)
+            n >>= 1;
+
+        const int64_t live = std::min(n, num_cols);
+        std::vector<T> work(n);
+
+        for (int64_t row = 0; row < num_rows; ++row) {
+            T* row_buf = buf + row * num_cols;
+
+            // Load the row and zero the padded tail.
+            std::copy(row_buf, row_buf + live, work.begin());
+            std::fill(work.begin() + live, work.end(), T(0));
+
+            for (int64_t half = 1; half < n; half <<= 1) {
+                for (int64_t j = 0; j < n; j += 2 * half) {
+                    for (int64_t k = j; k < j + half; ++k) {
+                        const T u = work[k];
+                        const T v = work[k + half];
+                        work[k] = u + v;
+                        work[k + half] = u - v;
+                    }
+                }
+            }
+
+            std::copy(work.begin(), work.begin() + live, row_buf);
+        }
+    }
+
+    /// Applies the unnormalised fast Hadamard transform of order `2^log_n` to every
+    /// row of a column-major `num_rows x num_cols` matrix, in place.
+    ///
+    /// If `2^log_n` exceeds `num_cols`, each row is treated as zero-padded and only
+    /// its first `num_cols` outputs are written back. If `2^log_n` is smaller than
+    /// `num_cols`, only the first `2^log_n` columns are transformed.
+    ///
+    /// Each row is gathered into a scratch vector so the padded slots can hold the
+    /// intermediate values that later butterfly stages need.
+    template <typename T>
+    void fht_right_col_major(T *buf, int64_t log_n, int64_t num_rows, int64_t num_cols) {
+        if (log_n < 0 || num_rows <= 0 || num_cols <= 0)
+            return;
+
+        // 64-bit shift: `1 << log_n` would be evaluated in `int`.
+        int64_t n = int64_t{1} << log_n;
+        // Orders beyond the next power of two >= num_cols do not change the first
+        // num_cols outputs of a zero-padded input.
+        while (n > 1 && (n >> 1) >= num_cols)
+            n >>= 1;
+
+        const int64_t live = std::min(n, num_cols);
+        std::vector<T> work(n);
+
+        for (int64_t row = 0; row < num_rows; ++row) {
+            // Gather the strided row (column c lives at c * num_rows + row).
+            for (int64_t c = 0; c < live; ++c)
+                work[c] = buf[c * num_rows + row];
+            std::fill(work.begin() + live, work.end(), T(0));
+
+            for (int64_t half = 1; half < n; half <<= 1) {
+                for (int64_t j = 0; j < n; j += 2 * half) {
+                    for (int64_t k = j; k < j + half; ++k) {
+                        const T u = work[k];
+                        const T v = work[k + half];
+                        work[k] = u + v;
+                        work[k + half] = u - v;
+                    }
+                }
+            }
+
+            // Scatter the surviving outputs back.
+            for (int64_t c = 0; c < live; ++c)
+                buf[c * num_rows + row] = work[c];
+        }
+    }
+
+    /// Routes to the Hadamard-transform kernel that matches the side (`left`) and the
+    /// storage order (`layout`); all remaining arguments are forwarded unchanged.
+    template <typename T>
+    void fht_dispatch(
+        bool left,
+        blas::Layout layout,
+        T* buff,
+        int64_t log_n,
+        int64_t num_rows,
+        int64_t num_cols
+        )
+    {
+        if (layout == blas::Layout::ColMajor) {
+            if (left) {
+                fht_left_col_major(buff, log_n, num_rows, num_cols);
+            } else {
+                fht_right_col_major(buff, log_n, num_rows, num_cols);
+            }
+            return;
+        }
+
+        if (left && layout == blas::Layout::RowMajor) {
+            fht_left_row_major(buff, log_n, num_rows, num_cols);
+            return;
+        }
+
+        // Everything else falls through to the row-major right-hand kernel.
+        fht_right_row_major(buff, log_n, num_rows, num_cols);
+    }
+}
+
+
+namespace RandBLAS::trig {
+/*
+ * These functions apply an in-place, SRHT-like transform to the input matrix
+ * i.e. A <- (\Pi H D)A OR A <- A(D H \Pi) (which is equivalent to A <- A(\Pi H D)^{-1})
+ * layout: Layout of the input matrix (`ColMajor/RowMajor`)
+ * A: (m x n), input dimensions of `A`
+ * d: The number of rows/columns that will be permuted by the action of $\Pi$
+ */
+/// Applies the in-place, SRHT-like transform to the `m x n` matrix `A`: a Rademacher
+/// sign scaling, a fast Hadamard transform along the rows dimension, and finally
+/// `d` row swaps whose indices are drawn by Fisher-Yates sampling.
+///
+/// @param layout        Storage order of `A`.
+/// @param random_state  RNG state; the sign vector is drawn first and the row
+///                      selection continues from the state that draw returns.
+/// @param m, n          Dimensions of `A`.
+/// @param d             Number of rows selected by the permutation step (0 <= d <= m).
+/// @param A             Matrix data, modified in place.
+template <typename T, typename RNG = r123::Philox4x32, SignedInteger sint_t = int64_t>
+inline void lmiget(
+    blas::Layout layout,
+    RandBLAS::RNGState<RNG> random_state,
+    int64_t m, // `A` is `(m x n)`
+    int64_t n,
+    int64_t d, // `d` is the number of rows that have to be permuted by `\Pi`
+    T* A // data-matrix
+) {
+    randblas_require(d >= 0 && d <= m);
+
+    //TODO: Change `diag` to float/doubles (same data type as the matrix)
+    // Owning containers: the previous `new[]` buffers were released with `free()`,
+    // which is undefined behaviour, and leaked if anything below threw.
+    // NOTE(review): with `left == true` the scaling helper reads one sign per column
+    // of `A`, hence `n` entries -- confirm this is the intended action of `D`.
+    std::vector<sint_t> diag(n);
+    std::vector<sint_t> selected_rows(d);
+
+    //Step 1: Scale with `D`
+    // The state-based overload uses the caller's RNG type and whole counter, and
+    // returns the advanced state so the permutation below does not reuse the
+    // random stream that produced the signs.
+    RandBLAS::RNGState<RNG> perm_state =
+        RandBLAS::generate_rademacher_vector_r123(diag.data(), n, random_state);
+    apply_diagonal_rademacher(true, layout, m, n, A, diag.data());
+
+    //Step 2: Apply the Hadamard transform
+    // Smallest order 2^log_m covering all `m` rows. Truncating `log2(MAX(m, n))`
+    // could leave trailing rows out of the transform.
+    int64_t log_m = 0;
+    while ((int64_t{1} << log_m) < m)
+        ++log_m;
+    fht_dispatch(true, layout, A, log_m, m, n);
+
+    //Step 3: Permute the rows
+    std::vector<sint_t> idxs_minor(d); // Placeholder
+    std::vector<T> vals(d); // Placeholder
+
+    // Populating `selected_rows`
+        //TODO: Do I return this at some point?
+    [[maybe_unused]] RandBLAS::RNGState<RNG> next_state = RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
+        perm_state,
+        d,         // Number of samples (vec_nnz)
+        m,         // Total number of elements (dim_major)
+        1,         // Single sample round (dim_minor)
+        selected_rows.data(),  // Holds the required output
+        idxs_minor.data(),  // Placeholder
+        vals.data()         // Placeholder
+    );
+
+    permuteRowsToTop(layout, m, n, selected_rows.data(), d, A);
+}
+
+
+/// Right-hand counterpart of `lmiget`: applies a Rademacher sign scaling, a fast
+/// Hadamard transform along the columns dimension, and finally `d` column swaps
+/// whose indices are drawn by Fisher-Yates sampling, all in place on `A`.
+///
+/// @param layout        Storage order of `A`.
+/// @param random_state  RNG state; the sign vector is drawn first and the column
+///                      selection continues from the state that draw returns.
+/// @param m, n          Dimensions of `A`.
+/// @param d             Number of columns selected by the permutation step (0 <= d <= n).
+/// @param A             Matrix data, modified in place.
+template <typename T, typename RNG = r123::Philox4x32, SignedInteger sint_t = int64_t>
+inline void rmiget(
+    blas::Layout layout,
+    RandBLAS::RNGState<RNG> random_state,
+    int64_t m, // `A` is `(m x n)`
+    int64_t n,
+    int64_t d, // `d` is the number of cols that have to be permuted by `\Pi`
+    T* A // data-matrix
+)
+{
+    randblas_require(d >= 0 && d <= n);
+
+    //TODO: Change `diag` to float/doubles (same data type as the matrix)
+    // Owning containers: the previous `new[]` buffers were released with `free()`,
+    // which is undefined behaviour, and leaked if anything below threw.
+    // NOTE(review): with `left == false` the scaling helper reads one sign per row
+    // of `A`, hence `m` entries -- confirm this is the intended action of `D`.
+    std::vector<sint_t> diag(m);
+    std::vector<sint_t> selected_cols(d);
+
+    //Step 1: Scale with `D`
+    // Exactly `m` signs are generated, matching what the helper reads. Generating `n`
+    // overran the buffer when n > m and left entries unset when n < m.
+    RandBLAS::RNGState<RNG> perm_state =
+        RandBLAS::generate_rademacher_vector_r123(diag.data(), m, random_state);
+    apply_diagonal_rademacher(false, layout, m, n, A, diag.data());
+
+    //Step 2: Apply the Hadamard transform
+    // Smallest order 2^log_cols covering all `n` columns.
+    int64_t log_cols = 0;
+    while ((int64_t{1} << log_cols) < n)
+        ++log_cols;
+    fht_dispatch(false, layout, A, log_cols, m, n);
+
+    //Step 3: Permute the columns
+    std::vector<sint_t> idxs_minor(d); // Placeholder
+    std::vector<T> vals(d); // Placeholder
+
+    // Populating `selected_cols`
+        //TODO: Do I return this at some point?
+    [[maybe_unused]] RandBLAS::RNGState<RNG> next_state = RandBLAS::repeated_fisher_yates<T, RNG, sint_t>(
+        perm_state,
+        d,         // Number of samples (vec_nnz)
+        n,         // Columns are sampled, so the population is the `n` columns (dim_major)
+        1,         // Single sample round (dim_minor)
+        selected_cols.data(),  // Holds the required output
+        idxs_minor.data(),  // Placeholder
+        vals.data()         // Placeholder
+    );
+
+    permuteColsToLeft(layout, m, n, selected_cols.data(), d, A);
+}
+}
diff --git a/test/test_matmul_cores/test_trig.cc b/test/test_matmul_cores/test_trig.cc
new file mode 100644
index 00000000..3283764a
--- /dev/null
+++ b/test/test_matmul_cores/test_trig.cc
@@ -0,0 +1,556 @@
+// Copyright, 2024. See LICENSE for copyright holder information.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// (1) Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// (2) Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// (3) Neither the name of the copyright holder nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "RandBLAS.hh"
+#include "RandBLAS/base.hh"
+#include "RandBLAS/trig_skops.hh"
+#include <blas.hh>
+
+#include <Eigen/Core>
+#include <Eigen/Dense>
+#include <cmath>
+#include <random>
+#include <gtest/gtest.h>
+
+using RandBLAS::trig::lmiget;
+using RandBLAS::trig::rmiget;
+using RandBLAS::generate_rademacher_vector_r123;
+using RandBLAS::apply_diagonal_rademacher;
+using RandBLAS::permuteRowsToTop;
+using RandBLAS::permuteColsToLeft;
+using RandBLAS::fht_dispatch;
+using Eigen::MatrixXd;
+
+
+class TestLMIGET : public::testing::Test
+{
+    protected:
+    virtual void SetUp(){};
+
+    virtual void TearDown(){};
+
+    inline static std::vector<uint32_t> keys {0, 42};
+
+    // Builds the unnormalised Sylvester-Hadamard matrix of order 2^log_n and returns
+    // it flattened in ColMajor order (the matrix is symmetric, so RowMajor storage
+    // is identical).
+    static std::vector<double> generate_hadamard(int64_t log_n) {
+        const int64_t size = int64_t{1} << log_n;  // size = 2^log_n
+
+        // H_1 = [1]; the doubling step below overwrites every other entry.
+        std::vector<double> H_flat(size * size, 1.0);
+        auto at = [&H_flat, size](int64_t row, int64_t col) -> double& {
+            return H_flat[col * size + row];
+        };
+
+        // Sylvester's construction: H_{2k} = [[H_k, H_k], [H_k, -H_k]].
+        // Integer block sizes keep all loop bounds and indices exact.
+        for (int64_t half = 1; half < size; half *= 2) {
+            for (int64_t i = 0; i < half; ++i) {
+                for (int64_t j = 0; j < half; ++j) {
+                    const double top_left = at(i, j);
+                    at(i + half, j) = top_left;          // bottom-left quadrant
+                    at(i, j + half) = top_left;          // top-right quadrant
+                    at(i + half, j + half) = -top_left;  // bottom-right quadrant
+                }
+            }
+        }
+
+        return H_flat;
+    }
+
+    // Returns `size` doubles drawn from a uniform real distribution constructed with
+    // `lower_bound` and `upper_bound`. The engine is seeded from std::random_device,
+    // so successive calls give different data.
+    static std::vector<double> generate_random_vector(int size, double lower_bound, double upper_bound) {
+        std::random_device entropy;
+        std::mt19937 engine(entropy());
+        std::uniform_real_distribution<> uniform(lower_bound, upper_bound);
+
+        std::vector<double> samples(size);
+        for (double& value : samples)
+            value = uniform(engine);
+
+        return samples;
+    }
+
+    enum class transforms {diag_scale, hadamard, permute};
+
+    // Verifies one building block of the transform on a random `m x n` matrix by
+    // comparing the in-place result with an explicit reference:
+    //   * permute    -- swap of the first and last row/column vs Eigen::PermutationMatrix
+    //   * hadamard   -- fht_dispatch vs multiplication by an explicit Hadamard matrix
+    //   * diag_scale -- scaling by an all -1 diagonal vs negation
+    // A failed comparison is reported through randblas_require.
+    // `seed` is currently unused: the input data comes from generate_random_vector.
+    template <typename T, RandBLAS::SignedInteger sint_t = int64_t>
+    static void correctness(
+        uint32_t seed,
+        transforms transform,
+        int64_t m, // Generated data matrix, `A` is of size `(m x n)`
+        int64_t n,
+        bool left,
+        blas::Layout layout,
+        double epsilon=1e-5
+    ) {
+        using ColMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
+        using RowMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+        const bool row_major = (layout == blas::Layout::RowMajor);
+
+        // Grabbing a random matrix. `A_col` owns a copy, `A_row` views `A_vec` directly.
+        std::vector<double> A_vec = generate_random_vector(m * n, 0.0, 10.0);
+        ColMat A_col = Eigen::Map<ColMat>(A_vec.data(), m, n);
+        Eigen::Map<RowMat> A_row(A_vec.data(), m, n);
+
+        // Deep copy of the untouched input, used to build the reference result
+        MatrixXd B;
+        if(row_major)
+            B = A_row;
+        else
+            B = A_col;
+
+        // Buffer of whichever matrix matches `layout`; the kernels work on it in place.
+        T* data = row_major ? A_row.data() : A_col.data();
+
+        // Frobenius distance between the transformed matrix and `expected`.
+        auto residual = [&](const MatrixXd& expected) -> double {
+            if(row_major)
+                return (A_row - expected).norm();
+            return (A_col - expected).norm();
+        };
+
+        switch (transform) {
+        case transforms::permute: {
+            // Simply compares against Eigen::PermutationMatrix
+            const int count = static_cast<int>(left ? m : n);
+            Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic> perm(count);
+
+            // Identity permutation with the first and last entries exchanged.
+            std::vector<int> V(count);
+            for(int i = 0; i < count; i++)
+                V[i] = i;
+            if(count > 1) {
+                V[0] = count - 1;
+                V[count - 1] = 0;
+            }
+
+            // Set the indices in the permutation matrix
+            perm.indices() = Eigen::Map<Eigen::VectorXi>(V.data(), V.size());
+
+            // Single selected index: the last row/column. Kept on the stack; the
+            // previous heap allocation was never released.
+            sint_t last = count - 1;
+
+            if(left)
+                RandBLAS::permuteRowsToTop(layout, m, n, &last, 1, data);
+            else
+                RandBLAS::permuteColsToLeft(layout, m, n, &last, 1, data);
+
+            MatrixXd expected;
+            if(left)
+                expected = perm * B;
+            else
+                expected = B * perm;
+
+            randblas_require(residual(expected) < epsilon);
+
+            break;
+        }
+        case transforms::hadamard: {
+            // Here, simply check against explicit application of the Hadamard matrix
+            int ld = (left) ? m : n;
+
+            // `generate_hadamard` does not pad, so the transformed dimension has to
+            // be a power of two for the explicit product to be well-formed.
+            randblas_require(ld > 0 && (ld & (ld - 1)) == 0);
+
+            const int64_t log_ld = static_cast<int64_t>(std::log2(ld));
+            RandBLAS::fht_dispatch(left, layout, data, log_ld, m, n);
+
+            std::vector<double> H_vec = generate_hadamard(log_ld);
+            MatrixXd H = Eigen::Map<MatrixXd>(H_vec.data(), ld, ld);
+
+            MatrixXd expected;
+            if(left)
+                expected = H * B;
+            else
+                expected = B * H;
+
+            randblas_require(residual(expected) < epsilon);
+
+            break;
+        }
+        case transforms::diag_scale: {
+            // Scales all rows/cols by -1 and checks if A == -A
+            std::vector<sint_t> buff = left ? std::vector<sint_t>(n, -1) : std::vector<sint_t>(m, -1);
+
+            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, data, buff.data());
+
+            double norm_diag = 0.0;
+            if(row_major)
+                norm_diag = (A_row + B).norm();
+            else
+                norm_diag = (A_col + B).norm();
+
+            randblas_require(norm_diag < epsilon);
+
+            break;
+        }
+        }
+    }
+
+    // Round-trip check on a random `m x n` matrix: applies scaling, Hadamard transform
+    // and permutation (\Pi H D), then undoes them in reverse order (D H \Pi, with the
+    // Hadamard step rescaled by 1 / 2^log), and requires the result to match the
+    // original input within `epsilon`.
+    // The diagonal is all -1 and the permutation uses the indices 1..d.
+    template <typename T, RandBLAS::SignedInteger sint_t = int64_t>
+    static void inverse_transform(
+        uint32_t seed,
+        int64_t m, // Generated data matrix, `A` is of size `(m x n)`
+        int64_t n,
+        int64_t d, // #rows/cols that will be permuted
+        bool left,
+        blas::Layout layout,
+        double epsilon=1e-5
+    ) {
+        const bool row_major = (layout == blas::Layout::RowMajor);
+
+        // Grabbing a random matrix
+        std::vector<double> A_vec = generate_random_vector(m * n, 0.0, 10.0);
+        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> A_col = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(A_vec.data(), m, n);
+        Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> A_row(A_vec.data(), m, n);
+
+        // Deep copy of the input for the final comparison
+        MatrixXd B;
+        if(row_major) {
+            B = A_row;
+        } else {
+            B = A_col;
+        }
+
+        // Buffer of whichever matrix matches `layout` (looked up again on every use).
+        auto data = [&]() -> T* {
+            return row_major ? A_row.data() : A_col.data();
+        };
+
+        // The three kernels, each bound to this test's matrix.
+        std::vector<sint_t> buff = left ? std::vector<sint_t>(n, -1) : std::vector<sint_t>(m, -1);
+        auto scale_by_diag = [&]() {
+            RandBLAS::apply_diagonal_rademacher(left, layout, m, n, data(), buff.data());
+        };
+
+        int ld = (left) ? m : n;
+        auto hadamard = [&]() {
+            RandBLAS::fht_dispatch(left, layout, data(), int(std::log2(ld)), m, n);
+        };
+
+        std::vector<int64_t> indices(d);
+        std::iota(indices.begin(), indices.end(), 1);
+        auto permute = [&]() {
+            if(left)
+                RandBLAS::permuteRowsToTop(layout, m, n, indices.data(), d, data());
+            else
+                RandBLAS::permuteColsToLeft(layout, m, n, indices.data(), d, data());
+        };
+
+        //// Performing \Pi H D
+        scale_by_diag();   // Step 1: diagonal scaling
+        hadamard();        // Step 2: Hadamard transform
+        permute();         // Step 3: permutation
+
+        //// Performing D H \Pi
+
+        // Step 1: Un-permute by replaying the same swaps in reverse order
+        std::reverse(indices.begin(), indices.end());
+        permute();
+
+        // Step 2: Apply H^{-1}, i.e. H followed by division by 2^log
+        hadamard();
+        if(row_major) {
+            A_row = A_row * 1/std::pow(2, int(std::log2(ld)));
+        } else {
+            A_col = A_col * 1/std::pow(2, int(std::log2(ld)));
+        }
+
+        // Step 3: Inverting `D` (a sign diagonal is its own inverse)
+        scale_by_diag();
+
+        double norm_inverse = row_major ? (A_row - B).norm() : (A_col - B).norm();
+
+        randblas_require(norm_inverse < epsilon);
+    }
+};
+
+////////////////////////////////////////////////////////////////////////
+//
+//
+//      Checking correctness of each of the transforms
+//
+//
+////////////////////////////////////////////////////////////////////////
+
+// Diagonal scaling, left side, 100 x 100 column-major input; run once per key.
+TEST_F(TestLMIGET, test_diag_left_colmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::diag_scale, 100, 100, true, blas::Layout::ColMajor);
+    }
+}
+
+// Diagonal scaling, right side, 100 x 100 column-major input; run once per key.
+TEST_F(TestLMIGET, test_diag_right_colmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::diag_scale, 100, 100, false, blas::Layout::ColMajor);
+    }
+}
+
+// Diagonal scaling, left side, 100 x 100 row-major input; run once per key.
+TEST_F(TestLMIGET, test_diag_left_rowmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::diag_scale, 100, 100, true, blas::Layout::RowMajor);
+    }
+}
+
+// Diagonal scaling, right side, 100 x 100 row-major input; run once per key.
+TEST_F(TestLMIGET, test_diag_right_rowmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::diag_scale, 100, 100, false, blas::Layout::RowMajor);
+    }
+}
+
+// Row permutation, 100 x 100 column-major input; run once per key.
+TEST_F(TestLMIGET, test_permute_left_colmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::permute, 100, 100, true, blas::Layout::ColMajor);
+    }
+}
+
+// Column permutation, 100 x 100 column-major input; run once per key.
+TEST_F(TestLMIGET, test_permute_right_colmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::permute, 100, 100, false, blas::Layout::ColMajor);
+    }
+}
+
+// Row permutation, 100 x 100 row-major input; run once per key.
+TEST_F(TestLMIGET, test_permute_left_rowmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::permute, 100, 100, true, blas::Layout::RowMajor);
+    }
+}
+
+// Column permutation, 100 x 100 row-major input; run once per key.
+TEST_F(TestLMIGET, test_permute_right_rowmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::permute, 100, 100, false, blas::Layout::RowMajor);
+    }
+}
+
+// Hadamard transform from the left, 128 x 100 column-major input; run once per key.
+TEST_F(TestLMIGET, test_hadamard_left_colmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::hadamard, 128, 100, true, blas::Layout::ColMajor);
+    }
+}
+
+// Hadamard transform from the right, 100 x 128 column-major input; run once per key.
+TEST_F(TestLMIGET, test_hadamard_right_colmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::hadamard, 100, 128, false, blas::Layout::ColMajor);
+    }
+}
+
+// Hadamard transform from the left, 128 x 100 row-major input; run once per key.
+TEST_F(TestLMIGET, test_hadamard_left_rowmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::hadamard, 128, 100, true, blas::Layout::RowMajor);
+    }
+}
+
+// Hadamard transform from the right, 100 x 128 row-major input; run once per key.
+TEST_F(TestLMIGET, test_hadamard_right_rowmajor) {
+    for (uint32_t seed : keys) {
+        correctness<double>(seed, transforms::hadamard, 100, 128, false, blas::Layout::RowMajor);
+    }
+}
+
+// Round trip from the left, 128 x 128 column-major input, d = 25; run once per key.
+TEST_F(TestLMIGET, test_inverse_left_colmajor) {
+    for (uint32_t seed : keys) {
+        inverse_transform<double>(seed, 128, 128, 25, true, blas::Layout::ColMajor);
+    }
+}
+
+// Round trip from the right, 100 x 128 column-major input, d = 25; run once per key.
+TEST_F(TestLMIGET, test_inverse_right_colmajor) {
+    for (uint32_t seed : keys) {
+        inverse_transform<double>(seed, 100, 128, 25, false, blas::Layout::ColMajor);
+    }
+}
+
+// Round trip from the left, 128 x 128 row-major input, d = 25; run once per key.
+TEST_F(TestLMIGET, test_inverse_left_rowmajor) {
+    for (uint32_t seed : keys) {
+        inverse_transform<double>(seed, 128, 128, 25, true, blas::Layout::RowMajor);
+    }
+}
+
+// Round trip from the right, 100 x 128 row-major input, d = 25; run once per key.
+TEST_F(TestLMIGET, test_inverse_right_rowmajor) {
+    for (uint32_t seed : keys) {
+        inverse_transform<double>(seed, 100, 128, 25, false, blas::Layout::RowMajor);
+    }
+}