diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index c2b74eabee..406df8db56 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -39,3 +39,4 @@ The MIOpen API library is structured as follows:
   * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental)
   * :doc:`Kthvalue <../doxygen/html/group__kthvalue>` (experimental)
   * :doc:`GLU <../doxygen/html/group__glu>` (experimental)
+  * :doc:`Trace <../doxygen/html/group__trace>` (experimental)
diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt
index 60d6fe6ce6..ef5e162646 100644
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -61,6 +61,7 @@ add_executable(MIOpenDriver
     dm_softmax.cpp
     dm_t5layernorm.cpp
     dm_tensorop.cpp
+    dm_trace.cpp
     dm_transformers_adam_w.cpp
     main.cpp
     registry_driver_maker.cpp
diff --git a/driver/dm_trace.cpp b/driver/dm_trace.cpp
new file mode 100644
index 0000000000..306117c8e7
--- /dev/null
+++ b/driver/dm_trace.cpp
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "trace_driver.hpp"
+#include "registry_driver_maker.hpp"
+
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "trace")
+        return new TraceDriver<float, float>();
+    if(base_arg == "tracefp16")
+        return new TraceDriver<float16, float>();
+    if(base_arg == "tracebfp16")
+        return new TraceDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index d77d5d02d2..fe8c668892 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -314,7 +314,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
            "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, "
            "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], "
            "prelu[bfp16|fp16], kthvalue[bfp16|fp16], glu[bfp16|fp16], softmarginloss[bfp16|fp16], "
-           "multimarginloss[bfp16|fp16]\n");
+           "multimarginloss[bfp16|fp16], trace[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -352,7 +352,7 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "kthvaluebfp16" && arg != "glu" && arg != "glufp16" && arg != "glubfp16" &&
        arg != "softmarginloss" && arg != "softmarginlossfp16" && arg != "softmarginlossbfp16" &&
        arg != "multimarginloss" && arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" &&
-       arg != "--version")
+       arg != "trace" && arg != "tracefp16" && arg != "tracebfp16" && arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/trace_driver.hpp b/driver/trace_driver.hpp
new file mode 100644
index 0000000000..e72ee2ccdd
--- /dev/null
+++ b/driver/trace_driver.hpp
@@ -0,0 +1,455 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "tensor_driver.hpp"
+#include "tensor_view.hpp"
+#include "timer.hpp"
+
+#include <../test/ford.hpp>
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include <miopen/errors.hpp>
+#include <miopen/miopen.h>
+#include <miopen/tensor_view_utils.hpp>
+
+template <typename Tgpu, typename Tcheck>
+int mloTraceForwardRunHost(const miopenTensorDescriptor_t inputDesc,
+                           const Tgpu* input,
+                           Tcheck* outputHost)
+{
+    tensor_view_t<2> input_tv = miopen::get_inner_expanded_tv<2>(miopen::deref(inputDesc));
+    auto input_len            = miopen::deref(inputDesc).GetLengths();
+    size_t N                  = std::min(input_len[0], input_len[1]);
+    double res                = 0;
+    for(size_t i = 0; i < N; i++)
+    {
+        tensor_layout_t<2> input_layout = {i, i};
+        size_t input_idx                = input_tv.get_tensor_view_idx(input_layout);
+        res += static_cast<double>(input[input_idx]);
+    }
+    outputHost[0] = static_cast<Tcheck>(res);
+
+    return 0;
+}
+
+template <typename Tgpu, typename Tcheck>
+int mloTraceBackwardRunHost(const Tgpu* outputGrad,
+                            const miopenTensorDescriptor_t inputGradDesc,
+                            Tcheck* inputGradHost)
+{
+    tensor_view_t<2> input_grad_tv = miopen::get_inner_expanded_tv<2>(miopen::deref(inputGradDesc));
+    auto input_grad_len            = miopen::deref(inputGradDesc).GetLengths();
+    size_t N                       = input_grad_len[0];
+
+    par_ford(N)([&](size_t i) {
+        size_t idx = i % (input_grad_tv.size[1] + 1);
+
+        if(idx != input_grad_tv.size[1])
+        {
+            Tgpu val                         = outputGrad[0];
+            tensor_layout_t<2> ingrad_layout = {i, idx};
+            inputGradHost[input_grad_tv.get_tensor_view_idx(ingrad_layout)] =
+                static_cast<Tcheck>(val);
+        }
+    });
+
+    return 0;
+}
+
+template <typename Tgpu, typename Tref>
+class TraceDriver : public Driver
+{
+public:
+    TraceDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&inputGradDesc);
+        miopenCreateTensorDescriptor(&outputGradDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    std::vector<int> ComputeStrides(std::vector<int> inputDim);
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~TraceDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(inputGradDesc);
+        miopenDestroyTensorDescriptor(outputGradDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+    bool isContiguous;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t outputDesc;
+    miopenTensorDescriptor_t inputGradDesc;
+    miopenTensorDescriptor_t outputGradDesc;
+
+    std::unique_ptr<GPUMem> in_dev;
+    std::unique_ptr<GPUMem> out_dev;
+    std::unique_ptr<GPUMem> in_grad_dev;
+    std::unique_ptr<GPUMem> out_grad_dev;
+    std::unique_ptr<GPUMem> workspace_dev;
+
+    std::vector<Tgpu> in;
+    std::vector<Tgpu> out;
+    std::vector<Tgpu> in_grad;
+    std::vector<Tgpu> out_grad;
+    std::vector<Tgpu> workspace;
+
+    std::vector<Tref> outHost;
+    std::vector<Tref> inGradHost;
+
+    size_t ws_sizeInBytes;
+};
+
+// Equivalent tensor.transpose(0, -1).contiguous().transpose(0, -1)
+template <typename Tgpu, typename Tref>
+std::vector<int> TraceDriver<Tgpu, Tref>::ComputeStrides(std::vector<int> inputDim)
+{
+    if(!isContiguous)
+        std::swap(inputDim.front(), inputDim.back());
+    std::vector<int> strides(inputDim.size());
+    strides.back() = 1;
+    for(int i = inputDim.size() - 2; i >= 0; --i)
+        strides[i] = strides[i + 1] * inputDim[i + 1];
+    if(!isContiguous)
+        std::swap(strides.front(), strides.back());
+    return strides;
+}
+
+template <typename Tgpu, typename Tref>
+int TraceDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+    isContiguous = inflags.GetValueInt("contiguous") > 0 ? true : false;
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+
+    forw = inflags.GetValueInt("forw");
+
+    if(forw != 0 && forw != 1)
+    {
+        MIOPEN_THROW("Invalid Forward|Backward Mode");
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int TraceDriver<Tgpu, Tref>::GetandSetData()
+{
+    auto in_len     = inflags.GetValueTensor("dim_lengths").lengths;
+    auto in_strides = ComputeStrides(in_len);
+    SetTensorNd(inputDesc, in_len, in_strides, data_type);
+    SetTensorNd(inputGradDesc, in_len, in_strides, data_type);
+
+    std::vector<int> out_lens = {1};
+    SetTensorNd(outputDesc, out_lens, data_type);
+    SetTensorNd(outputGradDesc, out_lens, data_type);
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int TraceDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag(
+        "forw",
+        'F',
+        "0",
+        "Run only Trace Forward (1) or Run both Forward and Backward (0) (Default = 0)",
+        "int");
+    inflags.AddTensorFlag(
+        "dim_lengths", 'D', "9x9", "The dimensional lengths of the input tensor (Default=9x9)");
+    inflags.AddInputFlag("contiguous",
+                         'C',
+                         "1",
+                         "Tensor is contiguous (1) or not contiguous (0) (Default=1)",
+                         "int");
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int TraceDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
+{
+    size_t in_sz  = GetTensorSize(inputDesc);
+    size_t out_sz = GetTensorSize(outputDesc);
+
+    miopenGetTraceForwardWorkspaceSize(GetHandle(), inputDesc, outputDesc, &ws_sizeInBytes);
+
+    if(ws_sizeInBytes == static_cast<size_t>(-1))
+        return miopenStatusAllocFailed;
+
+    size_t ws_sz = ws_sizeInBytes / sizeof(Tgpu);
+
+    uint32_t ctx = 0;
+
+    in_dev        = std::unique_ptr<GPUMem>(new GPUMem(ctx, in_sz, sizeof(Tgpu)));
+    out_dev       = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(Tgpu)));
+    workspace_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte)));
+
+    in_grad_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, in_sz, sizeof(Tgpu)));
+    out_grad_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(Tgpu)));
+
+    in        = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
+    out       = std::vector<Tgpu>(out_sz, static_cast<Tgpu>(0));
+    workspace = std::vector<Tgpu>(ws_sz, static_cast<Tgpu>(0));
+
+    in_grad  = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
+    out_grad = std::vector<Tgpu>(out_sz, static_cast<Tgpu>(0));
+
+    outHost    = std::vector<Tref>(out_sz, static_cast<Tref>(0));
+    inGradHost = std::vector<Tref>(in_sz, static_cast<Tref>(0));
+
+    for(int i = 0; i < in_sz; i++)
+    {
+        in[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(0.2));
+    }
+
+    for(int i = 0; i < out_sz; i++)
+    {
+        out_grad[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(0.3));
+    }
+
+    if(in_dev->ToGPU(GetStream(), in.data()) != 0)
+    {
+        std::cerr << "Error copying (in) to GPU, size: " << in_dev->GetSize() << std::endl;
+        return miopenStatusInternalError;
+    }
+
+    if(out_grad_dev->ToGPU(GetStream(), out_grad.data()) != 0)
+    {
+        std::cerr << "Error copying (out_grad) to GPU, size: " << out_grad_dev->GetSize()
+                  << std::endl;
+        return miopenStatusInternalError;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int TraceDriver<Tgpu, Tref>::RunForwardGPU()
+{
+    float kernel_total_time = 0;
+    float kernel_first_time = 0;
+
+    Timer t;
+    START_TIME
+
+    for(int i = 0; i < inflags.GetValueInt("iter"); i++)
+    {
+        miopenStatus_t status = miopenTraceForward(GetHandle(),
+                                                   workspace_dev->GetMem(),
+                                                   ws_sizeInBytes,
+                                                   inputDesc,
+                                                   in_dev->GetMem(),
+                                                   outputDesc,
+                                                   out_dev->GetMem());
+        MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenTraceForward");
+
+        float time = 0.0;
+        miopenGetKernelTime(GetHandle(), &time);
+        kernel_total_time += time;
+        if(i == 0)
+            kernel_first_time = time;
+    }
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        STOP_TIME
+        int iter = inflags.GetValueInt("iter");
+        if(WALL_CLOCK)
+            std::cout << "Wall-clock Time Forward Trace Elapsed: " << t.gettime_ms() / iter
+                      << " ms\n";
+
+        float kernel_average_time =
+            iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time;
+        std::cout << "GPU Kernel Time Forward Trace Elapsed: " << kernel_average_time << " ms\n";
+    }
+
+    if(out_dev->FromGPU(GetStream(), out.data()) != 0)
+    {
+        std::cerr << "Error copying (out_dev) from GPU, size: " << out_dev->GetSize() << std::endl;
+        return miopenStatusInternalError;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int TraceDriver<Tgpu, Tref>::RunForwardCPU()
+{
+    auto status = mloTraceForwardRunHost<Tgpu, Tref>(inputDesc, in.data(), outHost.data());
+    MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloTraceForwardRunHost");
+
+    return status;
+}
+
+template <typename Tgpu, typename Tref>
+int TraceDriver<Tgpu, Tref>::RunBackwardGPU()
+{
+    float kernel_total_time = 0;
+    float kernel_first_time = 0;
+
+    Timer t;
+    START_TIME
+
+    for(int i = 0; i < inflags.GetValueInt("iter"); i++)
+    {
+        miopenStatus_t status = miopenTraceBackward(GetHandle(),
+                                                    outputGradDesc,
+                                                    out_grad_dev->GetMem(),
+                                                    inputGradDesc,
+                                                    in_grad_dev->GetMem());
+        MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenTraceBackward");
+
+        float time = 0.0;
+        miopenGetKernelTime(GetHandle(), &time);
+        kernel_total_time += time;
+        if(i == 0)
+            kernel_first_time = time;
+    }
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        STOP_TIME
+        int iter = inflags.GetValueInt("iter");
+        if(WALL_CLOCK)
+            std::cout << "Wall-clock Time Backward Trace Elapsed: " << t.gettime_ms() / iter
+                      << " ms\n";
+
+        float kernel_average_time =
+            iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time;
+        std::cout << "GPU Kernel Time Backward Trace Elapsed: " << kernel_average_time << " ms\n";
+    }
+
+    if(in_grad_dev->FromGPU(GetStream(), in_grad.data()) != 0)
+    {
+        std::cerr << "Error copying (in_grad_dev) from GPU, size: " << in_grad_dev->GetSize()
+                  << std::endl;
+        return miopenStatusInternalError;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int TraceDriver<Tgpu, Tref>::RunBackwardCPU()
+{
+    auto status = mloTraceBackwardRunHost(out_grad.data(), inputGradDesc, inGradHost.data());
+    MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloTraceBackwardRunHost");
+
+    return status;
+}
+
+template <typename Tgpu, typename Tref>
+Tref TraceDriver<Tgpu, Tref>::GetTolerance()
+{
+    Tref tolerance = std::numeric_limits<Tgpu>::epsilon() * 10;
+    return tolerance;
+}
+
+template <typename Tgpu, typename Tref>
+int TraceDriver<Tgpu, Tref>::VerifyForward()
+{
+    RunForwardCPU();
+    const Tref tolerance = GetTolerance();
+    auto error           = miopen::rms_range(outHost, out);
+
+    if(!std::isfinite(error) || error > tolerance)
+    {
+        std::cout << "Forward Trace FAILED: " << error << " > " << tolerance << std::endl;
+        return EC_VerifyFwd;
+    }
+    else
+    {
+        std::cout << "Forward Trace Verifies OK on CPU reference (" << error << " < " << tolerance
+                  << ')' << std::endl;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int TraceDriver<Tgpu, Tref>::VerifyBackward()
+{
+    RunBackwardCPU();
+    const Tref tolerance = GetTolerance();
+    auto error           = miopen::rms_range(inGradHost, in_grad);
+
+    if(!std::isfinite(error) || error > tolerance)
+    {
+        std::cout << "Backward Trace FAILED: " << error << " > " << tolerance << std::endl;
+        return EC_VerifyBwd;
+    }
+    else
+    {
+        std::cout << "Backward Trace Verifies OK on CPU reference (" << error << " < " << tolerance
+                  << ')' << std::endl;
+    }
+
+    return miopenStatusSuccess;
+}
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 6d1830d7d6..2936505dae 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -72,6 +72,7 @@
  * @defgroup ReduceCalculation
  * @defgroup RotaryPositionalEmbeddings
  * @defgroup ReLU
+ * @defgroup trace
  *
  */
 
@@ -8231,6 +8232,69 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle,
 // CLOSEOUT LossFunction DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
+#ifdef MIOPEN_BETA_API
+/** @addtogroup trace
+ *
+ *  @{
+ */
+
+/*! @brief Helper function to query the minimum workspace size required by the Trace call
+ *
+ * @param handle                   MIOpen Handle (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param outputDesc               Tensor descriptor for output tensor (input)
+ * @param sizeInBytes              Pointer to data to return the minimum workspace size
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenGetTraceForwardWorkspaceSize(miopenHandle_t handle,
+                                   const miopenTensorDescriptor_t inputDesc,
+                                   const miopenTensorDescriptor_t outputDesc,
+                                   size_t* sizeInBytes);
+
+/** @addtogroup trace
+ *
+ *  @{
+ */
+
+/*! @brief Execute Trace forward layer
+ *
+ * @param handle                   MIOpen handle (input)
+ * @param workspace                Address of the allocated workspace data (input)
+ * @param workspaceSizeInBytes     Size in bytes of the allocated workspace data (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param input                    Input tensor (input)
+ * @param outputDesc               Tensor descriptor for output tensor (input)
+ * @param output                   Output tensor (output)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenTraceForward(miopenHandle_t handle,
+                                                void* workspace,
+                                                size_t workspaceSizeInBytes,
+                                                const miopenTensorDescriptor_t inputDesc,
+                                                const void* input,
+                                                const miopenTensorDescriptor_t outputDesc,
+                                                void* output);
+
+/*! @brief Execute Trace backward layer
+ *
+ * @param handle                   MIOpen handle (input)
+ * @param outputGradDesc           Tensor descriptor for output grad tensor (input)
+ * @param outputGrad               Output grad tensor (input)
+ * @param inputGradDesc            Tensor descriptor for input grad tensor (input)
+ * @param inputGrad                Input grad tensor (output)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenTraceBackward(miopenHandle_t handle,
+                                                 const miopenTensorDescriptor_t outputGradDesc,
+                                                 const void* outputGrad,
+                                                 const miopenTensorDescriptor_t inputGradDesc,
+                                                 void* inputGrad);
+
+/** @} */
+// CLOSEOUT TRACE DOXYGEN GROUP
+#endif // MIOPEN_BETA_API
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9661ccb28f..5840c0ba1f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -343,6 +343,8 @@ set( MIOpen_Source
     solver/tensorOp/Op5dTensorGeneric.cpp
     solver/tensorOp/OpTensorFwdBias.cpp
     solver/tensorOp/OpTensorLeadingOnes.cpp
+    solver/trace/backward_trace.cpp
+    solver/trace/forward_trace.cpp
     subbuffers.cpp
     t5layernorm_api.cpp
     target_properties.cpp
@@ -350,6 +352,8 @@ set( MIOpen_Source
     tensor.cpp
     tensorOp/problem_description.cpp
     tensor_api.cpp
+    trace/problem_description.cpp
+    trace_api.cpp
     transformers_adam_w_api.cpp
     seq_tensor.cpp
 )
@@ -537,6 +541,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenConvDirUni.cl
         kernels/MIOpenConvDirBatchNormActiv.cl
         kernels/MIOpenConvDirGenFwd.cl
+        kernels/MIOpenFill.cpp
         kernels/MIOpenGLU.cpp
         kernels/MIOpenGroupNorm.cpp
         kernels/MIOpenGetitem.cpp
@@ -562,6 +567,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenSoftMarginLoss.cpp
         kernels/MIOpenSoftmax.cl
         kernels/MIOpenSoftmaxAttn.cpp
+        kernels/MIOpenTrace.cpp
         kernels/MIOpenUtilKernels3.cl
         kernels/MIOpenUtilKernels4.cl
         kernels/MIOpenUtilKernels5.cl
@@ -709,6 +715,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         reduceextreme.cpp
         rope.cpp
         softmarginloss.cpp
+        trace.cpp
         transformers_adam_w.cpp
         ${PROJECT_BINARY_DIR}/db_path.cpp
         )
diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp
index 76a13b051c..44db139657 100644
--- a/src/include/miopen/solver_id.hpp
+++ b/src/include/miopen/solver_id.hpp
@@ -64,7 +64,8 @@ enum class Primitive
     ReLU,
     Kthvalue,
     SoftMarginLoss,
-    MultiMarginLoss
+    MultiMarginLoss,
+    Trace
 };
 
 struct MIOPEN_INTERNALS_EXPORT Id
diff --git a/src/include/miopen/trace.hpp b/src/include/miopen/trace.hpp
new file mode 100644
index 0000000000..2bee1fa2d1
--- /dev/null
+++ b/src/include/miopen/trace.hpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/common.hpp>
+
+namespace miopen {
+
+struct Handle;
+struct TensorDescriptor;
+
+namespace trace {
+
+MIOPEN_INTERNALS_EXPORT size_t GetTraceForwardWorkspaceSize(Handle& handle,
+                                                            const TensorDescriptor& inputDesc,
+                                                            const TensorDescriptor& outputDesc);
+
+MIOPEN_INTERNALS_EXPORT miopenStatus_t TraceForward(Handle& handle,
+                                                    Data_t workspace,
+                                                    size_t workspaceSizeInBytes,
+                                                    const TensorDescriptor& inputDesc,
+                                                    ConstData_t input,
+                                                    const TensorDescriptor& outputDesc,
+                                                    Data_t output);
+
+MIOPEN_INTERNALS_EXPORT miopenStatus_t TraceBackward(Handle& handle,
+                                                     const TensorDescriptor& outputGradDesc,
+                                                     ConstData_t outputGrad,
+                                                     const TensorDescriptor& inputGradDesc,
+                                                     Data_t inputGrad);
+
+} // namespace trace
+
+} // namespace miopen
diff --git a/src/include/miopen/trace/invoke_params.hpp b/src/include/miopen/trace/invoke_params.hpp
new file mode 100644
index 0000000000..c15aa00029
--- /dev/null
+++ b/src/include/miopen/trace/invoke_params.hpp
@@ -0,0 +1,72 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/invoke_params.hpp>
+#include <miopen/miopen.h>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+namespace trace {
+
+struct FwdInvokeParams : public miopen::InvokeParams
+{
+    FwdInvokeParams() = default;
+
+    const TensorDescriptor* inputDesc  = nullptr;
+    const TensorDescriptor* outputDesc = nullptr;
+
+    ConstData_t input = nullptr;
+    Data_t output     = nullptr;
+
+    Data_t workspace           = nullptr;
+    std::size_t workspace_size = 0;
+
+    std::size_t GetWorkspaceSize() const { return workspace_size; }
+    Data_t GetWorkspace() const { return workspace; }
+};
+
+struct BwdInvokeParams : public miopen::InvokeParams
+{
+    BwdInvokeParams() = default;
+
+    const TensorDescriptor* inputGradDesc  = nullptr;
+    const TensorDescriptor* outputGradDesc = nullptr;
+
+    ConstData_t outputGrad = nullptr;
+    Data_t inputGrad       = nullptr;
+
+    Data_t workspace           = nullptr;
+    std::size_t workspace_size = 0;
+
+    std::size_t GetWorkspaceSize() const { return workspace_size; }
+    Data_t GetWorkspace() const { return workspace; }
+};
+
+} // namespace trace
+
+} // namespace miopen
diff --git a/src/include/miopen/trace/problem_description.hpp b/src/include/miopen/trace/problem_description.hpp
new file mode 100644
index 0000000000..43b1077789
--- /dev/null
+++ b/src/include/miopen/trace/problem_description.hpp
@@ -0,0 +1,123 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/errors.hpp>
+#include <miopen/miopen.h>
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+struct NetworkConfig;
+
+namespace trace {
+
+struct FwdProblemDescription : ProblemDescriptionBase
+{
+    FwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_)
+        : inputDesc(inputDesc_), outputDesc(outputDesc_)
+    {
+        if(inputDesc.GetNumDims() != 2)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Input tensor must be 2D.");
+        }
+
+        if(outputDesc.GetNumDims() != 1)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Output tensor must be 1D.");
+        }
+
+        if(!IsSameType())
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Input and output tensor must have same type.");
+        }
+    }
+
+    const TensorDescriptor& GetInputDesc() const { return inputDesc; }
+    const TensorDescriptor& GetOutputDesc() const { return outputDesc; }
+
+    bool IsSameType() const
+    {
+        if(inputDesc.GetType() != outputDesc.GetType())
+        {
+            return false;
+        }
+        return true;
+    }
+
+    NetworkConfig MakeNetworkConfig() const override;
+
+protected:
+    TensorDescriptor inputDesc;
+    TensorDescriptor outputDesc;
+};
+
+struct BwdProblemDescription : ProblemDescriptionBase
+{
+    BwdProblemDescription(const TensorDescriptor& outputGradDesc_,
+                          const TensorDescriptor& inputGradDesc_)
+        : outputGradDesc(outputGradDesc_), inputGradDesc(inputGradDesc_)
+    {
+        if(inputGradDesc.GetNumDims() != 2)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Input grad tensor must be 2D.");
+        }
+
+        if(outputGradDesc.GetNumDims() != 1)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Output grad tensor must be 1D.");
+        }
+
+        if(!IsSameType())
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "Input grad and output grad tensor must have same type.");
+        }
+    }
+
+    const TensorDescriptor& GetInputGradDesc() const { return inputGradDesc; }
+    const TensorDescriptor& GetOutputGradDesc() const { return outputGradDesc; }
+
+    bool IsSameType() const
+    {
+        if(inputGradDesc.GetType() != outputGradDesc.GetType())
+        {
+            return false;
+        }
+        return true;
+    }
+
+    NetworkConfig MakeNetworkConfig() const override;
+
+protected:
+    TensorDescriptor outputGradDesc;
+    TensorDescriptor inputGradDesc;
+};
+
+} // namespace trace
+
+} // namespace miopen
diff --git a/src/include/miopen/trace/solvers.hpp b/src/include/miopen/trace/solvers.hpp
new file mode 100644
index 0000000000..7a8784d203
--- /dev/null
+++ b/src/include/miopen/trace/solvers.hpp
@@ -0,0 +1,77 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/solver.hpp>
+#include <miopen/trace/problem_description.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace trace {
+
+using TraceForwardSolverBase =
+    NonTunableSolverBase<ExecutionContext, miopen::trace::FwdProblemDescription>;
+
+struct TraceForward final : TraceForwardSolverBase
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<TraceForward>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::trace::FwdProblemDescription& problem) const override;
+
+    bool IsImprovementOverROCm(const ExecutionContext& context,
+                               const miopen::trace::FwdProblemDescription& problem) const;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::trace::FwdProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::trace::FwdProblemDescription& problem) const override;
+    bool MayNeedWorkspace() const override { return true; }
+};
+
+using TraceBackwardSolverBase =
+    NonTunableSolverBase<ExecutionContext, miopen::trace::BwdProblemDescription>;
+
+struct TraceBackward final : TraceBackwardSolverBase
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<TraceForward>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::trace::BwdProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::trace::BwdProblemDescription& problem) const override;
+};
+
+} // namespace trace
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/kernels/MIOpenFill.cpp b/src/kernels/MIOpenFill.cpp
new file mode 100644
index 0000000000..4333d76173
--- /dev/null
+++ b/src/kernels/MIOpenFill.cpp
@@ -0,0 +1,44 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+template <typename TIO>
+__device__ void FillZeroKernel(TIO* output, uint64_t size)
+{
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= size)
+        return;
+
+    output[gid] = static_cast<TIO>(0);
+}
+
+extern "C" __global__ void FillZero(IO_TYPE* output, uint64_t size)
+{
+    FillZeroKernel<IO_TYPE>(output, size);
+}
diff --git a/src/kernels/MIOpenTrace.cpp b/src/kernels/MIOpenTrace.cpp
new file mode 100644
index 0000000000..bc21eac706
--- /dev/null
+++ b/src/kernels/MIOpenTrace.cpp
@@ -0,0 +1,106 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "block_reduce.hpp"
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+#include "tensor_view.hpp"
+
+#define LOCAL_SIZE 256
+
+template <typename TI, typename TO>
+__device__ void TraceForward_kernel(
+    const TI* input, TO* output, size_t N, tensor_view_t<2> input_tv, bool isReduced)
+{
+    const size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    const size_t lid = threadIdx.x;
+
+    __shared__ FLOAT_ACCUM local_mem[LOCAL_SIZE];
+
+    if(gid < N)
+    {
+        tensor_layout_t<2> diag_layout = {gid, gid};
+        auto input_idx                 = input_tv.get_tensor_view_idx(diag_layout);
+        local_mem[lid]                 = CVT_FLOAT2ACCUM(input[input_idx]);
+    }
+    else
+    {
+        local_mem[lid] = 0;
+    }
+
+    __syncthreads();
+
+    for(size_t i = blockDim.x / 2; i > 0; i >>= 1)
+    {
+        if(lid < i)
+        {
+            local_mem[lid] += local_mem[lid + i];
+        }
+        __syncthreads();
+    }
+
+    if(lid == 0)
+    {
+        output[blockIdx.x] = isReduced ? local_mem[0] : CVT_ACCUM2FLOAT(local_mem[0]);
+    }
+}
+
+extern "C" __global__ void TraceForward(
+    const IO_TYPE* input, O_TYPE* output, size_t N, tensor_view_t<2> input_tv, bool isReduced)
+{
+    TraceForward_kernel<IO_TYPE, O_TYPE>(input, output, N, input_tv, isReduced);
+}
+
+template <typename TIO>
+__device__ void TraceBackward_kernel(const TIO* output_grad,
+                                     TIO* input_grad,
+                                     size_t N,
+                                     tensor_view_t<2> input_grad_tv)
+{
+    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= N)
+        return;
+
+    size_t idx = gid % (input_grad_tv.size[1] + 1);
+
+    if(idx != input_grad_tv.size[1])
+    {
+        TIO val                                                      = output_grad[0];
+        tensor_layout_t<2> ingrad_layout                             = {gid, idx};
+        input_grad[input_grad_tv.get_tensor_view_idx(ingrad_layout)] = val;
+    }
+}
+
+extern "C" __global__ void TraceBackward(const IO_TYPE* output_grad,
+                                         IO_TYPE* input_grad,
+                                         size_t N,
+                                         tensor_view_t<2> input_grad_tv)
+{
+    TraceBackward_kernel<IO_TYPE>(output_grad, input_grad, N, input_grad_tv);
+}
diff --git a/src/solver.cpp b/src/solver.cpp
index c3b077d435..eb439d9d83 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -42,6 +42,7 @@
 #include <miopen/mha/solvers.hpp>
 #include <miopen/softmarginloss/solvers.hpp>
 #include <miopen/softmax/solvers.hpp>
+#include <miopen/trace/solvers.hpp>
 #include <miopen/multimarginloss/solvers.hpp>
 
 #include <miopen/conv_algo_name.hpp>
@@ -707,6 +708,9 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
              multimarginloss::MultiMarginLossForward{}.SolverDbId());
 
     Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId());
+
+    Register(registry, ++id, Primitive::Trace, trace::TraceBackward{}.SolverDbId());
+    Register(registry, ++id, Primitive::Trace, trace::TraceForward{}.SolverDbId());
     // IMPORTANT: New solvers should be added to the end of the function, and don't leave a white
     // space between this comment and the newly registered solver(s)!
 }
diff --git a/src/solver/trace/backward_trace.cpp b/src/solver/trace/backward_trace.cpp
new file mode 100644
index 0000000000..3431f9d78d
--- /dev/null
+++ b/src/solver/trace/backward_trace.cpp
@@ -0,0 +1,181 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/datatype.hpp>
+#include <miopen/errors.hpp>
+#include <miopen/hipoc_kernel.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/kernel_info.hpp>
+#include <miopen/trace/invoke_params.hpp>
+#include <miopen/trace/problem_description.hpp>
+#include <miopen/trace/solvers.hpp>
+#include <miopen/miopen.h>
+#include <miopen/mlo_internal.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#include <cstddef>
+#include <vector>
+
+#define LOCAL_SIZE 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace trace {
+
+bool TraceBackward::IsApplicable(const ExecutionContext& /*context*/,
+                                 const miopen::trace::BwdProblemDescription& problem) const
+{
+    if(!(problem.GetInputGradDesc().GetType() == miopenFloat ||
+         problem.GetInputGradDesc().GetType() == miopenHalf ||
+         problem.GetInputGradDesc().GetType() == miopenBFloat16))
+    {
+        return false;
+    }
+
+    return true;
+}
+
+ConvSolution TraceBackward::GetSolution(const ExecutionContext& /*context*/,
+                                        const miopen::trace::BwdProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto dtype            = problem.GetOutputGradDesc().GetType();
+    auto io_dtype         = miopen::GetDataType(dtype);
+    auto input_grad_len   = problem.GetInputGradDesc().GetLengths();
+    size_t N              = input_grad_len[0];
+    auto input_grad_numel = problem.GetInputGradDesc().GetElementSize();
+
+    const auto build_params =
+        KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+                              {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+                              {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+                              {"IO_TYPE", io_dtype == "bfloat16" ? "ushort" : io_dtype},
+                              {"O_TYPE", io_dtype == "bfloat16" ? "ushort" : io_dtype}};
+
+    /* Phase 1: Fill input grad with zeros */
+    {
+        size_t xlocalsize = LOCAL_SIZE;
+        size_t xgridsize  = AlignUp(input_grad_numel, xlocalsize);
+
+        auto kernel        = KernelInfo{};
+        kernel.kernel_file = "MIOpenFill.cpp";
+        kernel.kernel_name = "FillZero";
+
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        kernel.l_wk.push_back(xlocalsize);
+        kernel.l_wk.push_back(1);
+        kernel.l_wk.push_back(1);
+        kernel.g_wk.push_back(xgridsize);
+        kernel.g_wk.push_back(1);
+        kernel.g_wk.push_back(1);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    /* Phase 2: Trace backward */
+    {
+        size_t xlocalsize = LOCAL_SIZE;
+        size_t xgridsize  = AlignUp(N, xlocalsize);
+
+        auto kernel        = KernelInfo{};
+        kernel.kernel_file = "MIOpenTrace.cpp";
+        kernel.kernel_name = "TraceBackward";
+
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        kernel.l_wk.push_back(xlocalsize);
+        kernel.l_wk.push_back(1);
+        kernel.l_wk.push_back(1);
+        kernel.g_wk.push_back(xgridsize);
+        kernel.g_wk.push_back(1);
+        kernel.g_wk.push_back(1);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [input_grad_numel, N](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) params = raw_params.CastTo<miopen::trace::BwdInvokeParams>();
+
+            float elapsed = 0.f;
+            HipEventPtr start, stop;
+
+            bool reset_profiling_state = false;
+            if(handle_.IsProfilingEnabled())
+            {
+                reset_profiling_state = true;
+                handle_.EnableProfiling(false);
+                start = miopen::make_hip_event();
+                stop  = miopen::make_hip_event();
+                hipEventRecord(start.get(), handle_.GetStream());
+            }
+
+            /* Phase 1: Fill input grad with zeros. */
+            {
+                decltype(auto) kernel = handle_.Run(kernels.front());
+                kernel(params.inputGrad, input_grad_numel);
+            }
+
+            /* Phase 2: Trace backward. */
+            {
+                auto input_grad_tv = get_inner_expanded_tv<2>(deref(params.inputGradDesc));
+
+                decltype(auto) kernel = handle_.Run(kernels.back());
+                kernel(params.outputGrad, params.inputGrad, N, input_grad_tv);
+            }
+
+            if(reset_profiling_state)
+            {
+                handle_.EnableProfiling(true);
+            }
+
+            if(handle_.IsProfilingEnabled())
+            {
+                hipEventRecord(stop.get(), handle_.GetStream());
+                hipEventSynchronize(stop.get());
+                hipEventElapsedTime(&elapsed, start.get(), stop.get());
+
+                // Clean up
+                hipEventDestroy(start.get());
+                hipEventDestroy(stop.get());
+                handle_.ResetKernelTime();
+                handle_.AccumKernelTime(elapsed);
+            }
+        };
+    };
+
+    return result;
+}
+
+} // namespace trace
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/trace/forward_trace.cpp b/src/solver/trace/forward_trace.cpp
new file mode 100644
index 0000000000..b814220879
--- /dev/null
+++ b/src/solver/trace/forward_trace.cpp
@@ -0,0 +1,267 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/common.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/hipoc_kernel.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/kernel_info.hpp>
+#include <miopen/trace/invoke_params.hpp>
+#include <miopen/trace/problem_description.hpp>
+#include <miopen/trace/solvers.hpp>
+#include <miopen/miopen.h>
+#include <miopen/mlo_internal.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#include <cstddef>
+#include <vector>
+
+#define LOCAL_SIZE 256
+#define LOCAL_SIZE_REDUCE 1024
+
+namespace miopen {
+
+namespace solver {
+
+namespace trace {
+
+bool TraceForward::IsApplicable(const ExecutionContext& /*context*/,
+                                const miopen::trace::FwdProblemDescription& problem) const
+{
+    if(!(problem.GetInputDesc().GetType() == miopenFloat ||
+         problem.GetInputDesc().GetType() == miopenHalf ||
+         problem.GetInputDesc().GetType() == miopenBFloat16))
+    {
+        return false;
+    }
+
+    return true;
+}
+
+ConvSolution TraceForward::GetSolution(const ExecutionContext& /*context*/,
+                                       const miopen::trace::FwdProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto dtype     = problem.GetOutputDesc().GetType();
+    auto i_dtype   = miopen::GetDataType(dtype);
+    auto input_len = problem.GetInputDesc().GetLengths();
+    size_t N       = std::min(input_len[0], input_len[1]);
+
+    // 1 kernel
+    if(N <= LOCAL_SIZE)
+    {
+        size_t xlocalsize = LOCAL_SIZE;
+        size_t xgridsize  = AlignUp(N, xlocalsize);
+
+        auto kernel        = KernelInfo{};
+        kernel.kernel_file = "MIOpenTrace.cpp";
+        kernel.kernel_name = "TraceForward";
+
+        const auto build_params =
+            KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+                                  {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+                                  {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+                                  {"IO_TYPE", i_dtype == "bfloat16" ? "ushort" : i_dtype},
+                                  {"O_TYPE", i_dtype == "bfloat16" ? "ushort" : i_dtype}};
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        kernel.l_wk.push_back(xlocalsize);
+        kernel.l_wk.push_back(1);
+        kernel.l_wk.push_back(1);
+        kernel.g_wk.push_back(xgridsize);
+        kernel.g_wk.push_back(1);
+        kernel.g_wk.push_back(1);
+
+        result.construction_params.push_back(kernel);
+    }
+    else
+    {
+        const auto build_params =
+            KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+                                  {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+                                  {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+                                  {"IO_TYPE", i_dtype == "bfloat16" ? "ushort" : i_dtype},
+                                  {"O_TYPE", "FLOAT_ACCUM"},
+                                  {"REDUCE_SIZE", LOCAL_SIZE_REDUCE}};
+
+        /* Phase 1: Get diagonal to workspace */
+        {
+            size_t xlocalsize = LOCAL_SIZE;
+            size_t xgridsize  = AlignUp(N, xlocalsize);
+
+            auto kernel        = KernelInfo{};
+            kernel.kernel_file = "MIOpenTrace.cpp";
+            kernel.kernel_name = "TraceForward";
+
+            kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+            kernel.l_wk.push_back(xlocalsize);
+            kernel.l_wk.push_back(1);
+            kernel.l_wk.push_back(1);
+            kernel.g_wk.push_back(xgridsize);
+            kernel.g_wk.push_back(1);
+            kernel.g_wk.push_back(1);
+
+            result.construction_params.push_back(kernel);
+        }
+
+        /* Phase 2: Reduce sum (FLOAT_ACCUM to FLOAT_ACCUM) */
+        {
+            auto _size = (N + LOCAL_SIZE - 1) / LOCAL_SIZE;
+
+            while(_size > LOCAL_SIZE_REDUCE)
+            {
+                size_t xlocalsize = LOCAL_SIZE_REDUCE;
+                size_t xgridsize  = AlignUp(_size, xlocalsize);
+
+                auto kernel        = KernelInfo{};
+                kernel.kernel_file = "MIOpenReduceSum.cpp";
+                kernel.kernel_name = "ReduceSumFLOATACCUM";
+
+                kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+                kernel.l_wk.push_back(xlocalsize);
+                kernel.l_wk.push_back(1);
+                kernel.l_wk.push_back(1);
+                kernel.g_wk.push_back(xgridsize);
+                kernel.g_wk.push_back(1);
+                kernel.g_wk.push_back(1);
+
+                result.construction_params.push_back(kernel);
+                _size = (_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE;
+            }
+
+            /* Reduce sum (FLOAT_ACCUM to TIO) */
+            size_t xlocalsize = LOCAL_SIZE_REDUCE;
+            size_t xgridsize  = AlignUp(_size, xlocalsize);
+
+            auto kernel        = KernelInfo{};
+            kernel.kernel_file = "MIOpenReduceSum.cpp";
+            kernel.kernel_name = "ReduceSum";
+
+            kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+            kernel.l_wk.push_back(xlocalsize);
+            kernel.l_wk.push_back(1);
+            kernel.l_wk.push_back(1);
+            kernel.g_wk.push_back(xgridsize);
+            kernel.g_wk.push_back(1);
+            kernel.g_wk.push_back(1);
+
+            result.construction_params.push_back(kernel);
+        }
+    }
+
+    result.invoker_factory = [dtype, N](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) params = raw_params.CastTo<miopen::trace::FwdInvokeParams>();
+            auto input_tv         = get_inner_expanded_tv<2>(deref(params.inputDesc));
+
+            if(N <= LOCAL_SIZE)
+            {
+                decltype(auto) kernel = handle_.Run(kernels.front());
+                kernel(params.input, params.output, N, input_tv, false);
+            }
+            else
+            {
+                auto elapsed = 0.f;
+                HipEventPtr start, stop;
+
+                const bool profiling = handle_.IsProfilingEnabled();
+                if(profiling)
+                {
+                    handle_.EnableProfiling(false);
+                    start = miopen::make_hip_event();
+                    stop  = miopen::make_hip_event();
+                    hipEventRecord(start.get(), handle_.GetStream());
+                }
+
+                /* Phase 1: Get diagonal. */
+                {
+                    decltype(auto) kernel = handle_.Run(kernels.front());
+                    kernel(params.input, params.workspace, N, input_tv, true);
+                }
+
+                /* Phase 2: Reduce. */
+                if(kernels.size() > 1)
+                {
+                    auto _size      = (N + LOCAL_SIZE - 1) / LOCAL_SIZE;
+                    auto reduce_in  = params.workspace;
+                    auto reduce_out = static_cast<Data_t>(static_cast<char*>(params.workspace) +
+                                                          N * get_data_size(dtype));
+
+                    for(size_t i = 1; i < kernels.size() - 1; ++i)
+                    {
+                        decltype(auto) kernel = handle_.Run(kernels[i]);
+
+                        kernel(reduce_in, reduce_out, _size);
+                        std::swap(reduce_in, reduce_out);
+
+                        _size = (_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE;
+                    }
+
+                    decltype(auto) kernel = handle_.Run(kernels.back());
+                    kernel(reduce_in, params.output, _size);
+
+                    if(profiling)
+                    {
+                        hipEventRecord(stop.get(), handle_.GetStream());
+                        hipEventSynchronize(stop.get());
+                        hipEventElapsedTime(&elapsed, start.get(), stop.get());
+
+                        // Clean up
+                        hipEventDestroy(start.get());
+                        hipEventDestroy(stop.get());
+                        handle_.ResetKernelTime();
+                        handle_.AccumKernelTime(elapsed);
+
+                        handle_.EnableProfiling(true);
+                    }
+                }
+            }
+        };
+    };
+
+    return result;
+}
+
+std::size_t
+TraceForward::GetWorkspaceSize(const ExecutionContext& /*context*/,
+                               const miopen::trace::FwdProblemDescription& problem) const
+{
+    auto input_len            = problem.GetInputDesc().GetLengths();
+    size_t N                  = std::min(input_len[0], input_len[1]);
+    size_t trace_forward_size = (N + LOCAL_SIZE - 1) / LOCAL_SIZE;
+    return (trace_forward_size + (trace_forward_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE) *
+           get_data_size(miopenFloat);
+}
+
+} // namespace trace
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/trace.cpp b/src/trace.cpp
new file mode 100644
index 0000000000..d64917f83a
--- /dev/null
+++ b/src/trace.cpp
@@ -0,0 +1,112 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/common.hpp>
+#include <miopen/find_solution.hpp>
+#include <miopen/trace.hpp>
+#include <miopen/tensor.hpp>
+#include <miopen/trace/invoke_params.hpp>
+#include <miopen/trace/problem_description.hpp>
+#include <miopen/trace/solvers.hpp>
+
+namespace miopen {
+
+namespace trace {
+
+size_t GetTraceForwardWorkspaceSize(Handle& handle,
+                                    const TensorDescriptor& inputDesc,
+                                    const TensorDescriptor& outputDesc)
+{
+    auto ctx           = ExecutionContext{&handle};
+    const auto problem = trace::FwdProblemDescription{inputDesc, outputDesc};
+
+    const auto algo    = AlgorithmName{"TraceForward"};
+    const auto solvers = solver::SolverContainer<solver::trace::TraceForward>{};
+
+    auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem);
+
+    return pair_size_vector.empty() ? static_cast<size_t>(-1) : pair_size_vector.front().second;
+}
+
+miopenStatus_t TraceForward(Handle& handle,
+                            Data_t workspace,
+                            size_t workspaceSizeInBytes,
+                            const TensorDescriptor& inputDesc,
+                            ConstData_t input,
+                            const TensorDescriptor& outputDesc,
+                            Data_t output)
+{
+    const auto problem = trace::FwdProblemDescription{inputDesc, outputDesc};
+
+    const auto invoke_params = [&]() {
+        auto tmp           = trace::FwdInvokeParams{};
+        tmp.type           = InvokeType::Run;
+        tmp.inputDesc      = &inputDesc;
+        tmp.outputDesc     = &outputDesc;
+        tmp.input          = input;
+        tmp.output         = output;
+        tmp.workspace      = workspace;
+        tmp.workspace_size = workspaceSizeInBytes;
+        return tmp;
+    }();
+
+    const auto algo    = AlgorithmName{"TraceForward"};
+    const auto solvers = solver::SolverContainer<solver::trace::TraceForward>{};
+
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess;
+}
+
+miopenStatus_t TraceBackward(Handle& handle,
+                             const TensorDescriptor& outputGradDesc,
+                             ConstData_t outputGrad,
+                             const TensorDescriptor& inputGradDesc,
+                             Data_t inputGrad)
+{
+    const auto problem = miopen::trace::BwdProblemDescription{outputGradDesc, inputGradDesc};
+
+    const auto invoke_params = [&]() {
+        auto tmp           = trace::BwdInvokeParams{};
+        tmp.type           = InvokeType::Run;
+        tmp.inputGradDesc  = &inputGradDesc;
+        tmp.outputGradDesc = &outputGradDesc;
+        tmp.inputGrad      = inputGrad;
+        tmp.outputGrad     = outputGrad;
+        return tmp;
+    }();
+
+    const auto algo    = AlgorithmName{"TraceBackward"};
+    const auto solvers = solver::SolverContainer<solver::trace::TraceBackward>{};
+
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess;
+}
+
+} // namespace trace
+
+} // namespace miopen
diff --git a/src/trace/problem_description.cpp b/src/trace/problem_description.cpp
new file mode 100644
index 0000000000..c6e4c85e5c
--- /dev/null
+++ b/src/trace/problem_description.cpp
@@ -0,0 +1,69 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/names.hpp>
+#include <miopen/trace/problem_description.hpp>
+
+#include <sstream>
+
+namespace miopen {
+
+namespace trace {
+
+NetworkConfig FwdProblemDescription::MakeNetworkConfig() const
+{
+    auto input_dtype = inputDesc.GetType();
+    auto input_len   = inputDesc.GetLengths();
+    auto N           = std::min(input_len[0], input_len[1]);
+
+    std::ostringstream ss;
+
+    ss << "trace_fwd";
+    ss << "i_dtype" << input_dtype;
+    ss << "N" << N;
+
+    return NetworkConfig{ss.str()};
+}
+
+NetworkConfig BwdProblemDescription::MakeNetworkConfig() const
+{
+    auto input_grad_dtype = inputGradDesc.GetType();
+    auto input_grad_numel = inputGradDesc.GetElementSize();
+    auto N                = inputGradDesc.GetLengths()[0];
+
+    std::ostringstream ss;
+
+    ss << "trace_bwd";
+    ss << "i_dtype" << input_grad_dtype;
+    ss << "size" << input_grad_numel;
+    ss << "N" << N;
+
+    return NetworkConfig{ss.str()};
+}
+
+} // namespace trace
+
+} // namespace miopen
diff --git a/src/trace_api.cpp b/src/trace_api.cpp
new file mode 100644
index 0000000000..f586c88346
--- /dev/null
+++ b/src/trace_api.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/errors.hpp>
+#include <miopen/handle.hpp>
+#include <miopen/logger.hpp>
+#include <miopen/miopen.h>
+#include <miopen/tensor_ops.hpp>
+#include <miopen/trace.hpp>
+
+extern "C" miopenStatus_t
+miopenGetTraceForwardWorkspaceSize(miopenHandle_t handle,
+                                   const miopenTensorDescriptor_t inputDesc,
+                                   const miopenTensorDescriptor_t outputDesc,
+                                   size_t* sizeInBytes)
+{
+    MIOPEN_LOG_FUNCTION(handle, inputDesc, sizeInBytes);
+
+    return miopen::try_([&] {
+        miopen::deref(sizeInBytes) = miopen::trace::GetTraceForwardWorkspaceSize(
+            miopen::deref(handle), miopen::deref(inputDesc), miopen::deref(outputDesc));
+    });
+}
+
+extern "C" miopenStatus_t miopenTraceForward(miopenHandle_t handle,
+                                             void* workspace,
+                                             size_t workspaceSizeInBytes,
+                                             const miopenTensorDescriptor_t inputDesc,
+                                             const void* input,
+                                             const miopenTensorDescriptor_t outputDesc,
+                                             void* output)
+{
+    MIOPEN_LOG_FUNCTION(
+        handle, workspace, workspaceSizeInBytes, inputDesc, input, outputDesc, output);
+
+    return miopen::try_([&] {
+        miopen::trace::TraceForward(miopen::deref(handle),
+                                    DataCast(workspace),
+                                    workspaceSizeInBytes,
+                                    miopen::deref(inputDesc),
+                                    DataCast(input),
+                                    miopen::deref(outputDesc),
+                                    DataCast(output));
+    });
+}
+
+extern "C" miopenStatus_t miopenTraceBackward(miopenHandle_t handle,
+                                              const miopenTensorDescriptor_t outputGradDesc,
+                                              const void* outputGrad,
+                                              const miopenTensorDescriptor_t inputGradDesc,
+                                              void* inputGrad)
+{
+    MIOPEN_LOG_FUNCTION(handle, outputGradDesc, outputGrad, inputGradDesc, inputGrad);
+
+    return miopen::try_([&] {
+        miopen::trace::TraceBackward(miopen::deref(handle),
+                                     miopen::deref(outputGradDesc),
+                                     DataCast(outputGrad),
+                                     miopen::deref(inputGradDesc),
+                                     DataCast(inputGrad));
+    });
+}
diff --git a/test/cpu_trace.hpp b/test/cpu_trace.hpp
new file mode 100644
index 0000000000..2ae533268b
--- /dev/null
+++ b/test/cpu_trace.hpp
@@ -0,0 +1,72 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/miopen.h>
+#include <miopen/tensor_view_utils.hpp>
+
+#include "ford.hpp"
+#include "tensor_holder.hpp"
+#include "tensor_view.hpp"
+
+#include <cstddef>
+
+template <class T>
+void cpu_trace_forward(const tensor<T>& input, tensor<T>& ref_output)
+{
+    tensor_view_t<2> input_tv = miopen::get_inner_expanded_tv<2>(input.desc);
+    auto input_len            = input.desc.GetLengths();
+    size_t N                  = std::min(input_len[0], input_len[1]);
+    double res                = 0;
+
+    for(size_t i = 0; i < N; i++)
+    {
+        tensor_layout_t<2> input_layout = {i, i};
+        size_t input_idx                = input_tv.get_tensor_view_idx(input_layout);
+        T val                           = input[input_idx];
+        res += static_cast<double>(input[input_idx]);
+    }
+    ref_output[0] = static_cast<T>(res);
+}
+
+template <class T>
+void cpu_trace_backward(const tensor<T>& output_grad, tensor<T>& ref_input_grad)
+{
+    tensor_view_t<2> input_grad_tv = miopen::get_inner_expanded_tv<2>(ref_input_grad.desc);
+    auto input_grad_len            = ref_input_grad.desc.GetLengths();
+    size_t N                       = input_grad_len[0];
+
+    par_ford(N)([&](size_t i) {
+        size_t idx = i % (input_grad_tv.size[1] + 1);
+
+        if(idx != input_grad_tv.size[1])
+        {
+            T val                                                            = output_grad[0];
+            tensor_layout_t<2> ingrad_layout                                 = {i, idx};
+            ref_input_grad[input_grad_tv.get_tensor_view_idx(ingrad_layout)] = static_cast<T>(val);
+        }
+    });
+}
diff --git a/test/gtest/trace.cpp b/test/gtest/trace.cpp
new file mode 100644
index 0000000000..b07d31d5e1
--- /dev/null
+++ b/test/gtest/trace.cpp
@@ -0,0 +1,84 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "trace.hpp"
+using float16 = half_float::half;
+
+namespace trace {
+
+using GPU_Trace_fwd_FP32  = TraceFwdTest<float>;
+using GPU_Trace_fwd_FP16  = TraceFwdTest<float16>;
+using GPU_Trace_fwd_BFP16 = TraceFwdTest<bfloat16>;
+
+using GPU_Trace_bwd_FP32  = TraceBwdTest<float>;
+using GPU_Trace_bwd_FP16  = TraceBwdTest<float16>;
+using GPU_Trace_bwd_BFP16 = TraceBwdTest<bfloat16>;
+
+} // namespace trace
+using namespace trace;
+
+TEST_P(GPU_Trace_fwd_FP32, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_Trace_fwd_FP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_Trace_fwd_BFP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_Trace_bwd_FP32, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_Trace_bwd_FP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_Trace_bwd_BFP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_fwd_FP32, testing::ValuesIn(GenFullTestCases()));
+INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_fwd_FP16, testing::ValuesIn(GenFullTestCases()));
+INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_fwd_BFP16, testing::ValuesIn(GenFullTestCases()));
+INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_bwd_FP32, testing::ValuesIn(GenFullTestCases()));
+INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_bwd_FP16, testing::ValuesIn(GenFullTestCases()));
+INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_bwd_BFP16, testing::ValuesIn(GenFullTestCases()));
diff --git a/test/gtest/trace.hpp b/test/gtest/trace.hpp
new file mode 100644
index 0000000000..453ab5840a
--- /dev/null
+++ b/test/gtest/trace.hpp
@@ -0,0 +1,254 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "cpu_trace.hpp"
+#include "get_handle.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include "random.hpp"
+
+#include <cstddef>
+#include <gtest/gtest.h>
+#include <limits>
+
+#include <miopen/miopen.h>
+#include <miopen/trace.hpp>
+
+struct TraceTestCase
+{
+    std::vector<size_t> input_dim;
+    bool isContiguous;
+
+    friend std::ostream& operator<<(std::ostream& os, const TraceTestCase& tc)
+    {
+        os << "Input dims: ";
+        for(auto dim_sz : tc.input_dim)
+        {
+            os << dim_sz << " ";
+        }
+        return os << " contiguous: " << tc.isContiguous;
+    }
+
+    TraceTestCase() {}
+
+    TraceTestCase(std::vector<size_t> input_dims_, bool cont_)
+        : input_dim(input_dims_), isContiguous(cont_)
+    {
+    }
+
+    std::vector<size_t> ComputeStrides() const
+    {
+        std::vector<size_t> inputDim = input_dim;
+        if(!isContiguous)
+            std::swap(inputDim.front(), inputDim.back());
+        std::vector<size_t> strides(inputDim.size());
+        strides.back() = 1;
+        for(int i = inputDim.size() - 2; i >= 0; --i)
+            strides[i] = strides[i + 1] * inputDim[i + 1];
+        if(!isContiguous)
+            std::swap(strides.front(), strides.back());
+        return strides;
+    }
+};
+
+inline std::vector<TraceTestCase> GenFullTestCases()
+{ // n c d h w dim
+    // clang-format off
+    return {
+        {{1, 8}, false},                      // non-cont small case
+        {{8, 4}, false},                      // non-cont small case
+        {{512, 512}, false},                  // non-cont large case 
+        {{384, 640}, false},                  // non-cont large case
+        {{512, 768}, true},                   // cont large case
+        {{1024, 1024}, true},                 // cont large case
+        {{1, 10}, true},                      // cont small case
+        {{34, 20}, true},                     // cont small case
+    };
+    // clang-format on
+}
+
+template <typename T>
+struct TraceFwdTest : public ::testing::TestWithParam<TraceTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle   = get_handle();
+        trace_config    = GetParam();
+        auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+        auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 99); };
+
+        auto in_dims    = trace_config.input_dim;
+        auto in_strides = trace_config.ComputeStrides();
+        input           = tensor<T>{in_dims, in_strides}.generate(gen_value1);
+
+        auto out_lengths = std::vector<size_t>{1};
+
+        output = tensor<T>{out_lengths};
+        std::fill(output.begin(), output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_output = tensor<T>{out_lengths};
+        std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ws_sizeInBytes =
+            miopen::trace::GetTraceForwardWorkspaceSize(handle, input.desc, output.desc);
+        if(ws_sizeInBytes == static_cast<size_t>(-1))
+            GTEST_SKIP();
+
+        if(ws_sizeInBytes != 0)
+        {
+            std::vector<size_t> workspace_dims;
+            workspace_dims.push_back(ws_sizeInBytes / sizeof(float));
+
+            workspace = tensor<float>{workspace_dims};
+            std::fill(workspace.begin(), workspace.end(), static_cast<float>(0));
+
+            workspace_dev = handle.Write(workspace.data);
+        }
+
+        input_dev  = handle.Write(input.data);
+        output_dev = handle.Write(output.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+
+        miopenStatus_t status;
+
+        cpu_trace_forward<T>(input, ref_output);
+        status = miopen::trace::TraceForward(handle,
+                                             workspace_dev.get(),
+                                             ws_sizeInBytes,
+                                             input.desc,
+                                             input_dev.get(),
+                                             output.desc,
+                                             output_dev.get());
+
+        ASSERT_EQ(status, miopenStatusSuccess);
+
+        output.data = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    double GetTolerance()
+    {
+        double tolerance = std::numeric_limits<T>::epsilon() * 10;
+        return tolerance;
+    }
+
+    void Verify()
+    {
+        double threshold = GetTolerance();
+        EXPECT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output));
+        auto error = miopen::rms_range(ref_output, output);
+
+        EXPECT_TRUE(error < threshold * 10) << "Error output beyond tolerance Error: " << error
+                                            << ",  Tolerance: " << threshold * 10;
+    }
+
+    TraceTestCase trace_config;
+
+    tensor<T> input;
+    tensor<T> output;
+    tensor<float> workspace;
+
+    tensor<T> ref_output;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+    miopen::Allocator::ManageDataPtr workspace_dev;
+
+    size_t ws_sizeInBytes;
+};
+
+template <typename T>
+struct TraceBwdTest : public ::testing::TestWithParam<TraceTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle   = get_handle();
+        trace_config    = GetParam();
+        auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+        auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 99); };
+
+        auto in_grad_dims    = trace_config.input_dim;
+        auto in_grad_strides = trace_config.ComputeStrides();
+        input_grad           = tensor<T>{in_grad_dims, in_grad_strides};
+
+        auto out_grad_dims = std::vector<size_t>{1};
+        output_grad        = tensor<T>{out_grad_dims}.generate(gen_value1);
+
+        std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_input_grad = tensor<T>{in_grad_dims, in_grad_strides};
+        std::fill(ref_input_grad.begin(), ref_input_grad.end(), static_cast<T>(0));
+
+        input_grad_dev  = handle.Write(input_grad.data);
+        output_grad_dev = handle.Write(output_grad.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+
+        miopenStatus_t status;
+
+        cpu_trace_backward<T>(output_grad, ref_input_grad);
+        status = miopen::trace::TraceBackward(
+            handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get());
+
+        ASSERT_EQ(status, miopenStatusSuccess);
+
+        input_grad.data = handle.Read<T>(input_grad_dev, input_grad.data.size());
+    }
+
+    double GetTolerance()
+    {
+        double tolerance = std::numeric_limits<T>::epsilon() * 10;
+        return tolerance;
+    }
+
+    void Verify()
+    {
+        double threshold = GetTolerance();
+        ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad));
+        auto error = miopen::rms_range(ref_input_grad, input_grad);
+
+        EXPECT_LT(error, threshold * 10) << "Error output beyond tolerance Error: " << error
+                                         << ",  Tolerance: " << threshold * 10;
+    }
+
+    TraceTestCase trace_config;
+
+    tensor<T> input_grad;
+    tensor<T> output_grad;
+
+    tensor<T> ref_input_grad;
+
+    miopen::Allocator::ManageDataPtr input_grad_dev;
+    miopen::Allocator::ManageDataPtr output_grad_dev;
+};