From d197108888a5a37ed7d866f2cf1a25eb9611be8e Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 5 Aug 2024 18:11:48 +0700 Subject: [PATCH 01/38] init --- docs/reference/index.rst | 1 + include/miopen/miopen.h | 77 ++++++++++++++++++++++++++ src/avgpool.cpp | 0 src/avgpool_api.cpp | 113 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 191 insertions(+) create mode 100644 src/avgpool.cpp create mode 100644 src/avgpool_api.cpp diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 90e29ffaa9..9594e00ef0 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -35,3 +35,4 @@ The MIOpen API library is structured as follows: * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) + * :doc:`AvgPool <../doxygen/html/group__avgpool>` (experimental) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 3b9bbeccc1..fda8817e3a 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -70,6 +70,7 @@ * @defgroup SGD * @defgroup getitem * @defgroup ReduceCalculation + * @defgroup avgpool * */ @@ -7621,6 +7622,82 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, // CLOSEOUT GETITEM DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +// avgpool APIs +/** @addtogroup avgpool + * + * @{ + */ + +/*! 
@brief Execute an avgpool forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @param strideDesc Tensor descriptor for stride tensor (input) + * @param stride Data tensor stride (input) + * @param paddingDesc Tensor descriptor for padding tensor (input) + * @param padding Data tensor padding (input) + * @param kinforDesc Tensor descriptor for kinfor tensor (input) + * @param kinfor Data tensor kinfor (input) + * @param count_include_pad When True, will include the zero-padding in the averaging + * calculation (input) + * @param divisor_override If non-zero, will use this value as the divisor, otherwise will + * use the number of elements in the pooling window (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override); + +/*! 
@brief Execute an avgpool backward layer + * + * @param handle MIOpen handle (input) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param output_grad Data tensor output grad (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param input_grad Data tensor input grad (output) + * @param strideDesc Tensor descriptor for stride tensor (input) + * @param stride Data tensor stride (input) + * @param paddingDesc Tensor descriptor for padding tensor (input) + * @param padding Data tensor padding (input) + * @param kinforDesc Tensor descriptor for kinfor tensor (input) + * @param kinfor Data tensor kinfor (input) + * @param count_include_pad When True, will include the zero-padding in the averaging + * calculation (input) + * @param divisor_override If non-zero, will use this value as the divisor, otherwise will + * use the number of elements in the pooling window (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override); +/** @} */ +// CLOSEOUT avgpool DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/avgpool.cpp b/src/avgpool.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp new file mode 100644 index 0000000000..643d494cee --- /dev/null +++ b/src/avgpool_api.cpp @@ -0,0 +1,113 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& strideDesc, + ConstData_t stride, + bool log_target) +{ + const auto problem = avgpool::UnreducedProblemDescription{ + inputDesc, targetDesc, outputGradDesc, log_target, false}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::BwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.targetDesc = &targetDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + tmp.targetGradDesc = &targetGradDesc; + + tmp.input = input; + tmp.target = target; + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + tmp.target_grad = target_grad; + + tmp.log_target = log_target; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& windowInforDesc, + ConstData_t window_infor, + bool log_target) +{ + const auto problem = avgpool::ReducedProblemDescription{ + inputDesc, targetDesc, outputGradDesc, divisor, log_target, false}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::BwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.targetDesc = &targetDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + tmp.targetGradDesc = &targetGradDesc; + + tmp.input = input; + tmp.target = target; + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + 
tmp.target_grad = target_grad; + + tmp.divisor = divisor; + tmp.log_target = log_target; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen From 3c90908ea196e4051d85cd9fe916788d3cce71ac Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 5 Aug 2024 23:30:39 +0700 Subject: [PATCH 02/38] skeleton code --- src/CMakeLists.txt | 8 + src/avgpool.cpp | 134 +++++++++ src/avgpool/problem_description.cpp | 85 ++++++ src/avgpool_api.cpp | 217 +++++++++----- src/include/miopen/avgpool.hpp | 65 ++++ src/include/miopen/avgpool/invoke_params.hpp | 85 ++++++ .../miopen/avgpool/problem_description.hpp | 215 ++++++++++++++ src/include/miopen/avgpool/solvers.hpp | 281 ++++++++++++++++++ src/include/miopen/solver_id.hpp | 3 +- src/kernels/MIOpenAvgPool.cpp | 0 src/solver/avgpool/backward_avgpool_2d.cpp | 0 src/solver/avgpool/backward_avgpool_3d.cpp | 0 src/solver/avgpool/forward_avgpool_2d.cpp | 0 src/solver/avgpool/forward_avgpool_3d.cpp | 0 14 files changed, 1013 insertions(+), 80 deletions(-) create mode 100644 src/avgpool/problem_description.cpp create mode 100644 src/include/miopen/avgpool.hpp create mode 100644 src/include/miopen/avgpool/invoke_params.hpp create mode 100644 src/include/miopen/avgpool/problem_description.hpp create mode 100644 src/include/miopen/avgpool/solvers.hpp create mode 100644 src/kernels/MIOpenAvgPool.cpp create mode 100644 src/solver/avgpool/backward_avgpool_2d.cpp create mode 100644 src/solver/avgpool/backward_avgpool_3d.cpp create mode 100644 src/solver/avgpool/forward_avgpool_2d.cpp create mode 100644 src/solver/avgpool/forward_avgpool_3d.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 77acf3f7d3..ee36c92967 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,6 +89,8 @@ set( MIOpen_Source adam_api.cpp addlayernorm_api.cpp 
api/find2_0_commons.cpp + avgpool_api.cpp + avgpool/problem_description.cpp batch_norm.cpp batch_norm_api.cpp batchnorm/problem_description.cpp @@ -191,6 +193,10 @@ set( MIOpen_Source solver/activ/fwd_1.cpp solver/adam/adam.cpp solver/adam/transformers_adam_w.cpp + solver/avgpool/backward_avgpool_2d.cpp + solver/avgpool/backward_avgpool_3d.cpp + solver/avgpool/forward_avgpool_2d.cpp + solver/avgpool/forward_avgpool_3d.cpp solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp @@ -482,6 +488,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ${GPU_BATCHED_TRANSPOSE_KERNEL_HIP} ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_SOURCE} kernels/MIOpenAdam.cpp + kernels/MIOpenAvgPool.cpp kernels/MIOpenCat.cpp kernels/MIOpenCheckNumerics.cpp kernels/MIOpenBatchNormActivBwdPerAct.cl @@ -626,6 +633,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN activ.cpp adam.cpp addlayernorm.cpp + avgpool.cpp cat.cpp groupnorm.cpp getitem.cpp diff --git a/src/avgpool.cpp b/src/avgpool.cpp index e69de29bb2..15bea1f9d8 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -0,0 +1,134 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + const auto problem = avgpool::FwdProblemDescription{inputDesc, + outputDesc, + strideDesc, + paddingDesc, + kinforDesc, + count_include_pad, + divisor_override}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.strideDesc = &strideDesc; + tmp.paddingDesc = &paddingDesc; + tmp.kinforDesc = &kinforDesc; + + tmp.input = input; + tmp.output = output; + tmp.stride = stride; + tmp.padding = padding; + tmp.kinfor = kinfor; + tmp.count_include_pad = count_include_pad; + tmp.divisor_override = divisor_override; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& 
outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + const auto problem = avgpool::BwdProblemDescription{outputGradDesc, + inputGradDesc, + strideDesc, + paddingDesc, + kinforDesc, + count_include_pad, + divisor_override}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::BwdInvokeParams{}; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + tmp.strideDesc = &strideDesc; + tmp.paddingDesc = &paddingDesc; + tmp.kinforDesc = &kinforDesc; + + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + tmp.stride = stride; + tmp.padding = padding; + tmp.kinfor = kinfor; + tmp.count_include_pad = count_include_pad; + tmp.divisor_override = divisor_override; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/avgpool/problem_description.cpp b/src/avgpool/problem_description.cpp new file mode 100644 index 0000000000..dd2144f429 --- /dev/null +++ b/src/avgpool/problem_description.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include + +#include + +namespace miopen { + +namespace avgpool { + +NetworkConfig FwdProblemDescription::MakeNetworkConfig() const +{ + size_t numel = GetNtotal(); + size_t num_batches = inputDesc.GetLengths()[0]; + size_t num_classes = GetC(); + size_t num_dims = inputDesc.GetNumDims(); + + auto input_dtype = inputDesc.GetType(); + + std::ostringstream ss; + + ss << "avgpool_unreduce"; + ss << "is_fwd" << is_fwd; + ss << "contiguous" << contiguous; + ss << "input_dtype" << input_dtype; + ss << "numel" << numel; + ss << "num_dims" << num_dims; + ss << "num_batches" << num_batches; + ss << "num_classes" << num_classes; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig BwdProblemDescription::MakeNetworkConfig() const +{ + size_t numel = GetNtotal(); + size_t num_batches = inputDesc.GetLengths()[0]; + size_t num_classes = GetC(); + size_t num_dims = inputDesc.GetNumDims(); + + auto input_dtype = inputDesc.GetType(); + + std::ostringstream ss; + + ss << "avgpool_reduce"; + ss << "is_fwd" << is_fwd; + ss << "input_dtype" << input_dtype; + ss << "divisor" << divisor; + ss << "numel" << numel; + ss << "num_dims" << num_dims; + ss << "num_batches" << num_batches; + ss << "num_classes" << num_classes; + + return NetworkConfig{ss.str()}; +} + +} // namespace avgpool + +} // namespace miopen diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index 643d494cee..4e62bd5e7b 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -23,91 +23,150 @@ * SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -namespace miopen { +#include +#include +#include +#include +#include -miopenStatus_t AvgPoolForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const TensorDescriptor& strideDesc, - ConstData_t stride, - bool log_target) +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) { - const auto problem = avgpool::UnreducedProblemDescription{ - inputDesc, targetDesc, outputGradDesc, log_target, false}; - - const auto invoke_params = [&]() { - auto tmp = avgpool::BwdInvokeParams{}; - tmp.inputDesc = &inputDesc; - tmp.targetDesc = &targetDesc; - tmp.outputGradDesc = &outputGradDesc; - tmp.inputGradDesc = &inputGradDesc; - tmp.targetGradDesc = &targetGradDesc; - - tmp.input = input; - tmp.target = target; - tmp.output_grad = output_grad; - tmp.input_grad = input_grad; - tmp.target_grad = target_grad; - - tmp.log_target = log_target; - - return tmp; - }(); - const auto algo = AlgorithmName{"AvgPoolForward"}; - const auto solvers = solver::SolverContainer{}; - - solvers.ExecutePrimitive(handle, problem, algo, invoke_params); - - return miopenStatusSuccess; + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; } -miopenStatus_t AvgPoolBackward(Handle& handle, - const TensorDescriptor& outputGradDesc, - ConstData_t output_grad, - const TensorDescriptor& inputGradDesc, - Data_t input_grad, - const TensorDescriptor& windowInforDesc, - ConstData_t window_infor, - bool log_target) +static void LogCmdAvgPool(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t oDesc, + const bool count_include_pad, + const int32_t divisor_override, + const bool is_fwd) { - const auto problem = avgpool::ReducedProblemDescription{ - inputDesc, targetDesc, 
outputGradDesc, divisor, log_target, false}; - - const auto invoke_params = [&]() { - auto tmp = avgpool::BwdInvokeParams{}; - tmp.inputDesc = &inputDesc; - tmp.targetDesc = &targetDesc; - tmp.outputGradDesc = &outputGradDesc; - tmp.inputGradDesc = &inputGradDesc; - tmp.targetGradDesc = &targetGradDesc; - - tmp.input = input; - tmp.target = target; - tmp.output_grad = output_grad; - tmp.input_grad = input_grad; - tmp.target_grad = target_grad; - - tmp.divisor = divisor; - tmp.log_target = log_target; - - return tmp; - }(); - const auto algo = AlgorithmName{"AvgPoolBackward"}; - const auto solvers = solver::SolverContainer{}; - - solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "avgpoolfp16"; + } + else if(dtype == miopenFloat) + { + ss << "avgpoolfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "avgpoolbfp16"; + } + + MIOPEN_LOG_FUNCTION(xDesc, oDesc, count_include_pad, divisor_override); + ss << " -Is " << miopen::deref(xDesc).GetLengths(); + ss << " -Os " << miopen::deref(oDesc).GetLengths(); + ss << " -Si " << miopen::deref(xDesc).GetStrides(); + ss << " -So " << miopen::deref(oDesc).GetStrides(); + ss << " -Cp " << count_include_pad; + ss << " -Do " << divisor_override; + ss << " -F " << ((is_fwd) ? 
"1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} - return miopenStatusSuccess; +extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + MIOPEN_LOG_FUNCTION(handle, + inputDesc, + input, + outputDesc, + output, + strideDesc, + stride, + paddingDesc, + padding, + kinforDesc, + kinfor, + count_include_pad, + divisor_override); + + LogCmdAvgPool(inputDesc, outputDesc, count_include_pad, divisor_override, true); + return miopen::try_([&] { + miopen::AvgPoolForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + miopen::deref(strideDesc), + DataCast(stride), + miopen::deref(paddingDesc), + DataCast(padding), + miopen::deref(kinforDesc), + DataCast(kinfor), + count_include_pad, + divisor_override); + }); } -} // namespace miopen +extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + MIOPEN_LOG_FUNCTION(handle, + outputGradDesc, + output_grad, + inputGradDesc, + input_grad, + strideDesc, + stride, + paddingDesc, + padding, + kinforDesc, + kinfor, + count_include_pad, + divisor_override); + + LogCmdAvgPool(inputGradDesc, outputGradDesc, count_include_pad, 
divisor_override, false); + return miopen::try_([&] { + miopen::AvgPoolBackward(miopen::deref(handle), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(inputGradDesc), + DataCast(input_grad), + miopen::deref(strideDesc), + DataCast(stride), + miopen::deref(paddingDesc), + DataCast(padding), + miopen::deref(kinforDesc), + DataCast(kinfor), + count_include_pad, + divisor_override); + }); +} diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp new file mode 100644 index 0000000000..1a46b974b2 --- /dev/null +++ b/src/include/miopen/avgpool.hpp @@ -0,0 +1,65 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_AVGPOOL_HPP_ +#define MIOPEN_AVGPOOL_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + bool count_include_pad, + int32_t divisor_override); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + Data_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + bool count_include_pad, + int32_t divisor_override); +} // namespace miopen +#endif // _MIOPEN_AVGPOOL_HPP_ diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp new file mode 100644 index 0000000000..de2e87ea1b --- /dev/null +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include "miopen/common.hpp" +#include +#include + +namespace miopen { +namespace avgpool { + +struct FwdInvokeParams : public miopen::InvokeParams +{ + + FwdInvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + const TensorDescriptor* strideDesc = nullptr; + const TensorDescriptor* paddingDesc = nullptr; + const TensorDescriptor* kinfor = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + ConstData_t stride = nullptr; + ConstData_t padding = nullptr; + ConstData_t kinfo = nullptr; + + const bool count_include_pad = false; + const int32_t divisor_override = 0; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +struct BwdInvokeParams : public miopen::InvokeParams +{ + + BwdInvokeParams() = default; + + const TensorDescriptor* outputGradDesc = nullptr; + const TensorDescriptor* inputGradDesc = nullptr; + const TensorDescriptor* strideDesc = nullptr; + const TensorDescriptor* paddingDesc = nullptr; + const TensorDescriptor* kinfor = nullptr; + + ConstData_t output_grad = nullptr; + Data_t input_grad = nullptr; + ConstData_t stride = nullptr; + ConstData_t padding = nullptr; + ConstData_t kinfo = nullptr; + + const bool count_include_pad = false; + const int32_t divisor_override = 0; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +} // namespace avgpool +} // namespace miopen diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp new file mode 100644 index 0000000000..2b3ec555db --- /dev/null +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -0,0 +1,215 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace avgpool { + +struct ProblemDescription : ProblemDescriptionBase +{ + ProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& targetDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& outputDesc_, + int32_t ignore_index_, + bool is_fwd_) + : inputDesc(inputDesc_), + targetDesc(targetDesc_), + weightDesc(weightDesc_), + outputDesc(outputDesc_), + ignore_index(ignore_index_), + is_fwd(is_fwd_) + { + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetTargetDesc() const { return targetDesc; } + const TensorDescriptor& GetWeightDesc() const { return weightDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + int32_t GetIgnoreIndex() const { return ignore_index; } + + bool IsValidLength() const + { + if(targetDesc.GetLengths()[0] != inputDesc.GetLengths()[0]) + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); + + for(int32_t i = 1; i < targetDesc.GetNumDims(); ++i) + { + if(targetDesc.GetLengths()[i] != inputDesc.GetLengths()[i + 1]) + { + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); + } + } + if(weightDesc.GetLengths()[0] != inputDesc.GetLengths()[1]) + { + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); + } + if(inputDesc.GetLengths().size() > 5) + { + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Do not support Input Tensor dims > 5."); + } + return true; + } + + bool IsValidStride() const + { + auto isRightStride = [](TensorDescriptor td) { + auto strides = td.GetStrides(); + auto lengths = td.GetLengths(); + std::vector> p; + p.reserve(td.GetNumDims()); + std::transform(strides.begin(), + strides.end(), + lengths.begin(), + std::back_inserter(p), + [](size_t 
a, size_t b) { return std::make_pair(a, b); }); + std::sort(p.begin(), p.end()); + for(int i = 1; i < p.size(); ++i) + { + if(p[i].first != p[i - 1].first * p[i - 1].second) + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor strides do not valid."); + } + return true; + }; + return isRightStride(inputDesc) && isRightStride(targetDesc) && isRightStride(outputDesc) && + isRightStride(weightDesc); + } + + bool IsSameType() const + { + if(inputDesc.GetType() != weightDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "NLLLoss: Input and Weight tensors types do not match."); + } + return true; + } + + bool IsAllContiguous() const + { + auto isContiguous = [](TensorDescriptor td) { + size_t s = 1; + for(int i = td.GetNumDims() - 1; i >= 0; --i) + { + if(s != td.GetStrides()[i]) + { + return false; + } + s *= td.GetLengths()[i]; + } + return true; + }; + return isContiguous(inputDesc) && isContiguous(targetDesc) && isContiguous(weightDesc) && + isContiguous(outputDesc); + } + +protected: + TensorDescriptor inputDesc; + TensorDescriptor targetDesc; + TensorDescriptor weightDesc; + TensorDescriptor outputDesc; + + int32_t ignore_index; + bool is_fwd; + + NetworkConfig MakeForwardNetworkConfig() const; +}; + +struct UnreduceProblemDescription : ProblemDescription +{ + UnreduceProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& targetDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& outputDesc_, + int32_t ignore_index_, + bool is_fwd_) + : ProblemDescription( + inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + { + IsSameType(); + IsValidLength(); + IsValidStride(); + } + + size_t GetNtotal() const { return outputDesc.GetElementSize(); } + size_t GetC() const { return weightDesc.GetElementSize(); } + + NetworkConfig MakeNetworkConfig() const override; + +private: + NetworkConfig MakeForwardNetworkConfig() const; +}; + +struct ReduceProblemDescription : ProblemDescription +{ + 
ReduceProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& targetDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& outputDesc_, + int32_t ignore_index_, + float divisor_, + bool is_fwd_) + : ProblemDescription( + inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + { + divisor = divisor_; + IsSameType(); + IsValidLength(); + IsValidStride(); + } + + size_t GetNtotal() const { return targetDesc.GetElementSize(); } + size_t GetC() const { return weightDesc.GetElementSize(); } + + bool IsValidLength() const + { + if(outputDesc.GetNumDims() != 1 || outputDesc.GetLengths()[0] != 1) + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Output Tensor size must be (1)."); + if(!ProblemDescription::IsValidLength()) + return false; + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + float divisor; + NetworkConfig MakeForwardNetworkConfig() const; +}; + +} // namespace avgpool + +} // namespace miopen diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp new file mode 100644 index 0000000000..34adc12b4c --- /dev/null +++ b/src/include/miopen/avgpool/solvers.hpp @@ -0,0 +1,281 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include +#include +#include "miopen/kernel_build_params.hpp" +#include "miopen/kernel_info.hpp" + +#include + +namespace miopen { + +namespace solver { + +const auto make_hip_kernel = [](std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) { + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +}; + +namespace avgpool { + +using NLLLossUnreduce = + NonTunableSolverBase; + +using NLLLossReduce = + NonTunableSolverBase; + +struct NLLLossUnreduceSolver : NLLLossUnreduce +{ + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossReduceSolver : NLLLossReduce +{ + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; +}; + +// FORWARD UNREDUCE +struct NLLLossUnreduceForwardContiguous4d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } 
+ + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceForwardContiguous2d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceForward4d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceForward2d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceForward5d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const 
override; +}; + +// FORWARD REDUCE +struct NLLLossReduceForward5d final : NLLLossReduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; + bool MayNeedWorkspace() const override { return true; } +}; + +// BACKWARD UNREDUCE +struct NLLLossUnreduceBackwardContiguous2d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceBackwardContiguous4d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceBackward4d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct 
NLLLossUnreduceBackward2d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceBackward5d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +// BACKWARD REDUCE +struct NLLLossReduceBackward2d final : NLLLossReduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; +}; + +struct NLLLossReduceBackward5d final : NLLLossReduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; +}; + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 81c15f6bea..194afd79ac 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -59,7 +59,8 
@@ enum class Primitive Mha, Softmax, Adam, - Item + Item, + AvgPool }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp new file mode 100644 index 0000000000..e69de29bb2 From 86a50733653b8cce2fcfccb4d79869385e149181 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 6 Aug 2024 19:12:43 +0700 Subject: [PATCH 03/38] add solver, kernel --- src/avgpool/problem_description.cpp | 66 +++--- src/include/miopen/avgpool.hpp | 2 +- src/include/miopen/avgpool/invoke_params.hpp | 16 +- .../miopen/avgpool/problem_description.hpp | 218 +++++++---------- src/include/miopen/avgpool/solvers.hpp | 220 +++--------------- src/include/miopen/tensor_view_utils.hpp | 1 + src/kernels/tensor_view.hpp | 40 ++++ src/solver/avgpool/backward_avgpool_2d.cpp | 116 +++++++++ src/solver/avgpool/backward_avgpool_3d.cpp | 120 ++++++++++ src/solver/avgpool/forward_avgpool_2d.cpp | 116 +++++++++ src/solver/avgpool/forward_avgpool_3d.cpp | 120 ++++++++++ 11 files changed, 668 insertions(+), 367 deletions(-) diff --git a/src/avgpool/problem_description.cpp b/src/avgpool/problem_description.cpp index dd2144f429..96ecb4bb72 100644 --- a/src/avgpool/problem_description.cpp +++ b/src/avgpool/problem_description.cpp @@ -24,58 +24,68 @@ * *******************************************************************************/ -#include #include #include -#include - 
namespace miopen { namespace avgpool { +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + NetworkConfig FwdProblemDescription::MakeNetworkConfig() const { - size_t numel = GetNtotal(); - size_t num_batches = inputDesc.GetLengths()[0]; - size_t num_classes = GetC(); - size_t num_dims = inputDesc.GetNumDims(); + auto input_size = inputDesc.GetLengths(); + auto output_size = outputDesc.GetLengths(); + auto input_stride = inputDesc.GetStrides(); + auto output_stride = outputDesc.GetStrides(); auto input_dtype = inputDesc.GetType(); std::ostringstream ss; - ss << "avgpool_unreduce"; - ss << "is_fwd" << is_fwd; - ss << "contiguous" << contiguous; - ss << "input_dtype" << input_dtype; - ss << "numel" << numel; - ss << "num_dims" << num_dims; - ss << "num_batches" << num_batches; - ss << "num_classes" << num_classes; + ss << "avgpool_fwd"; + ss << "-input_dtype" << input_dtype; + ss << "-Is" << input_size; + ss << "-Os" << output_size; + ss << "-Si" << input_stride; + ss << "-So" << output_stride; + ss << "-Cp " << count_include_pad; + ss << "-Do " << divisor_override; return NetworkConfig{ss.str()}; } NetworkConfig BwdProblemDescription::MakeNetworkConfig() const { - size_t numel = GetNtotal(); - size_t num_batches = inputDesc.GetLengths()[0]; - size_t num_classes = GetC(); - size_t num_dims = inputDesc.GetNumDims(); + auto input_grad_size = inputGradDesc.GetLengths(); + auto output_grad_size = outputGradDesc.GetLengths(); + auto input_grad_stride = inputGradDesc.GetStrides(); + auto output_grad_stride = outputGradDesc.GetStrides(); - auto input_dtype = inputDesc.GetType(); + auto input_dtype = inputGradDesc.GetType(); std::ostringstream ss; - ss << "avgpool_reduce"; - ss << "is_fwd" << is_fwd; - ss << "input_dtype" << input_dtype; - ss << "divisor" << divisor; - ss << "numel" << numel; - ss << "num_dims" << num_dims; - 
ss << "num_batches" << num_batches; - ss << "num_classes" << num_classes; + ss << "avgpool_bwd"; + ss << "-input_dtype" << input_dtype; + ss << "-dIs" << input_grad_size; + ss << "-dOs" << output_grad_size; + ss << "-dSi" << input_grad_stride; + ss << "-dSo" << output_grad_stride; + ss << "-Cp " << count_include_pad; + ss << "-Do " << divisor_override; return NetworkConfig{ss.str()}; } diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp index 1a46b974b2..617ed56782 100644 --- a/src/include/miopen/avgpool.hpp +++ b/src/include/miopen/avgpool.hpp @@ -50,7 +50,7 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, const TensorDescriptor& outputGradDesc, - Data_t output_grad, + ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad, const TensorDescriptor& strideDesc, diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index de2e87ea1b..b57f8e0edc 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -42,16 +42,16 @@ struct FwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* outputDesc = nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinfor = nullptr; + const TensorDescriptor* kinforDesc = nullptr; ConstData_t input = nullptr; Data_t output = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfo = nullptr; + ConstData_t kinfor = nullptr; - const bool count_include_pad = false; - const int32_t divisor_override = 0; + bool count_include_pad = false; + int32_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } @@ -66,16 +66,16 @@ struct BwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* inputGradDesc = 
nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinfor = nullptr; + const TensorDescriptor* kinforDesc = nullptr; ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfo = nullptr; + ConstData_t kinfor = nullptr; - const bool count_include_pad = false; - const int32_t divisor_override = 0; + bool count_include_pad = false; + int32_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp index 2b3ec555db..9400bd67a0 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -29,8 +29,6 @@ #include #include #include -#include -#include namespace miopen { @@ -40,174 +38,122 @@ namespace avgpool { struct ProblemDescription : ProblemDescriptionBase { - ProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& targetDesc_, - const TensorDescriptor& weightDesc_, - const TensorDescriptor& outputDesc_, - int32_t ignore_index_, - bool is_fwd_) - : inputDesc(inputDesc_), - targetDesc(targetDesc_), - weightDesc(weightDesc_), - outputDesc(outputDesc_), - ignore_index(ignore_index_), - is_fwd(is_fwd_) + ProblemDescription(const TensorDescriptor& strideDesc_, + const TensorDescriptor& paddingDesc_, + const TensorDescriptor& kinforDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) + : strideDesc(strideDesc_), + paddingDesc(paddingDesc_), + kinforDesc(kinforDesc_), + count_include_pad(count_include_pad_), + divisor_override(divisor_override_) { - } - - const TensorDescriptor& GetInputDesc() const { return inputDesc; } - const TensorDescriptor& GetTargetDesc() const { return targetDesc; } - const TensorDescriptor& 
GetWeightDesc() const { return weightDesc; } - const TensorDescriptor& GetOutputDesc() const { return outputDesc; } - int32_t GetIgnoreIndex() const { return ignore_index; } - - bool IsValidLength() const - { - if(targetDesc.GetLengths()[0] != inputDesc.GetLengths()[0]) - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); - - for(int32_t i = 1; i < targetDesc.GetNumDims(); ++i) + if(divisor_override < 0) { - if(targetDesc.GetLengths()[i] != inputDesc.GetLengths()[i + 1]) - { - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); - } + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: divisor_override must be non-negative."); } - if(weightDesc.GetLengths()[0] != inputDesc.GetLengths()[1]) - { - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); - } - if(inputDesc.GetLengths().size() > 5) - { - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Do not support Input Tensor dims > 5."); - } - return true; - } - - bool IsValidStride() const - { - auto isRightStride = [](TensorDescriptor td) { - auto strides = td.GetStrides(); - auto lengths = td.GetLengths(); - std::vector> p; - p.reserve(td.GetNumDims()); - std::transform(strides.begin(), - strides.end(), - lengths.begin(), - std::back_inserter(p), - [](size_t a, size_t b) { return std::make_pair(a, b); }); - std::sort(p.begin(), p.end()); - for(int i = 1; i < p.size(); ++i) - { - if(p[i].first != p[i - 1].first * p[i - 1].second) - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor strides do not valid."); - } - return true; - }; - return isRightStride(inputDesc) && isRightStride(targetDesc) && isRightStride(outputDesc) && - isRightStride(weightDesc); - } - - bool IsSameType() const - { - if(inputDesc.GetType() != weightDesc.GetType()) - { - MIOPEN_THROW(miopenStatusBadParm, - "NLLLoss: Input and Weight tensors types do not match."); - } - return true; - } - - bool IsAllContiguous() const - { - auto isContiguous = [](TensorDescriptor td) { - size_t s = 1; - for(int 
i = td.GetNumDims() - 1; i >= 0; --i) - { - if(s != td.GetStrides()[i]) - { - return false; - } - s *= td.GetLengths()[i]; - } - return true; - }; - return isContiguous(inputDesc) && isContiguous(targetDesc) && isContiguous(weightDesc) && - isContiguous(outputDesc); } protected: - TensorDescriptor inputDesc; - TensorDescriptor targetDesc; - TensorDescriptor weightDesc; - TensorDescriptor outputDesc; - - int32_t ignore_index; - bool is_fwd; + TensorDescriptor strideDesc; + TensorDescriptor paddingDesc; + TensorDescriptor kinforDesc; - NetworkConfig MakeForwardNetworkConfig() const; + bool count_include_pad; + int32_t divisor_override; }; -struct UnreduceProblemDescription : ProblemDescription +struct FwdProblemDescription : ProblemDescription { - UnreduceProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& targetDesc_, - const TensorDescriptor& weightDesc_, - const TensorDescriptor& outputDesc_, - int32_t ignore_index_, - bool is_fwd_) + FwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const TensorDescriptor& strideDesc_, + const TensorDescriptor& paddingDesc_, + const TensorDescriptor& kinforDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) : ProblemDescription( - inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + inputDesc(inputDesc_), + outputDesc(outputDesc_) { - IsSameType(); IsValidLength(); - IsValidStride(); } - size_t GetNtotal() const { return outputDesc.GetElementSize(); } - size_t GetC() const { return weightDesc.GetElementSize(); } + auto GetInputDesc() const { return inputDesc; } + auto GetOutputDesc() const { return outputDesc; } + auto GetNtotal() const { return outputDesc.GetElementSize(); } + + bool IsValidLength() const + { + auto input_dims = inputDesc.GetLengths().size(); + if(outputDesc.GetLengths()[0] != inputDesc.GetLengths()[0] || + 
outputDesc.GetLengths()[1] != inputDesc.GetLengths()[1] || + outputDesc.GetLengths().size() != input_dims) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + if(input_dims != strideDesc.GetElementSize() || + input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + + return true; + } NetworkConfig MakeNetworkConfig() const override; -private: - NetworkConfig MakeForwardNetworkConfig() const; +protected: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; }; -struct ReduceProblemDescription : ProblemDescription +struct BwdProblemDescription : ProblemDescription { - ReduceProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& targetDesc_, - const TensorDescriptor& weightDesc_, - const TensorDescriptor& outputDesc_, - int32_t ignore_index_, - float divisor_, - bool is_fwd_) + BwdProblemDescription(const TensorDescriptor& outputGradDesc_, + const TensorDescriptor& inputGradDesc_, + const TensorDescriptor& strideDesc_, + const TensorDescriptor& paddingDesc_, + const TensorDescriptor& kinforDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) : ProblemDescription( - inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + outputGradDesc(outputGradDesc_), + inputGradDesc(inputGradDesc_) { - divisor = divisor_; - IsSameType(); IsValidLength(); - IsValidStride(); } - size_t GetNtotal() const { return targetDesc.GetElementSize(); } - size_t GetC() const { return weightDesc.GetElementSize(); } + auto GetOutputGradDesc() const { return outputGradDesc; } + auto GetInputGradDesc() const { return inputGradDesc; } + auto GetNtotal() const { return inputGradDesc.GetElementSize(); } bool IsValidLength() const { - if(outputDesc.GetNumDims() != 1 || outputDesc.GetLengths()[0] != 
1) - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Output Tensor size must be (1)."); - if(!ProblemDescription::IsValidLength()) - return false; + auto input_dims = inputGradDesc.GetLengths().size(); + if(outputGradDesc.GetLengths()[0] != inputGradDesc.GetLengths()[0] || + outputGradDesc.GetLengths()[1] != inputGradDesc.GetLengths()[1] || + outputGradDesc.GetLengths().size() != input_dims) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + if(input_dims != strideDesc.GetElementSize() || + input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + return true; } NetworkConfig MakeNetworkConfig() const override; -private: - float divisor; - NetworkConfig MakeForwardNetworkConfig() const; +protected: + TensorDescriptor outputGradDesc; + TensorDescriptor inputGradDesc; }; } // namespace avgpool diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp index 34adc12b4c..5577b9fad6 100644 --- a/src/include/miopen/avgpool/solvers.hpp +++ b/src/include/miopen/avgpool/solvers.hpp @@ -33,8 +33,6 @@ #include "miopen/kernel_build_params.hpp" #include "miopen/kernel_info.hpp" -#include - namespace miopen { namespace solver { @@ -56,222 +54,56 @@ const auto make_hip_kernel = [](std::vector localsize, namespace avgpool { -using NLLLossUnreduce = - NonTunableSolverBase; - -using NLLLossReduce = - NonTunableSolverBase; - -struct NLLLossUnreduceSolver : NLLLossUnreduce -{ - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossReduceSolver : NLLLossReduce -{ - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; -}; - -// FORWARD UNREDUCE -struct NLLLossUnreduceForwardContiguous4d final : NLLLossUnreduceSolver -{ - const std::string& 
SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForwardContiguous2d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForward4d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForward2d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForward5d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const 
miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -// FORWARD REDUCE -struct NLLLossReduceForward5d final : NLLLossReduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - std::size_t - GetWorkspaceSize(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - bool MayNeedWorkspace() const override { return true; } -}; - -// BACKWARD UNREDUCE -struct NLLLossUnreduceBackwardContiguous2d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; +using AvgPoolForward = + NonTunableSolverBase; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; +using AvgPoolBackward = + NonTunableSolverBase; -struct NLLLossUnreduceBackwardContiguous4d final : NLLLossUnreduceSolver +// FORWARD +struct AvgPoolForward2d final : AvgPoolForward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + const miopen::avgpool::FwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const 
miopen::avgpool::FwdProblemDescription& problem) const override; }; -struct NLLLossUnreduceBackward4d final : NLLLossUnreduceSolver +struct AvgPoolForward3d final : AvgPoolForward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + const miopen::avgpool::FwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const override; }; -struct NLLLossUnreduceBackward2d final : NLLLossUnreduceSolver +// BACKWARD +struct AvgPoolBackward2d final : AvgPoolBackward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + const miopen::avgpool::BwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; }; -struct NLLLossUnreduceBackward5d final : NLLLossUnreduceSolver +struct AvgPoolBackward3d final : AvgPoolBackward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - 
ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; + const miopen::avgpool::BwdProblemDescription& problem) const override; -// BACKWARD REDUCE -struct NLLLossReduceBackward2d final : NLLLossReduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; -}; - -struct NLLLossReduceBackward5d final : NLLLossReduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; }; } // namespace avgpool diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index 9f7430ba8a..050d431844 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -29,6 +29,7 @@ #include #include "../../kernels/tensor_view.hpp" +#include "miopen/tensor.hpp" namespace miopen { diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d35bfd93fc..d64dbf21f9 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -72,6 +72,46 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) + { + static_assert(N == 5); + layout[0] = n; + layout[1] = c; + layout[2] = d; + layout[3] = h; + layout[4] = w; + 
} + + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w) + { + static_assert(N == 4); + layout[0] = n; + layout[1] = c; + layout[2] = h; + layout[3] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) + { + static_assert(N == 3); + layout[0] = n; + layout[1] = h; + layout[2] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t w) + { + static_assert(N == 2); + layout[0] = n; + layout[1] = w; + } + + constexpr tensor_layout_t(uint64_t n) + { + static_assert(N == 1); + layout[0] = n; + } + uint64_t layout[N]; }; diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index e69de29bb2..10c9479b0c 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -0,0 +1,116 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_2D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolBackward2d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_BWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward2d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto H = deref(params.inputGradDesc).GetLengths()[2]; + auto W = deref(params.inputGradDesc).GetLengths()[3]; + auto OH = deref(params.outputGradDesc).GetLengths()[2]; + auto OW = deref(params.outputGradDesc).GetLengths()[3]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + H, + W, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index e69de29bb2..b960554348 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -0,0 +1,120 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_3D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolBackward3d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_BWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward3d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto D = deref(params.inputGradDesc).GetLengths()[2]; + auto H = deref(params.inputGradDesc).GetLengths()[3]; + auto W = deref(params.inputGradDesc).GetLengths()[4]; + auto OD = deref(params.outputGradDesc).GetLengths()[2]; + auto OH = deref(params.outputGradDesc).GetLengths()[3]; + auto OW = deref(params.outputGradDesc).GetLengths()[4]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index e69de29bb2..d0e37b5464 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -0,0 +1,116 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_FWD_2D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolForward2d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_FWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward2d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + + auto N = deref(params.inputDesc).GetLengths()[0]; + auto C = deref(params.inputDesc).GetLengths()[1]; + auto H = deref(params.inputDesc).GetLengths()[2]; + auto W = deref(params.inputDesc).GetLengths()[3]; + auto OH = deref(params.outputDesc).GetLengths()[2]; + auto OW = deref(params.outputDesc).GetLengths()[3]; + + kernel(params.input, + params.output, + N, + C, + H, + W, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index e69de29bb2..9dd8c03cba 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -0,0 +1,120 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_FWD_3D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolForward3d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_FWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward3d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<5>(deref(params.outputDesc)); + + auto N = deref(params.inputDesc).GetLengths()[0]; + auto C = deref(params.inputDesc).GetLengths()[1]; + auto D = deref(params.inputDesc).GetLengths()[2]; + auto H = deref(params.inputDesc).GetLengths()[3]; + auto W = deref(params.inputDesc).GetLengths()[4]; + auto OD = deref(params.outputDesc).GetLengths()[2]; + auto OH = deref(params.outputDesc).GetLengths()[3]; + auto OW = deref(params.outputDesc).GetLengths()[4]; + + kernel(params.input, + params.output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen From ca4ad974e8392c209814afa2478af48a3bb2bf1c Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 7 Aug 2024 18:17:48 +0700 Subject: [PATCH 04/38] add gtest --- .../miopen/avgpool/problem_description.hpp | 24 +- src/kernels/MIOpenAvgPool.cpp | 550 ++++++++++++++++++ src/solver/avgpool/forward_avgpool_2d.cpp | 13 +- test/cpu_avgpool.hpp | 426 ++++++++++++++ test/gtest/avgpool.cpp | 163 ++++++ test/gtest/avgpool.hpp | 426 ++++++++++++++ 6 files changed, 1588 insertions(+), 14 deletions(-) create mode 100644 test/cpu_avgpool.hpp create mode 100644 test/gtest/avgpool.cpp create mode 100644 test/gtest/avgpool.hpp diff --git a/src/include/miopen/avgpool/problem_description.hpp 
b/src/include/miopen/avgpool/problem_description.hpp index 9400bd67a0..9166762235 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -92,12 +92,16 @@ struct FwdProblemDescription : ProblemDescription outputDesc.GetLengths()[1] != inputDesc.GetLengths()[1] || outputDesc.GetLengths().size() != input_dims) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input and output tensor sizes do not match."); } - if(input_dims != strideDesc.GetElementSize() || - input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + if(input_dims - 2 != strideDesc.GetElementSize() || + input_dims - 2 != paddingDesc.GetElementSize() || + input_dims - 2 != kinforDesc.GetElementSize()) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input tensor sizes and Kernel size or stride " + "or padding do not match."); } return true; @@ -138,12 +142,16 @@ struct BwdProblemDescription : ProblemDescription outputGradDesc.GetLengths()[1] != inputGradDesc.GetLengths()[1] || outputGradDesc.GetLengths().size() != input_dims) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input grad and output grad tensor sizes do not match."); } - if(input_dims != strideDesc.GetElementSize() || - input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + if(input_dims - 2 != strideDesc.GetElementSize() || + input_dims - 2 != paddingDesc.GetElementSize() || + input_dims - 2 != kinforDesc.GetElementSize()) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input grad tensor sizes and Kernel size or stride or padding do " + "not match."); } return true; diff --git 
a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index e69de29bb2..bcbf4f6c60 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -0,0 +1,550 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +#ifndef INPUT_TYPE +#define INPUT_TYPE float +#endif + +#ifndef OUTPUT_TYPE +#define OUTPUT_TYPE float +#endif + +template +__device__ void avgPoolForward2d(const TI* __restrict__ input, + TO* __restrict__ output, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + FLOAT_ACCUM m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += CVT_FLOAT2ACCUM( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - 
hstart) * (wend - wstart); + } + } + FLOAT_ACCUM val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + avgPoolForward2d(input, + output, + N, + C, + H, + W, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + divisor_override, + input_tv, + output_tv); +} + +template +__device__ void avgPoolForward3d(const TI* __restrict__ input, + TO* __restrict__ output, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + FLOAT_ACCUM sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H 
+ h) * W + w; + sum += CVT_FLOAT2ACCUM( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + FLOAT_ACCUM val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + avgPoolForward3d(input, + output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + divisor_override, + input_tv, + output_tv); +} + +template +__device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * 
blockDim.x; + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + FLOAT_ACCUM grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + avgPoolBackward2d(output_grad, + input_grad, + N, + C, + H, + W, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + 
divisor_override, + output_grad_tv, + input_grad_tv); +} + +template +__device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + + FLOAT_ACCUM grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = 
divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + avgPoolBackward3d(output_grad, + input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + divisor_override, + output_grad_tv, + input_grad_tv); +} diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index d0e37b5464..8b444370a0 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -28,6 +28,7 @@ #include "miopen/execution_context.hpp" #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" +#include #include #include @@ -81,12 +82,12 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); - auto N = deref(params.inputDesc).GetLengths()[0]; - auto C = deref(params.inputDesc).GetLengths()[1]; - auto H = deref(params.inputDesc).GetLengths()[2]; - auto W = deref(params.inputDesc).GetLengths()[3]; - auto OH = deref(params.outputDesc).GetLengths()[2]; - auto OW = deref(params.outputDesc).GetLengths()[3]; + 
size_t N = deref(params.inputDesc).GetLengths()[0]; + size_t C = deref(params.inputDesc).GetLengths()[1]; + size_t H = deref(params.inputDesc).GetLengths()[2]; + size_t W = deref(params.inputDesc).GetLengths()[3]; + size_t OH = deref(params.outputDesc).GetLengths()[2]; + size_t OW = deref(params.outputDesc).GetLengths()[3]; kernel(params.input, params.output, diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp new file mode 100644 index 0000000000..40a67a8d7d --- /dev/null +++ b/test/cpu_avgpool.hpp @@ -0,0 +1,426 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_AVGPOOL_HPP +#define GUARD_CPU_AVGPOOL_HPP + +#include "tensor_holder.hpp" +#include + +template +void cpu_avgpool_forward_2d(tensor input, + tensor& output, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + float m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, H); + wend = std::min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend 
- wstart); + } + } + float val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = + static_cast(val); + } +} + +template +void cpu_avgpool_forward_3d(tensor input, + tensor& output, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + float sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = std::min(dstart + KD, D + pd); + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + 
dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, D); + hend = std::min(hend, H); + wend = std::min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + static_cast(val); + } +} + +template +void cpu_avgpool_backward_2d(tensor output_grad, + tensor& input_grad, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + float grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - 
wstart); + + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, H); + wend = std::min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + static_cast(grad); + } +} + +template +void cpu_avgpool_backward_3d(tensor output_grad, + tensor& input_grad, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + + float grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) 
+ continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = std::min(dstart + KD, D + pd); + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, D); + hend = std::min(hend, H); + wend = std::min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + static_cast(grad); + } +} + +#endif diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp new file mode 100644 index 0000000000..1dd5502339 --- /dev/null +++ b/test/gtest/avgpool.cpp @@ -0,0 +1,163 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "avgpool.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace avgpool { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct GPU_Avgpool_fwd_FP32 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_fwd_FP16 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_fwd_BFP16 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_bwd_FP32 : AvgPoolTestBwd +{ +}; + +struct GPU_Avgpool_bwd_FP16 : AvgPoolTestBwd +{ +}; + +struct GPU_Avgpool_bwd_BFP16 : AvgPoolTestBwd +{ +}; + +} // namespace avgpool +using namespace avgpool; + +// FORWARD TEST +TEST_P(GPU_Avgpool_fwd_FP32, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_fwd_FP16, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); + +// // BACKWARD TEST +// TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) +// { +// if(!MIOPEN_TEST_ALL || +// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) +// { +// RunTest(); +// Verify(); 
+// } +// else +// { +// GTEST_SKIP(); +// } +// }; + +// TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) +// { +// if(!MIOPEN_TEST_ALL || +// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) +// { +// RunTest(); +// Verify(); +// } +// else +// { +// GTEST_SKIP(); +// } +// }; + +// TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) +// { +// if(!MIOPEN_TEST_ALL || +// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) +// { +// RunTest(); +// Verify(); +// } +// else +// { +// GTEST_SKIP(); +// } +// }; + +// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); +// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); +// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp new file mode 100644 index 0000000000..23ec4c1726 --- /dev/null +++ b/test/gtest/avgpool.hpp @@ -0,0 +1,426 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "../driver/tensor_driver.hpp" +#include "cpu_avgpool.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include +#include +#include + +template +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct AvgPoolTestCase +{ + std::vector input_dims; + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + + friend std::ostream& operator<<(std::ostream& os, const AvgPoolTestCase& tc) + { + return os << " input_dims:" << tc.input_dims << " kernel_size:" << tc.kernel_size + << " stride:" << tc.stride << " padding:" << tc.padding + << " ceil_mode:" << tc.ceil_mode << " count_include_pad:" << tc.count_include_pad + << " divisor_override:" << tc.divisor_override; + } + + std::vector GetInput() const { return input_dims; } +}; + +inline std::vector AvgPoolTestConfigs() +{ + return { + {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, true, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 1}, 
+ // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 1}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 1}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, true, 1}, + {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 1}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 1}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 1}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 1}, + }; +} + +// FORWARD TEST +template +struct AvgPoolTestFwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + avgpool_config = GetParam(); + auto in_dim = avgpool_config.GetInput(); + N = in_dim[0]; + C = in_dim[1]; + D = in_dim.size() == 5 ? in_dim[2] : 1; + H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; + W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; + ksize = tensor{in_dim.size() - 2}; + ksize.data = avgpool_config.kernel_size; + stride = tensor{in_dim.size() - 2}; + stride.data = avgpool_config.stride; + padding = tensor{in_dim.size() - 2}; + padding.data = avgpool_config.padding; + ceil_mode = avgpool_config.ceil_mode; + count_include_pad = avgpool_config.count_include_pad; + divisor_override = avgpool_config.divisor_override; + + auto gen_input_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + input = tensor{in_dim}.generate(gen_input_value); + + std::vector out_dim; + if(in_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_dim = {N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_dim = {N, C, OH, OW}; + } + + output = tensor{out_dim}; + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + ref_output = tensor{out_dim}; + std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); + + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); + ksize_dev = handle.Write(ksize.data); + stride_dev = handle.Write(stride.data); + padding_dev = handle.Write(padding.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + auto dims = input.desc.GetNumDims(); + if(dims == 4) + { + cpu_avgpool_forward_2d(input, + ref_output, + N, + C, + H, + W, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + else if(dims == 5) + { + cpu_avgpool_forward_3d(input, + ref_output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize, + stride, + 
padding, + count_include_pad, + divisor_override); + } + status = miopen::AvgPoolForward(handle, + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + stride.desc, + stride_dev.get(), + padding.desc, + padding_dev.get(), + ksize.desc, + ksize_dev.get(), + count_include_pad, + divisor_override); + fflush(stdout); + + ASSERT_EQ(status, miopenStatusSuccess); + + output.data = handle.Read(output_dev, output.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + + auto error = miopen::rms_range(ref_output, output); + + ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); + for(int i = 0; i < 10; ++i) + { + std::cout << "output cpu: " << ref_output[i] << " output gpu: " << output[i] + << std::endl; + } + EXPECT_LT(error, threshold * 10); + } + AvgPoolTestCase avgpool_config; + + tensor input; + tensor output; + tensor ref_output; + tensor ksize; + tensor stride; + tensor padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; + miopen::Allocator::ManageDataPtr ksize_dev; + miopen::Allocator::ManageDataPtr stride_dev; + miopen::Allocator::ManageDataPtr padding_dev; +}; + +// BACKWARD TEST +template +struct AvgPoolTestBwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + avgpool_config = GetParam(); + auto in_grad_dim = avgpool_config.GetInput(); + N = in_grad_dim[0]; + C = in_grad_dim[1]; + D = in_grad_dim.size() == 5 ? in_grad_dim[2] : 1; + H = in_grad_dim.size() == 5 ? in_grad_dim[3] : in_grad_dim[2]; + W = in_grad_dim.size() == 5 ? 
in_grad_dim[4] : in_grad_dim[3]; + ksize = tensor{in_grad_dim.size() - 2}; + ksize.data = avgpool_config.kernel_size; + stride = tensor{in_grad_dim.size() - 2}; + stride.data = avgpool_config.stride; + padding = tensor{in_grad_dim.size() - 2}; + padding.data = avgpool_config.padding; + ceil_mode = avgpool_config.ceil_mode; + count_include_pad = avgpool_config.count_include_pad; + divisor_override = avgpool_config.divisor_override; + + std::vector out_grad_dim; + if(in_grad_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_grad_dim = {N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_grad_dim = {N, C, OH, OW}; + } + auto gen_output_grad_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + output_grad = tensor{out_grad_dim}.generate(gen_output_grad_value); + + input_grad = tensor{in_grad_dim}; + std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits::quiet_NaN()); + + ref_input_grad = tensor{in_grad_dim}; + std::fill( + ref_input_grad.begin(), ref_input_grad.end(), std::numeric_limits::quiet_NaN()); + + output_grad_dev = handle.Write(output_grad.data); + input_grad_dev = handle.Write(input_grad.data); + ksize_dev = handle.Write(ksize.data); + stride_dev = handle.Write(stride.data); + padding_dev = handle.Write(padding.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + auto dims = input_grad.desc.GetNumDims(); + if(dims == 4) + { + cpu_avgpool_backward_2d(output_grad, + ref_input_grad, + N, + C, + H, + W, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + else if(dims == 5) + { + cpu_avgpool_backward_3d(output_grad, + ref_input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + status = miopen::AvgPoolBackward(handle, + output_grad.desc, + output_grad_dev.get(), + input_grad.desc, + input_grad_dev.get(), + stride.desc, + stride_dev.get(), + padding.desc, + padding_dev.get(), + ksize.desc, + ksize_dev.get(), + count_include_pad, + divisor_override); + + ASSERT_EQ(status, miopenStatusSuccess); + + input_grad.data = handle.Read(input_grad_dev, input_grad.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + auto error = miopen::rms_range(ref_input_grad, input_grad); + ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); + EXPECT_LT(error, threshold * 10); + } + AvgPoolTestCase avgpool_config; + + tensor output_grad; + tensor input_grad; + tensor ref_input_grad; + tensor ksize; + tensor stride; + tensor padding; + + bool ceil_mode; + 
bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr output_grad_dev; + miopen::Allocator::ManageDataPtr input_grad_dev; + miopen::Allocator::ManageDataPtr ksize_dev; + miopen::Allocator::ManageDataPtr stride_dev; + miopen::Allocator::ManageDataPtr padding_dev; +}; From 0492fc71c714c320b7d0d53f67030ba8e3fe2a90 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 12 Aug 2024 10:21:50 +0700 Subject: [PATCH 05/38] add driver test --- driver/CMakeLists.txt | 1 + driver/avgpool_driver.hpp | 596 +++++++++++++++++++++ driver/dm_avgpool.cpp | 40 ++ driver/driver.hpp | 5 +- driver/mloAvgPoolHost.hpp | 438 +++++++++++++++ src/kernels/MIOpenAvgPool.cpp | 118 ++-- src/solver/avgpool/backward_avgpool_2d.cpp | 5 + src/solver/avgpool/backward_avgpool_3d.cpp | 5 + src/solver/avgpool/forward_avgpool_2d.cpp | 4 + src/solver/avgpool/forward_avgpool_3d.cpp | 4 + test/cpu_avgpool.hpp | 116 ++-- test/gtest/avgpool.cpp | 92 ++-- test/gtest/avgpool.hpp | 6 - 13 files changed, 1259 insertions(+), 171 deletions(-) create mode 100644 driver/avgpool_driver.hpp create mode 100644 driver/dm_avgpool.cpp create mode 100644 driver/mloAvgPoolHost.hpp diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index cd663eb8b4..385580e2e1 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -32,6 +32,7 @@ add_executable(MIOpenDriver dm_activ.cpp dm_adam.cpp dm_addlayernorm.cpp + dm_avgpool.cpp dm_bnorm.cpp dm_cat.cpp dm_conv.cpp diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp new file mode 100644 index 0000000000..38beba92f1 --- /dev/null +++ b/driver/avgpool_driver.hpp @@ -0,0 +1,596 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_AVGPOOL_DRIVER_HPP +#define GUARD_MIOPEN_AVGPOOL_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "mloAvgPoolHost.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#include +#include +#include +#include +#include + +template +class AvgPoolDriver : public Driver +{ +public: + AvgPoolDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&inputGradDesc); + miopenCreateTensorDescriptor(&outputGradDesc); + miopenCreateTensorDescriptor(&ksizeDesc); + miopenCreateTensorDescriptor(&strideDesc); + miopenCreateTensorDescriptor(&paddingDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + std::vector GetInputTensorDimsFromCmd(const char* param); + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~AvgPoolDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(inputGradDesc); + miopenDestroyTensorDescriptor(outputGradDesc); + miopenDestroyTensorDescriptor(ksizeDesc); + miopenDestroyTensorDescriptor(strideDesc); + miopenDestroyTensorDescriptor(paddingDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t inputGradDesc; + miopenTensorDescriptor_t outputGradDesc; + miopenTensorDescriptor_t 
ksizeDesc; + miopenTensorDescriptor_t strideDesc; + miopenTensorDescriptor_t paddingDesc; + + std::unique_ptr input_dev; + std::unique_ptr output_dev; + std::unique_ptr input_grad_dev; + std::unique_ptr output_grad_dev; + std::unique_ptr ksize_dev; + std::unique_ptr stride_dev; + std::unique_ptr padding_dev; + + std::vector input; + std::vector output; + std::vector output_host; + std::vector input_grad; + std::vector input_grad_host; + std::vector output_grad; + std::vector ksize; + std::vector stride; + std::vector padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + std::vector in_dim; +}; + +template +int AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) +{ + std::string lengthsStr = inflags.GetValueStr(param); + + std::vector lengths; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = lengthsStr.find(',', pos); + while(new_pos != std::string::npos) + { + std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); + + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + return (lengths); +} + +template +int AvgPoolDriver::GetandSetData() +{ + in_dim = GetInputTensorDimsFromCmd("input_dims"); + std::vector ksp_dim = {in_dim.size() - 2}; + ksize = GetInputTensorDimsFromCmd("kernel_size"); + stride = GetInputTensorDimsFromCmd("stride"); + padding = GetInputTensorDimsFromCmd("padding"); + + if(ksize.size() != ksp_dim[0]) + { + int ref = ksp_dim[0] - ksize.size(); + while(ref--) + ksize.push_back(1); + } + if(stride.size() != ksp_dim[0]) + { + int ref = 
ksp_dim[0] - ksize.size(); + while(ref--) + stride.push_back(1); + } + if(padding.size() != ksp_dim[0]) + { + int ref = ksp_dim[0] - ksize.size(); + while(ref--) + padding.push_back(0); + } + + ceil_mode = static_cast(inflags.GetValueInt("ceil_mode")); + count_include_pad = static_cast(inflags.GetValueInt("count_include_pad")); + divisor_override = inflags.GetValueInt("divisor_override"); + + N = in_dim[0]; + C = in_dim[1]; + D = in_dim.size() == 5 ? in_dim[2] : 1; + H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; + W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; + + std::vector out_dim; + if(in_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_dim = std::vector{N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_dim = std::vector{N, C, OH, OW}; + } + SetTensorNd(inputDesc, in_dim, data_type); + SetTensorNd(outputDesc, out_dim, data_type); + SetTensorNd(outputGradDesc, out_dim, data_type); + SetTensorNd(inputGradDesc, in_dim, data_type); + SetTensorNd(ksizeDesc, ksp_dim, miopen_type{}); + SetTensorNd(strideDesc, ksp_dim, miopen_type{}); + SetTensorNd(paddingDesc, ksp_dim, miopen_type{}); + + return miopenStatusSuccess; +} + +template +int 
AvgPoolDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AvgPool (Default=1)", "int"); + inflags.AddInputFlag( + "input_dims", + 'D', + "2,3,7,9", + "The dimensional lengths of the input tensor: N,C,D1,D2,... Example: 2,3,7,9.", + "string"); + inflags.AddInputFlag( + "kernel_size", 'k', "1,1", "The size of the window D1,D2,... Example: 1,1.", "string"); + inflags.AddInputFlag( + "stride", + 's', + "1,1", + "The stride of the window. Default value is kernel_size D1,D2,... Example: 1,1.", + "string"); + inflags.AddInputFlag("padding", + 'p', + "0,0", + "Implicit zero padding to be added on both sides D1,D2,... Example: 0,0.", + "string"); + inflags.AddInputFlag("ceil_mode", + 'c', + "1", + "When 1, will use ceil instead of floor to compute the output shape.", + "int"); + inflags.AddInputFlag("count_include_pad", + 'P', + "0", + "When 1, will include the zero-padding in the averaging calculation.", + "int"); + inflags.AddInputFlag("divisor_override", + 'd', + "0", + "If specified, it will be used as divisor, otherwise size of the pooling " + "region will be used.", + "int"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); + size_t ksize_sz = GetTensorSize(ksizeDesc); + size_t stride_sz = GetTensorSize(strideDesc); + size_t padding_sz = GetTensorSize(paddingDesc); + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + input_grad_dev = 
std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + ksize_dev = std::unique_ptr(new GPUMem(ctx, ksize_sz, sizeof(int32_t))); + stride_dev = std::unique_ptr(new GPUMem(ctx, stride_sz, sizeof(int32_t))); + padding_dev = std::unique_ptr(new GPUMem(ctx, padding_sz, sizeof(int32_t))); + + input = std::vector(input_sz, static_cast(0)); + output = std::vector(output_sz, static_cast(0)); + output_host = std::vector(output_sz, static_cast(0)); + + input_grad = std::vector(input_sz, static_cast(0)); + input_grad_host = std::vector(input_sz, static_cast(0)); + output_grad = std::vector(output_sz, static_cast(0)); + + int status; + + for(int i = 0; i < input_sz; i++) + { + input[i] = prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + } + status = input_dev->ToGPU(q, input.data()); + + status |= output_dev->ToGPU(q, output.data()); + + status |= input_grad_dev->ToGPU(q, input_grad.data()); + + for(int i = 0; i < output_sz; i++) + { + output_grad[i] = prng::gen_A_to_B(static_cast(-1.0), static_cast(1.0)); + } + status |= output_grad_dev->ToGPU(q, output_grad.data()); + + status |= ksize_dev->ToGPU(q, ksize.data()); + + status |= stride_dev->ToGPU(q, stride.data()); + + status |= padding_dev->ToGPU(q, padding.data()); + + if(status != 0) + std::cout << "Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAvgPoolForward(GetHandle(), + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + strideDesc, + stride_dev->GetMem(), + paddingDesc, + padding_dev->GetMem(), + ksizeDesc, + ksize_dev->GetMem(), + count_include_pad, + divisor_override); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + 
if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Forward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward AvgPool Elapsed: %f ms\n", kernel_average_time); + } + + output_dev->FromGPU(GetStream(), output.data()); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunForwardCPU() +{ + if(in_dim.size() == 4) + { + mloAvgPoolForward2dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + else if(in_dim.size() == 5) + { + mloAvgPoolForward3dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunBackwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAvgPoolBackward(GetHandle(), + outputGradDesc, + output_grad_dev->GetMem(), + inputGradDesc, + input_grad_dev->GetMem(), + strideDesc, + stride_dev->GetMem(), + paddingDesc, + padding_dev->GetMem(), + ksizeDesc, + ksize_dev->GetMem(), + count_include_pad, + divisor_override); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Backward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time 
= + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Backward AvgPool Elapsed: %f ms\n", kernel_average_time); + } + + input_grad_dev->FromGPU(GetStream(), input_grad.data()); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunBackwardCPU() +{ + if(in_dim.size() == 4) + { + mloAvgPoolBackward2dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + else if(in_dim.size() == 5) + { + mloAvgPoolBackward3dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + return miopenStatusSuccess; +} + +template +Tref AvgPoolDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int AvgPoolDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(output_host, output); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward AvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Forward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(input_grad_host, input_grad); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Backward AvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Backward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_AVGPOOL_DRIVER_HPP diff --git a/driver/dm_avgpool.cpp b/driver/dm_avgpool.cpp new file mode 100644 index 0000000000..ec0e457056 --- /dev/null +++ b/driver/dm_avgpool.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "avgpool_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "avgpool") + return new AvgPoolDriver(); + if(base_arg == "avgpoolfp16") + return new AvgPoolDriver(); + if(base_arg == "avgpoolbfp16") + return new AvgPoolDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index b23df690d1..bd42f6ee13 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], avgpool[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -206,7 +206,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && - arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "--version") + arg != 
"reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "avgpool" && + arg != "avgpoolfp16" && arg != "avgpoolbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp new file mode 100644 index 0000000000..ad55c53c66 --- /dev/null +++ b/driver/mloAvgPoolHost.hpp @@ -0,0 +1,438 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MLO_AVGPOOLHOST_H_ +#define MLO_AVGPOOLHOST_H_ + +#include +#include + +template +int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return 0; + + float m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { 
+ if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + float val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = + static_cast(val); + } + return 0; +} + +template +int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return 0; + float sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + 
int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + static_cast(val); + } + return 0; +} + +template +int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return 0; + + float grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + 
if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + static_cast(grad); + } + return 0; +} + +template +int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = 
kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return 0; + + float grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + static_cast(grad); + } + return 0; +} + +#endif // MLO_AVGPOOLHOST_H_ diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index bcbf4f6c60..f4a9e95ce1 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -43,15 +43,15 @@ template __device__ void 
avgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, - int32_t* kinfor, - int32_t* stride, - int32_t* padding, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* __restrict__ kinfor, + const int32_t* __restrict__ stride, + const int32_t* __restrict__ padding, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> input_tv, @@ -124,12 +124,12 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -158,14 +158,14 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input template __device__ void avgPoolForward3d(const TI* __restrict__ input, TO* __restrict__ output, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -252,14 +252,14 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -290,12 +290,12 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input template __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, TO* __restrict__ 
input_grad, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -376,12 +376,12 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -410,14 +410,14 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp template __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, TO* __restrict__ input_grad, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -514,14 +514,14 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index 10c9479b0c..b677192b36 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -46,6 +46,11 @@ namespace avgpool { bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { + 
if(problem.GetInputGradDesc().GetNumDims() != 4 || + problem.GetOutputGradDesc().GetNumDims() != 4) + { + return false; + } return true; } diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index b960554348..829511d8cb 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -46,6 +46,11 @@ namespace avgpool { bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { + if(problem.GetInputGradDesc().GetNumDims() != 5 || + problem.GetOutputGradDesc().GetNumDims() != 5) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 8b444370a0..6ddef062da 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -47,6 +47,10 @@ namespace avgpool { bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { + if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index 9dd8c03cba..c1ee497b27 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -46,6 +46,10 @@ namespace avgpool { bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { + if(problem.GetInputDesc().GetNumDims() != 5 || problem.GetOutputDesc().GetNumDims() != 5) + { + return false; + } return true; } diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp index 40a67a8d7d..ef26e17d74 100644 --- a/test/cpu_avgpool.hpp +++ b/test/cpu_avgpool.hpp @@ -32,12 +32,12 @@ template void cpu_avgpool_forward_2d(tensor input, tensor& output, - int32_t N, 
- int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -85,15 +85,15 @@ void cpu_avgpool_forward_2d(tensor input, int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (hend - hstart) * (wend - wstart); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, H); - wend = std::min(wend, W); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) @@ -121,14 +121,14 @@ void cpu_avgpool_forward_2d(tensor input, template void cpu_avgpool_forward_3d(tensor input, tensor& output, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -185,17 +185,17 @@ void cpu_avgpool_forward_3d(tensor input, int32_t dstart = od * sd - pd; int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t dend = std::min(dstart + KD, D + pd); - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = std::max(dstart, 0); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - dend = std::min(dend, D); - hend = std::min(hend, H); - wend = std::min(wend, W); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = 
min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) @@ -222,12 +222,12 @@ void cpu_avgpool_forward_3d(tensor input, template void cpu_avgpool_backward_2d(tensor output_grad, tensor& input_grad, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -275,15 +275,15 @@ void cpu_avgpool_backward_2d(tensor output_grad, int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (hend - hstart) * (wend - wstart); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, H); - wend = std::min(wend, W); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) @@ -315,14 +315,14 @@ void cpu_avgpool_backward_2d(tensor output_grad, template void cpu_avgpool_backward_3d(tensor output_grad, tensor& input_grad, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -385,17 +385,17 @@ void cpu_avgpool_backward_3d(tensor output_grad, int32_t dstart = od * sd - pd; int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t dend = std::min(dstart + KD, D + pd); - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = 
std::max(dstart, 0); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - dend = std::min(dend, D); - hend = std::min(hend, H); - wend = std::min(wend, W); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) { diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp index 1dd5502339..fa002e5610 100644 --- a/test/gtest/avgpool.cpp +++ b/test/gtest/avgpool.cpp @@ -115,49 +115,49 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP32, testing::ValuesIn(AvgPoolT INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); -// // BACKWARD TEST -// TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) -// { -// if(!MIOPEN_TEST_ALL || -// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) -// { -// RunTest(); -// Verify(); -// } -// else -// { -// GTEST_SKIP(); -// } -// }; - -// TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) -// { -// if(!MIOPEN_TEST_ALL || -// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) -// { -// RunTest(); -// Verify(); -// } -// else -// { -// GTEST_SKIP(); -// } -// }; - -// TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) -// { -// if(!MIOPEN_TEST_ALL || -// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) -// { -// RunTest(); -// Verify(); -// } -// else -// { -// GTEST_SKIP(); -// } -// }; - -// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); -// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); -// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); +// BACKWARD TEST +TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) +{ + 
if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index 23ec4c1726..26548e0a12 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -221,7 +221,6 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam count_include_pad, divisor_override); fflush(stdout); - ASSERT_EQ(status, miopenStatusSuccess); output.data = handle.Read(output_dev, output.data.size()); @@ -234,11 +233,6 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam auto error = miopen::rms_range(ref_output, output); ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); - for(int i = 0; i < 10; ++i) - { - std::cout << "output cpu: " << ref_output[i] << " output gpu: " << output[i] - << std::endl; - } EXPECT_LT(error, threshold * 10); } AvgPoolTestCase avgpool_config; From 881e79671935b7cbc6a05ba2cf61ad8749927305 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 16 Aug 2024 11:49:15 +0700 Subject: [PATCH 06/38] change kinfor to ksize --- driver/mloAvgPoolHost.hpp | 28 ++++----- include/miopen/miopen.h | 16 ++--- src/avgpool.cpp | 20 
+++--- src/avgpool_api.cpp | 24 ++++---- src/include/miopen/avgpool.hpp | 8 +-- src/include/miopen/avgpool/invoke_params.hpp | 8 +-- src/kernels/MIOpenAvgPool.cpp | 65 +++++++++++++------- src/solver/avgpool/backward_avgpool_2d.cpp | 38 +++++++++++- src/solver/avgpool/backward_avgpool_3d.cpp | 4 +- src/solver/avgpool/forward_avgpool_2d.cpp | 40 +++++++++++- src/solver/avgpool/forward_avgpool_3d.cpp | 4 +- test/cpu_avgpool.hpp | 28 ++++----- 12 files changed, 187 insertions(+), 96 deletions(-) diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp index ad55c53c66..6980ce968e 100644 --- a/driver/mloAvgPoolHost.hpp +++ b/driver/mloAvgPoolHost.hpp @@ -40,7 +40,7 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, size_t W, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -57,8 +57,8 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -134,7 +134,7 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, size_t OD, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -152,9 +152,9 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -236,7 +236,7 @@ 
int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes size_t W, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -253,8 +253,8 @@ int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -334,7 +334,7 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes size_t OD, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -352,9 +352,9 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index fda8817e3a..18b0bcafdf 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7640,8 +7640,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, * @param stride Data tensor stride (output) * @param paddingDesc Tensor descriptor for padding tensor (input) * @param padding Data tensor padding (output) - * @param kinforDesc Tensor descriptor for kinfor tensor (input) - * @param kinfor Data tensor kinfor (output) + * @param ksizeDesc Tensor descriptor for ksize tensor (input) + * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the 
averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7657,8 +7657,8 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* ksize, const bool count_include_pad, const int32_t divisor_override); @@ -7673,8 +7673,8 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, * @param stride Data tensor stride (output) * @param paddingDesc Tensor descriptor for padding tensor (input) * @param padding Data tensor padding (output) - * @param kinforDesc Tensor descriptor for kinfor tensor (input) - * @param kinfor Data tensor kinfor (output) + * @param ksizeDesc Tensor descriptor for ksize tensor (input) + * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7690,8 +7690,8 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* ksize, const bool count_include_pad, const int32_t divisor_override); /** @} */ diff --git a/src/avgpool.cpp b/src/avgpool.cpp index 15bea1f9d8..87ff481c6a 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -42,8 +42,8 @@ miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, const bool count_include_pad, const int32_t divisor_override) 
{ @@ -51,7 +51,7 @@ miopenStatus_t AvgPoolForward(Handle& handle, outputDesc, strideDesc, paddingDesc, - kinforDesc, + ksizeDesc, count_include_pad, divisor_override}; @@ -61,13 +61,13 @@ miopenStatus_t AvgPoolForward(Handle& handle, tmp.outputDesc = &outputDesc; tmp.strideDesc = &strideDesc; tmp.paddingDesc = &paddingDesc; - tmp.kinforDesc = &kinforDesc; + tmp.ksizeDesc = &ksizeDesc; tmp.input = input; tmp.output = output; tmp.stride = stride; tmp.padding = padding; - tmp.kinfor = kinfor; + tmp.ksize = ksize; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; @@ -91,8 +91,8 @@ miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, const bool count_include_pad, const int32_t divisor_override) { @@ -100,7 +100,7 @@ miopenStatus_t AvgPoolBackward(Handle& handle, inputGradDesc, strideDesc, paddingDesc, - kinforDesc, + ksizeDesc, count_include_pad, divisor_override}; @@ -110,13 +110,13 @@ miopenStatus_t AvgPoolBackward(Handle& handle, tmp.inputGradDesc = &inputGradDesc; tmp.strideDesc = &strideDesc; tmp.paddingDesc = &paddingDesc; - tmp.kinforDesc = &kinforDesc; + tmp.ksizeDesc = &ksizeDesc; tmp.output_grad = output_grad; tmp.input_grad = input_grad; tmp.stride = stride; tmp.padding = padding; - tmp.kinfor = kinfor; + tmp.ksize = ksize; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index 4e62bd5e7b..fa2e8a957c 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -88,8 +88,8 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* 
ksize, const bool count_include_pad, const int32_t divisor_override) { @@ -102,8 +102,8 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, stride, paddingDesc, padding, - kinforDesc, - kinfor, + ksizeDesc, + ksize, count_include_pad, divisor_override); @@ -118,8 +118,8 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, DataCast(stride), miopen::deref(paddingDesc), DataCast(padding), - miopen::deref(kinforDesc), - DataCast(kinfor), + miopen::deref(ksizeDesc), + DataCast(ksize), count_include_pad, divisor_override); }); @@ -134,8 +134,8 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* ksize, const bool count_include_pad, const int32_t divisor_override) { @@ -148,8 +148,8 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, stride, paddingDesc, padding, - kinforDesc, - kinfor, + ksizeDesc, + ksize, count_include_pad, divisor_override); @@ -164,8 +164,8 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, DataCast(stride), miopen::deref(paddingDesc), DataCast(padding), - miopen::deref(kinforDesc), - DataCast(kinfor), + miopen::deref(ksizeDesc), + DataCast(ksize), count_include_pad, divisor_override); }); diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp index 617ed56782..9210e45e3a 100644 --- a/src/include/miopen/avgpool.hpp +++ b/src/include/miopen/avgpool.hpp @@ -43,8 +43,8 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, bool count_include_pad, int32_t divisor_override); @@ -57,8 +57,8 @@ 
MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, bool count_include_pad, int32_t divisor_override); } // namespace miopen diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index b57f8e0edc..91a70725ee 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -42,13 +42,13 @@ struct FwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* outputDesc = nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinforDesc = nullptr; + const TensorDescriptor* ksizeDesc = nullptr; ConstData_t input = nullptr; Data_t output = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfor = nullptr; + ConstData_t ksize = nullptr; bool count_include_pad = false; int32_t divisor_override = 0; @@ -66,13 +66,13 @@ struct BwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* inputGradDesc = nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinforDesc = nullptr; + const TensorDescriptor* ksizeDesc = nullptr; ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfor = nullptr; + ConstData_t ksize = nullptr; bool count_include_pad = false; int32_t divisor_override = 0; diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index f4a9e95ce1..6d94bffac1 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -40,6 +40,27 @@ #define OUTPUT_TYPE float #endif +// template +// struct blockNd +// { +// T val[Nd]; +// }; + +// template 
+// __device__ void avgPoolForwardNdNew(const TI* __restrict__ input, +// TO* __restrict__ output, +// size_t N, +// size_t C, +// const blockNd sizeIn, +// const blockNd sizeOut, +// const blockNd ksize, +// const blockNd stride, +// const blockNd padding, +// bool count_include_pad, +// int32_t divisor_override, +// tensor_view_t input_tv, +// tensor_view_t output_tv); + template __device__ void avgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, @@ -49,7 +70,7 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, size_t W, size_t OH, size_t OW, - const int32_t* __restrict__ kinfor, + const int32_t* __restrict__ ksize, const int32_t* __restrict__ stride, const int32_t* __restrict__ padding, bool count_include_pad, @@ -61,8 +82,8 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -130,7 +151,7 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input size_t W, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -146,7 +167,7 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input W, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, @@ -166,7 +187,7 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -179,9 +200,9 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = 
kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -260,7 +281,7 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -278,7 +299,7 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input OD, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, @@ -296,7 +317,7 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, size_t W, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -308,8 +329,8 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -382,7 +403,7 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp size_t W, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -398,7 +419,7 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp W, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, @@ -418,7 +439,7 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -431,9 +452,9 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - 
int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -522,7 +543,7 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -540,7 +561,7 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp OD, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index b677192b36..4fe9d5bc76 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_BWD_2D 1024 +#define LOCAL_SIZE_BWD_2D 256 namespace miopen { @@ -43,6 +43,36 @@ namespace solver { namespace avgpool { +bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + return false; + } + else if(dtype == miopenHalf) + { + if(in_over_out < 2 && in_nelems >= 11075584) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + { + return true; + } + } + return false; +} + bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { @@ -51,6 +81,10 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, { return false; } + if(!IsOverRocm(problem)) + { + return 
false; + } return true; } @@ -101,7 +135,7 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 829511d8cb..6897097955 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_BWD_3D 1024 +#define LOCAL_SIZE_BWD_3D 256 namespace miopen { @@ -105,7 +105,7 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 6ddef062da..3e70264097 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -36,7 +36,7 @@ #include #include -#define LOCAL_SIZE_FWD_2D 1024 +#define LOCAL_SIZE_FWD_2D 256 namespace miopen { @@ -44,6 +44,38 @@ namespace solver { namespace avgpool { +bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +{ + auto dtype = problem.GetOutputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) + { + return true; + } + } + else if(dtype == miopenHalf) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 6000000) + { + return true; + } + } + return false; +} + bool 
AvgPoolForward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { @@ -51,6 +83,10 @@ bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, { return false; } + if(!IsOverRocm(problem)) + { + return false; + } return true; } @@ -101,7 +137,7 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index c1ee497b27..088aac6dca 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_FWD_3D 1024 +#define LOCAL_SIZE_FWD_3D 256 namespace miopen { @@ -104,7 +104,7 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp index ef26e17d74..5b91033633 100644 --- a/test/cpu_avgpool.hpp +++ b/test/cpu_avgpool.hpp @@ -38,7 +38,7 @@ void cpu_avgpool_forward_2d(tensor input, size_t W, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -55,8 +55,8 @@ void cpu_avgpool_forward_2d(tensor input, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -129,7 +129,7 @@ void cpu_avgpool_forward_3d(tensor input, size_t OD, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -147,9 +147,9 @@ void cpu_avgpool_forward_3d(tensor input, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / 
OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -228,7 +228,7 @@ void cpu_avgpool_backward_2d(tensor output_grad, size_t W, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -245,8 +245,8 @@ void cpu_avgpool_backward_2d(tensor output_grad, int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -323,7 +323,7 @@ void cpu_avgpool_backward_3d(tensor output_grad, size_t OD, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -341,9 +341,9 @@ void cpu_avgpool_backward_3d(tensor output_grad, int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; From 36128975121554bdd9336656f7781ddee410605f Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 19 Aug 2024 16:57:51 +0700 Subject: [PATCH 07/38] change params --- driver/avgpool_driver.hpp | 95 ++++----- include/miopen/miopen.h | 42 ++-- src/avgpool.cpp | 84 ++++---- src/avgpool_api.cpp | 90 +++++---- src/include/miopen/avgpool.hpp | 30 +-- src/include/miopen/avgpool/invoke_params.hpp | 42 ++-- .../miopen/avgpool/problem_description.hpp | 44 +---- src/kernels/MIOpenAvgPool.cpp | 183 +++++++++--------- src/solver/avgpool/backward_avgpool_2d.cpp | 17 +- src/solver/avgpool/backward_avgpool_3d.cpp | 52 
++++- src/solver/avgpool/forward_avgpool_2d.cpp | 17 +- src/solver/avgpool/forward_avgpool_3d.cpp | 48 ++++- 12 files changed, 409 insertions(+), 335 deletions(-) diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index 38beba92f1..ff7d04edd5 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -52,9 +52,6 @@ class AvgPoolDriver : public Driver miopenCreateTensorDescriptor(&outputDesc); miopenCreateTensorDescriptor(&inputGradDesc); miopenCreateTensorDescriptor(&outputGradDesc); - miopenCreateTensorDescriptor(&ksizeDesc); - miopenCreateTensorDescriptor(&strideDesc); - miopenCreateTensorDescriptor(&paddingDesc); data_type = miopen_type{}; } @@ -83,9 +80,6 @@ class AvgPoolDriver : public Driver miopenDestroyTensorDescriptor(outputDesc); miopenDestroyTensorDescriptor(inputGradDesc); miopenDestroyTensorDescriptor(outputGradDesc); - miopenDestroyTensorDescriptor(ksizeDesc); - miopenDestroyTensorDescriptor(strideDesc); - miopenDestroyTensorDescriptor(paddingDesc); } private: @@ -97,17 +91,11 @@ class AvgPoolDriver : public Driver miopenTensorDescriptor_t outputDesc; miopenTensorDescriptor_t inputGradDesc; miopenTensorDescriptor_t outputGradDesc; - miopenTensorDescriptor_t ksizeDesc; - miopenTensorDescriptor_t strideDesc; - miopenTensorDescriptor_t paddingDesc; std::unique_ptr input_dev; std::unique_ptr output_dev; std::unique_ptr input_grad_dev; std::unique_ptr output_grad_dev; - std::unique_ptr ksize_dev; - std::unique_ptr stride_dev; - std::unique_ptr padding_dev; std::vector input; std::vector output; @@ -172,29 +160,29 @@ std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char template int AvgPoolDriver::GetandSetData() { - in_dim = GetInputTensorDimsFromCmd("input_dims"); - std::vector ksp_dim = {in_dim.size() - 2}; - ksize = GetInputTensorDimsFromCmd("kernel_size"); - stride = GetInputTensorDimsFromCmd("stride"); - padding = GetInputTensorDimsFromCmd("padding"); + in_dim = GetInputTensorDimsFromCmd("input_dims"); + int 
ksp_dim = in_dim.size() - 2; + ksize = GetInputTensorDimsFromCmd("kernel_size"); + stride = GetInputTensorDimsFromCmd("stride"); + padding = GetInputTensorDimsFromCmd("padding"); - if(ksize.size() != ksp_dim[0]) + if(ksize.size() != ksp_dim) { - int ref = ksp_dim[0] - ksize.size(); - while(ref--) - ksize.push_back(1); + int ref = ksp_dim - ksize.size(); + while((ref--) != 0) + ksize.push_back(ksize[0]); } - if(stride.size() != ksp_dim[0]) + if(stride.size() != ksp_dim) { - int ref = ksp_dim[0] - ksize.size(); - while(ref--) - stride.push_back(1); + int ref = ksp_dim - stride.size(); + while((ref--) != 0) + stride.push_back(stride[0]); } - if(padding.size() != ksp_dim[0]) + if(padding.size() != ksp_dim) { - int ref = ksp_dim[0] - ksize.size(); - while(ref--) - padding.push_back(0); + int ref = ksp_dim - padding.size(); + while((ref--) != 0) + padding.push_back(padding[0]); } ceil_mode = static_cast(inflags.GetValueInt("ceil_mode")); @@ -242,9 +230,6 @@ int AvgPoolDriver::GetandSetData() SetTensorNd(outputDesc, out_dim, data_type); SetTensorNd(outputGradDesc, out_dim, data_type); SetTensorNd(inputGradDesc, in_dim, data_type); - SetTensorNd(ksizeDesc, ksp_dim, miopen_type{}); - SetTensorNd(strideDesc, ksp_dim, miopen_type{}); - SetTensorNd(paddingDesc, ksp_dim, miopen_type{}); return miopenStatusSuccess; } @@ -301,11 +286,8 @@ int AvgPoolDriver::AddCmdLineArgs() template int AvgPoolDriver::AllocateBuffersAndCopy() { - size_t input_sz = GetTensorSize(inputDesc); - size_t output_sz = GetTensorSize(outputDesc); - size_t ksize_sz = GetTensorSize(ksizeDesc); - size_t stride_sz = GetTensorSize(strideDesc); - size_t padding_sz = GetTensorSize(paddingDesc); + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); uint32_t ctx = 0; @@ -313,9 +295,6 @@ int AvgPoolDriver::AllocateBuffersAndCopy() output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); input_grad_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); 
output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); - ksize_dev = std::unique_ptr(new GPUMem(ctx, ksize_sz, sizeof(int32_t))); - stride_dev = std::unique_ptr(new GPUMem(ctx, stride_sz, sizeof(int32_t))); - padding_dev = std::unique_ptr(new GPUMem(ctx, padding_sz, sizeof(int32_t))); input = std::vector(input_sz, static_cast(0)); output = std::vector(output_sz, static_cast(0)); @@ -343,12 +322,6 @@ int AvgPoolDriver::AllocateBuffersAndCopy() } status |= output_grad_dev->ToGPU(q, output_grad.data()); - status |= ksize_dev->ToGPU(q, ksize.data()); - - status |= stride_dev->ToGPU(q, stride.data()); - - status |= padding_dev->ToGPU(q, padding.data()); - if(status != 0) std::cout << "Error copying data to GPU\n" << std::endl; @@ -371,12 +344,15 @@ int AvgPoolDriver::RunForwardGPU() input_dev->GetMem(), outputDesc, output_dev->GetMem(), - strideDesc, - stride_dev->GetMem(), - paddingDesc, - padding_dev->GetMem(), - ksizeDesc, - ksize_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? padding[1] : padding[0], + padding.size() == 3 ? padding[2] : padding[1], count_include_pad, divisor_override); @@ -464,12 +440,15 @@ int AvgPoolDriver::RunBackwardGPU() output_grad_dev->GetMem(), inputGradDesc, input_grad_dev->GetMem(), - strideDesc, - stride_dev->GetMem(), - paddingDesc, - padding_dev->GetMem(), - ksizeDesc, - ksize_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? 
padding[1] : padding[0], + padding.size() == 3 ? padding[2] : padding[1], count_include_pad, divisor_override); diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 18b0bcafdf..ea44de92d5 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7636,12 +7636,6 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, * @param input Data tensor input (input) * @param outputDesc Tensor descriptor for output tensor (input) * @param output Data tensor output (output) - * @param strideDesc Tensor descriptor for stride tensor (input) - * @param stride Data tensor stride (output) - * @param paddingDesc Tensor descriptor for padding tensor (input) - * @param padding Data tensor padding (output) - * @param ksizeDesc Tensor descriptor for ksize tensor (input) - * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7653,12 +7647,15 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* input, const miopenTensorDescriptor_t outputDesc, void* output, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override); @@ -7669,12 +7666,6 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, * @param output_grad Data tensor output grad (input) * @param inputGradDesc Tensor descriptor for input grad tensor (input) * @param input_grad Data tensor input grad (output) - * @param strideDesc Tensor descriptor for stride tensor 
(input) - * @param stride Data tensor stride (output) - * @param paddingDesc Tensor descriptor for padding tensor (input) - * @param padding Data tensor padding (output) - * @param ksizeDesc Tensor descriptor for ksize tensor (input) - * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7686,12 +7677,15 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* output_grad, const miopenTensorDescriptor_t inputGradDesc, void* input_grad, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override); /** @} */ diff --git a/src/avgpool.cpp b/src/avgpool.cpp index 87ff481c6a..323f01c90e 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -38,36 +38,37 @@ miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t input, const TensorDescriptor& outputDesc, Data_t output, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { - const auto problem = avgpool::FwdProblemDescription{inputDesc, - outputDesc, - strideDesc, - paddingDesc, - ksizeDesc, - count_include_pad, - divisor_override}; + const auto problem = + 
avgpool::FwdProblemDescription{inputDesc, outputDesc, count_include_pad, divisor_override}; const auto invoke_params = [&]() { - auto tmp = avgpool::FwdInvokeParams{}; - tmp.inputDesc = &inputDesc; - tmp.outputDesc = &outputDesc; - tmp.strideDesc = &strideDesc; - tmp.paddingDesc = &paddingDesc; - tmp.ksizeDesc = &ksizeDesc; + auto tmp = avgpool::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; tmp.input = input; tmp.output = output; - tmp.stride = stride; - tmp.padding = padding; - tmp.ksize = ksize; + tmp.KD = KD; + tmp.KH = KH; + tmp.KW = KW; + tmp.SD = SD; + tmp.SH = SH; + tmp.SW = SW; + tmp.PD = PD; + tmp.PH = PH; + tmp.PW = PW; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; @@ -87,36 +88,37 @@ miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { - const auto problem = avgpool::BwdProblemDescription{outputGradDesc, - inputGradDesc, - strideDesc, - paddingDesc, - ksizeDesc, - count_include_pad, - divisor_override}; + const auto problem = avgpool::BwdProblemDescription{ + outputGradDesc, inputGradDesc, count_include_pad, divisor_override}; const auto invoke_params = [&]() { auto tmp = avgpool::BwdInvokeParams{}; tmp.outputGradDesc = &outputGradDesc; tmp.inputGradDesc = &inputGradDesc; - tmp.strideDesc = &strideDesc; - tmp.paddingDesc = &paddingDesc; - tmp.ksizeDesc = &ksizeDesc; tmp.output_grad = output_grad; tmp.input_grad = input_grad; - tmp.stride = stride; - tmp.padding = padding; - tmp.ksize = 
ksize; + tmp.KD = KD; + tmp.KH = KH; + tmp.KW = KW; + tmp.SD = SD; + tmp.SH = SH; + tmp.SW = SW; + tmp.PD = PD; + tmp.PH = PH; + tmp.PW = PW; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index fa2e8a957c..32e1f12f92 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -84,12 +84,15 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* input, const miopenTensorDescriptor_t outputDesc, void* output, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { @@ -98,12 +101,15 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, input, outputDesc, output, - strideDesc, - stride, - paddingDesc, - padding, - ksizeDesc, - ksize, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); @@ -114,12 +120,15 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, DataCast(input), miopen::deref(outputDesc), DataCast(output), - miopen::deref(strideDesc), - DataCast(stride), - miopen::deref(paddingDesc), - DataCast(padding), - miopen::deref(ksizeDesc), - DataCast(ksize), + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); }); @@ -130,12 +139,15 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* output_grad, const miopenTensorDescriptor_t inputGradDesc, void* input_grad, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const 
miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { @@ -144,12 +156,15 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, output_grad, inputGradDesc, input_grad, - strideDesc, - stride, - paddingDesc, - padding, - ksizeDesc, - ksize, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); @@ -160,12 +175,15 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, DataCast(output_grad), miopen::deref(inputGradDesc), DataCast(input_grad), - miopen::deref(strideDesc), - DataCast(stride), - miopen::deref(paddingDesc), - DataCast(padding), - miopen::deref(ksizeDesc), - DataCast(ksize), + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); }); diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp index 9210e45e3a..00a2717ff6 100644 --- a/src/include/miopen/avgpool.hpp +++ b/src/include/miopen/avgpool.hpp @@ -39,12 +39,15 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t input, const TensorDescriptor& outputDesc, Data_t output, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + int32_t KD, + int32_t KH, + int32_t KW, + int32_t SD, + int32_t SH, + int32_t SW, + int32_t PD, + int32_t PH, + int32_t PW, bool count_include_pad, int32_t divisor_override); @@ -53,12 +56,15 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t 
padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + int32_t KD, + int32_t KH, + int32_t KW, + int32_t SD, + int32_t SH, + int32_t SW, + int32_t PD, + int32_t PH, + int32_t PW, bool count_include_pad, int32_t divisor_override); } // namespace miopen diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index 91a70725ee..e8bd9256ac 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -38,18 +38,22 @@ struct FwdInvokeParams : public miopen::InvokeParams FwdInvokeParams() = default; - const TensorDescriptor* inputDesc = nullptr; - const TensorDescriptor* outputDesc = nullptr; - const TensorDescriptor* strideDesc = nullptr; - const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* ksizeDesc = nullptr; - - ConstData_t input = nullptr; - Data_t output = nullptr; - ConstData_t stride = nullptr; - ConstData_t padding = nullptr; - ConstData_t ksize = nullptr; - + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + ConstData_t ksize = nullptr; + + int32_t KD = 0; + int32_t KH = 0; + int32_t KW = 0; + int32_t SD = 0; + int32_t SH = 0; + int32_t SW = 0; + int32_t PD = 0; + int32_t PH = 0; + int32_t PW = 0; bool count_include_pad = false; int32_t divisor_override = 0; @@ -64,16 +68,20 @@ struct BwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* outputGradDesc = nullptr; const TensorDescriptor* inputGradDesc = nullptr; - const TensorDescriptor* strideDesc = nullptr; - const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* ksizeDesc = nullptr; ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; - ConstData_t stride = nullptr; - ConstData_t padding = nullptr; ConstData_t ksize = nullptr; + int32_t KD = 0; + int32_t KH = 0; + int32_t KW = 0; + int32_t SD = 0; + int32_t SH = 0; + int32_t SW = 
0; + int32_t PD = 0; + int32_t PH = 0; + int32_t PW = 0; bool count_include_pad = false; int32_t divisor_override = 0; diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp index 9166762235..2dee6a30ea 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -38,16 +38,8 @@ namespace avgpool { struct ProblemDescription : ProblemDescriptionBase { - ProblemDescription(const TensorDescriptor& strideDesc_, - const TensorDescriptor& paddingDesc_, - const TensorDescriptor& kinforDesc_, - const bool count_include_pad_, - const int32_t divisor_override_) - : strideDesc(strideDesc_), - paddingDesc(paddingDesc_), - kinforDesc(kinforDesc_), - count_include_pad(count_include_pad_), - divisor_override(divisor_override_) + ProblemDescription(const bool count_include_pad_, const int32_t divisor_override_) + : count_include_pad(count_include_pad_), divisor_override(divisor_override_) { if(divisor_override < 0) { @@ -56,10 +48,6 @@ struct ProblemDescription : ProblemDescriptionBase } protected: - TensorDescriptor strideDesc; - TensorDescriptor paddingDesc; - TensorDescriptor kinforDesc; - bool count_include_pad; int32_t divisor_override; }; @@ -68,13 +56,9 @@ struct FwdProblemDescription : ProblemDescription { FwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_, - const TensorDescriptor& strideDesc_, - const TensorDescriptor& paddingDesc_, - const TensorDescriptor& kinforDesc_, const bool count_include_pad_, const int32_t divisor_override_) - : ProblemDescription( - strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + : ProblemDescription(count_include_pad_, divisor_override_), inputDesc(inputDesc_), outputDesc(outputDesc_) { @@ -95,14 +79,6 @@ struct FwdProblemDescription : ProblemDescription MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Input and output tensor sizes do not match."); } 
- if(input_dims - 2 != strideDesc.GetElementSize() || - input_dims - 2 != paddingDesc.GetElementSize() || - input_dims - 2 != kinforDesc.GetElementSize()) - { - MIOPEN_THROW(miopenStatusBadParm, - "AvgPool: Input tensor sizes and Kernel size or stride " - "or padding do not match."); - } return true; } @@ -118,13 +94,9 @@ struct BwdProblemDescription : ProblemDescription { BwdProblemDescription(const TensorDescriptor& outputGradDesc_, const TensorDescriptor& inputGradDesc_, - const TensorDescriptor& strideDesc_, - const TensorDescriptor& paddingDesc_, - const TensorDescriptor& kinforDesc_, const bool count_include_pad_, const int32_t divisor_override_) - : ProblemDescription( - strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + : ProblemDescription(count_include_pad_, divisor_override_), outputGradDesc(outputGradDesc_), inputGradDesc(inputGradDesc_) { @@ -145,14 +117,6 @@ struct BwdProblemDescription : ProblemDescription MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Input grad and output grad tensor sizes do not match."); } - if(input_dims - 2 != strideDesc.GetElementSize() || - input_dims - 2 != paddingDesc.GetElementSize() || - input_dims - 2 != kinforDesc.GetElementSize()) - { - MIOPEN_THROW(miopenStatusBadParm, - "AvgPool: Input grad tensor sizes and Kernel size or stride or padding do " - "not match."); - } return true; } diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index 6d94bffac1..32ac270b37 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -40,27 +40,6 @@ #define OUTPUT_TYPE float #endif -// template -// struct blockNd -// { -// T val[Nd]; -// }; - -// template -// __device__ void avgPoolForwardNdNew(const TI* __restrict__ input, -// TO* __restrict__ output, -// size_t N, -// size_t C, -// const blockNd sizeIn, -// const blockNd sizeOut, -// const blockNd ksize, -// const blockNd stride, -// const blockNd padding, -// bool count_include_pad, -// int32_t 
divisor_override, -// tensor_view_t input_tv, -// tensor_view_t output_tv); - template __device__ void avgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, @@ -70,9 +49,12 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, size_t W, size_t OH, size_t OW, - const int32_t* __restrict__ ksize, - const int32_t* __restrict__ stride, - const int32_t* __restrict__ padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> input_tv, @@ -82,19 +64,15 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; if(n >= N) return; FLOAT_ACCUM m = 0; +#pragma unroll for(int32_t r = 0; r < R; ++r) { +#pragma unroll for(int32_t s = 0; s < S; ++s) { // input idx : (n, c, h, w) @@ -151,9 +129,12 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input size_t W, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> input_tv, @@ -167,9 +148,12 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input W, OH, OW, - ksize, - stride, - padding, + R, + S, + sh, + sw, + ph, + pw, count_include_pad, divisor_override, input_tv, @@ -187,9 +171,15 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, 
tensor_view_t<5> input_tv, @@ -200,19 +190,11 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; if(n >= N) return; FLOAT_ACCUM sum = 0; +#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) @@ -281,9 +263,15 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<5> input_tv, @@ -299,9 +287,15 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input OD, OH, OW, - ksize, - stride, - padding, + KD, + R, + S, + sd, + sh, + sw, + pd, + ph, + pw, count_include_pad, divisor_override, input_tv, @@ -317,9 +311,12 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, size_t W, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> output_grad_tv, @@ -329,19 +326,15 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; if(n >= N) return; FLOAT_ACCUM grad = 0; +#pragma unroll for(int32_t r = 0; r < R; 
++r) { +#pragma unroll for(int32_t s = 0; s < S; ++s) { int32_t ohsh = h + ph - r; @@ -403,9 +396,12 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp size_t W, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> output_grad_tv, @@ -419,9 +415,12 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp W, OH, OW, - ksize, - stride, - padding, + R, + S, + sh, + sw, + ph, + pw, count_include_pad, divisor_override, output_grad_tv, @@ -439,9 +438,15 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<5> output_grad_tv, @@ -452,20 +457,12 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; if(n >= N) return; FLOAT_ACCUM grad = 0; +#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) @@ -543,9 +540,15 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<5> 
output_grad_tv, @@ -561,9 +564,15 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp OD, OH, OW, - ksize, - stride, - padding, + KD, + R, + S, + sd, + sh, + sw, + pd, + ph, + pw, count_include_pad, divisor_override, output_grad_tv, diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index 4fe9d5bc76..c5ed51dc27 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -81,10 +81,10 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, { return false; } - if(!IsOverRocm(problem)) - { - return false; - } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -135,9 +135,12 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KH, + params.KW, + params.SH, + params.SW, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, output_grad_tv, diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 6897097955..96adbb2e46 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -43,6 +43,42 @@ namespace solver { namespace avgpool { +bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if((in_over_out < 8 && in_over_out > 1) || (in_over_out < 2 && in_nelems <= 5971968)) + { + return true; + } + return false; + } + else if(dtype == miopenHalf) + { + if((in_over_out < 2 && mul_nc < 8192) || + (8 > 
in_over_out && in_over_out > 7 && out_nelems >= 32401152)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if((7 < in_over_out && in_over_out < 8 && in_nelems >= 944111616) || + (in_over_out < 2 && in_nelems >= 4194304)) + { + return true; + } + } + return false; +} + bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { @@ -51,6 +87,10 @@ bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, { return false; } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -105,9 +145,15 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KD, + params.KH, + params.KW, + params.SD, + params.SH, + params.SW, + params.PD, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, output_grad_tv, diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 3e70264097..ebc5c4b956 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -83,10 +83,10 @@ bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, { return false; } - if(!IsOverRocm(problem)) - { - return false; - } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -137,9 +137,12 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KH, + params.KW, + params.SH, + params.SW, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, input_tv, diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index 088aac6dca..32a24d47bb 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -43,6 +43,38 @@ namespace solver { namespace avgpool { +bool 
IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +{ + auto dtype = problem.GetOutputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if(in_over_out < 8 || in_over_out >= 262144) + { + return true; + } + } + else if(dtype == miopenHalf) + { + if(in_nelems >= 201326592 || (in_over_out < 2 && mul_nc < 8192)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if((out_nelems >= 5971968 && in_over_out < 2) || out_nelems >= 74088000) + { + return true; + } + } + return false; +} + bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { @@ -50,6 +82,10 @@ bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, { return false; } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -104,9 +140,15 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KD, + params.KH, + params.KW, + params.SD, + params.SH, + params.SW, + params.PD, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, input_tv, From 930d47e02a4573ac52713238704794d4228b7fb8 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 20 Aug 2024 18:42:22 +0700 Subject: [PATCH 08/38] fix gtest --- src/kernels/MIOpenAvgPool.cpp | 4 - src/solver/avgpool/backward_avgpool_2d.cpp | 12 +-- src/solver/avgpool/backward_avgpool_3d.cpp | 22 +++--- src/solver/avgpool/forward_avgpool_2d.cpp | 14 ++-- src/solver/avgpool/forward_avgpool_3d.cpp | 19 +++-- test/gtest/avgpool.cpp | 24 ++++-- test/gtest/avgpool.hpp | 89 +++++++++++++++------- 7 files changed, 114 insertions(+), 70 deletions(-) diff --git 
a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index 32ac270b37..d17dcc38ff 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -72,7 +72,6 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, #pragma unroll for(int32_t r = 0; r < R; ++r) { -#pragma unroll for(int32_t s = 0; s < S; ++s) { // input idx : (n, c, h, w) @@ -194,7 +193,6 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, if(n >= N) return; FLOAT_ACCUM sum = 0; -#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) @@ -334,7 +332,6 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, #pragma unroll for(int32_t r = 0; r < R; ++r) { -#pragma unroll for(int32_t s = 0; s < S; ++s) { int32_t ohsh = h + ph - r; @@ -462,7 +459,6 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, return; FLOAT_ACCUM grad = 0; -#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index c5ed51dc27..73adabb8e7 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -43,7 +43,7 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd2d(const miopen::avgpool::BwdProblemDescription& problem) { auto dtype = problem.GetInputGradDesc().GetType(); auto in_nelems = problem.GetInputGradDesc().GetElementSize(); @@ -73,7 +73,7 @@ bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) return false; } -bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, +bool AvgPoolBackward2d::IsApplicable(const ExecutionContext&, const miopen::avgpool::BwdProblemDescription& problem) const { if(problem.GetInputGradDesc().GetNumDims() != 4 || @@ -81,10 +81,10 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& 
context, { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmBwd2d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 96adbb2e46..4815803ad3 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -43,18 +43,19 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd3d(const miopen::avgpool::BwdProblemDescription& problem) { auto dtype = problem.GetInputGradDesc().GetType(); auto in_nelems = problem.GetInputGradDesc().GetElementSize(); auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); auto mul_nc = problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto N = problem.GetOutputGradDesc().GetLengths()[0]; auto in_over_out = static_cast(in_nelems) / out_nelems; if(dtype == miopenFloat) { - if((in_over_out < 8 && in_over_out > 1) || (in_over_out < 2 && in_nelems <= 5971968)) + if((in_over_out < 2 && out_nelems <= 12582912) || (in_over_out <= 8 && N >= 6)) { return true; } @@ -62,16 +63,15 @@ bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) } else if(dtype == miopenHalf) { - if((in_over_out < 2 && mul_nc < 8192) || - (8 > in_over_out && in_over_out > 7 && out_nelems >= 32401152)) + if((in_over_out < 2 && mul_nc < 8192) || (8 > in_over_out && out_nelems >= 29052108)) { return true; } } else if(dtype == miopenBFloat16) { - if((7 < in_over_out && in_over_out < 8 && in_nelems >= 944111616) || - (in_over_out < 2 && in_nelems >= 4194304)) + if((1 <= in_over_out && in_over_out < 2 && in_nelems >= 4194304) || + (in_over_out <= 8 && in_nelems >= 944111616)) { return true; } @@ -79,7 +79,7 @@ bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) return false; } -bool AvgPoolBackward3d::IsApplicable(const 
ExecutionContext& context, +bool AvgPoolBackward3d::IsApplicable(const ExecutionContext&, const miopen::avgpool::BwdProblemDescription& problem) const { if(problem.GetInputGradDesc().GetNumDims() != 5 || @@ -87,10 +87,10 @@ bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmBwd3d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index ebc5c4b956..1c51feb54b 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -44,7 +44,7 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd2d(const miopen::avgpool::FwdProblemDescription& problem) { auto dtype = problem.GetOutputDesc().GetType(); auto in_nelems = problem.GetInputDesc().GetElementSize(); @@ -68,7 +68,7 @@ bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) } else if(dtype == miopenBFloat16) { - if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 6000000) + if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) { return true; } @@ -76,17 +76,17 @@ bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) return false; } -bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, +bool AvgPoolForward2d::IsApplicable(const ExecutionContext&, const miopen::avgpool::FwdProblemDescription& problem) const { if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmFwd2d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index 32a24d47bb..6f70a07419 100644 --- 
a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -43,17 +43,22 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem) { auto dtype = problem.GetOutputDesc().GetType(); auto in_nelems = problem.GetInputDesc().GetElementSize(); auto out_nelems = problem.GetOutputDesc().GetElementSize(); auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto N = problem.GetOutputDesc().GetLengths()[0]; auto in_over_out = static_cast(in_nelems) / out_nelems; + std::cout << "in_over_out: " << in_over_out << std::endl; + std::cout << "in_nelems: " << in_nelems << std::endl; + std::cout << "out_nelems: " << out_nelems << std::endl; + if(dtype == miopenFloat) { - if(in_over_out < 8 || in_over_out >= 262144) + if(in_over_out < 2 || in_over_out >= 262144 || (out_nelems >= 10125000 && N > 4)) { return true; } @@ -75,17 +80,17 @@ bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) return false; } -bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, +bool AvgPoolForward3d::IsApplicable(const ExecutionContext&, const miopen::avgpool::FwdProblemDescription& problem) const { if(problem.GetInputDesc().GetNumDims() != 5 || problem.GetOutputDesc().GetNumDims() != 5) { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmFwd3d(problem)) + { + return false; + } return true; } diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp index fa002e5610..3ab32be510 100644 --- a/test/gtest/avgpool.cpp +++ b/test/gtest/avgpool.cpp @@ -111,9 +111,15 @@ TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) } }; -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); 
-INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_FP32, + testing::ValuesIn(AvgPoolTestConfigsFwdFp32())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_FP16, + testing::ValuesIn(AvgPoolTestConfigsFwdFp16())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_BFP16, + testing::ValuesIn(AvgPoolTestConfigsFwdBfp16())); // BACKWARD TEST TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) @@ -158,6 +164,12 @@ TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) } }; -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_FP32, + testing::ValuesIn(AvgPoolTestConfigsBwdFp32())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_FP16, + testing::ValuesIn(AvgPoolTestConfigsBwdFp16())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_BFP16, + testing::ValuesIn(AvgPoolTestConfigsBwdBfp16())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index 26548e0a12..fca812357d 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -70,25 +70,50 @@ struct AvgPoolTestCase std::vector GetInput() const { return input_dims; } }; -inline std::vector AvgPoolTestConfigs() +inline std::vector AvgPoolTestConfigsFwdFp32() { return { - {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, true, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 1}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 1}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 1}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, 
true, true, 1}, - {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 1}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 1}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 1}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 1}, + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsFwdFp16() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsFwdBfp16() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdFp32() +{ + return { + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdFp16() +{ + return { + {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdBfp16() +{ + return { + {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, }; } @@ -212,12 +237,15 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam input_dev.get(), output.desc, output_dev.get(), - stride.desc, - stride_dev.get(), - padding.desc, - padding_dev.get(), - ksize.desc, - ksize_dev.get(), + 
ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 0, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? padding[2] : padding[1], count_include_pad, divisor_override); fflush(stdout); @@ -377,12 +405,15 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam output_grad_dev.get(), input_grad.desc, input_grad_dev.get(), - stride.desc, - stride_dev.get(), - padding.desc, - padding_dev.get(), - ksize.desc, - ksize_dev.get(), + ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 0, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? 
padding[2] : padding[1], count_include_pad, divisor_override); From 5a357389c31287ebfdf57893d9a5046e08cce8a0 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 21 Aug 2024 17:48:10 +0700 Subject: [PATCH 09/38] passed gtest --- src/kernels/MIOpenAvgPool.cpp | 2 -- test/gtest/avgpool.hpp | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index d17dcc38ff..76355d5729 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -69,7 +69,6 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, return; FLOAT_ACCUM m = 0; -#pragma unroll for(int32_t r = 0; r < R; ++r) { for(int32_t s = 0; s < S; ++s) @@ -329,7 +328,6 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, return; FLOAT_ACCUM grad = 0; -#pragma unroll for(int32_t r = 0; r < R; ++r) { for(int32_t s = 0; s < S; ++s) diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index fca812357d..94898d32b6 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -105,7 +105,7 @@ inline std::vector AvgPoolTestConfigsBwdFp16() { return { {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + {{6, 288, 35, 35, 35}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0}, }; } @@ -113,7 +113,7 @@ inline std::vector AvgPoolTestConfigsBwdBfp16() { return { {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, }; } From 27470a21c4f33a9426cc635e791f2b02ba6dc7ac Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 30 Sep 2024 13:38:02 +0700 Subject: [PATCH 10/38] skeleton code --- docs/reference/index.rst | 2 +- include/miopen/miopen.h | 48 +- src/CMakeLists.txt | 16 +- src/adaptiveavgpool.cpp | 94 +++ .../problem_description.cpp | 15 +- 
src/adaptiveavgpool_api.cpp | 113 ++++ src/avgpool.cpp | 136 ----- src/avgpool_api.cpp | 190 ------ src/include/miopen/adaptiveavgpool.hpp | 50 ++ .../invoke_params.hpp | 30 +- .../problem_description.hpp | 45 +- .../miopen/adaptiveavgpool/solvers.hpp | 159 +++++ src/include/miopen/avgpool.hpp | 71 --- src/include/miopen/avgpool/solvers.hpp | 113 ---- src/include/miopen/solver_id.hpp | 2 +- src/kernels/MIOpenAdaptiveAvgPool.cpp | 404 ++++++++++++ src/kernels/MIOpenAvgPool.cpp | 574 ------------------ src/solver.cpp | 25 + .../backward_adaptiveavgpool_1d.cpp} | 39 +- .../backward_adaptiveavgpool_2d.cpp | 153 +++++ .../backward_adaptiveavgpool_3d.cpp} | 42 +- .../forward_adaptiveavgpool_1d.cpp | 145 +++++ .../forward_adaptiveavgpool_2d.cpp} | 50 +- .../forward_adaptiveavgpool_3d.cpp} | 55 +- 24 files changed, 1251 insertions(+), 1320 deletions(-) create mode 100644 src/adaptiveavgpool.cpp rename src/{avgpool => adaptiveavgpool}/problem_description.cpp (90%) create mode 100644 src/adaptiveavgpool_api.cpp delete mode 100644 src/avgpool.cpp delete mode 100644 src/avgpool_api.cpp create mode 100644 src/include/miopen/adaptiveavgpool.hpp rename src/include/miopen/{avgpool => adaptiveavgpool}/invoke_params.hpp (71%) rename src/include/miopen/{avgpool => adaptiveavgpool}/problem_description.hpp (68%) create mode 100644 src/include/miopen/adaptiveavgpool/solvers.hpp delete mode 100644 src/include/miopen/avgpool.hpp delete mode 100644 src/include/miopen/avgpool/solvers.hpp create mode 100644 src/kernels/MIOpenAdaptiveAvgPool.cpp delete mode 100644 src/kernels/MIOpenAvgPool.cpp rename src/solver/{avgpool/backward_avgpool_2d.cpp => adaptiveavgpool/backward_adaptiveavgpool_1d.cpp} (81%) create mode 100644 src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp rename src/solver/{avgpool/backward_avgpool_3d.cpp => adaptiveavgpool/backward_adaptiveavgpool_3d.cpp} (81%) create mode 100644 src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp rename 
src/solver/{avgpool/forward_avgpool_2d.cpp => adaptiveavgpool/forward_adaptiveavgpool_2d.cpp} (76%) rename src/solver/{avgpool/forward_avgpool_3d.cpp => adaptiveavgpool/forward_adaptiveavgpool_3d.cpp} (76%) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 9594e00ef0..d715ccef25 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -35,4 +35,4 @@ The MIOpen API library is structured as follows: * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) - * :doc:`AvgPool <../doxygen/html/group__avgpool>` (experimental) + * :doc:`AdaptiveAvgPool <../doxygen/html/group__adaptiveavgpool>` (experimental) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index ea44de92d5..57aeeb5d3b 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -70,7 +70,7 @@ * @defgroup SGD * @defgroup getitem * @defgroup ReduceCalculation - * @defgroup avgpool + * @defgroup adaptiveavgpool * */ @@ -7623,73 +7623,43 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, #endif // MIOPEN_BETA_API #ifdef MIOPEN_BETA_API -// avgpool APIs -/** @addtogroup avgpool +// adaptiveavgpool APIs +/** @addtogroup adaptiveavgpool * * @{ */ -/*! @brief Execute an avgpool forward layer +/*! 
@brief Execute an adaptiveavgpool forward layer * * @param handle MIOpen handle (input) * @param inputDesc Tensor descriptor for input tensor (input) * @param input Data tensor input (input) * @param outputDesc Tensor descriptor for output tensor (input) * @param output Data tensor output (output) - * @param count_include_pad When True, will include the zero-padding in the averaging - * calculation (input) - * @param divisor_override If non-zero, will use this value as the divisor, otherwise will - * use the number of elements in the pooling window (input) * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const miopenTensorDescriptor_t inputDesc, const void* input, const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override); - -/*! @brief Execute an avgpool backward layer + void* output); + +/*! 
@brief Execute an adaptiveavgpool backward layer * * @param handle MIOpen handle (input) * @param outputGradDesc Tensor descriptor for output grad tensor (input) * @param output_grad Data tensor output grad (input) * @param inputGradDesc Tensor descriptor for input grad tensor (input) * @param input_grad Data tensor input grad (output) - * @param count_include_pad When True, will include the zero-padding in the averaging - * calculation (input) - * @param divisor_override If non-zero, will use this value as the divisor, otherwise will - * use the number of elements in the pooling window (input) * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const miopenTensorDescriptor_t outputGradDesc, const void* output_grad, const miopenTensorDescriptor_t inputGradDesc, - void* input_grad, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override); + void* input_grad); /** @} */ -// CLOSEOUT avgpool DOXYGEN GROUP +// CLOSEOUT adaptiveavgpool DOXYGEN GROUP #endif // MIOPEN_BETA_API #ifdef __cplusplus diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ee36c92967..ae621b28ad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,8 +89,8 @@ set( MIOpen_Source adam_api.cpp addlayernorm_api.cpp api/find2_0_commons.cpp - avgpool_api.cpp - avgpool/problem_description.cpp + adaptiveavgpool_api.cpp + adaptiveavgpool/problem_description.cpp batch_norm.cpp batch_norm_api.cpp batchnorm/problem_description.cpp @@ -193,10 +193,12 @@ set( MIOpen_Source solver/activ/fwd_1.cpp solver/adam/adam.cpp solver/adam/transformers_adam_w.cpp - solver/avgpool/backward_avgpool_2d.cpp - solver/avgpool/backward_avgpool_3d.cpp - solver/avgpool/forward_avgpool_2d.cpp - solver/avgpool/forward_avgpool_3d.cpp + 
solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp + solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp + solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp + solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp + solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp + solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp @@ -633,7 +635,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN activ.cpp adam.cpp addlayernorm.cpp - avgpool.cpp + adaptiveavgpool.cpp cat.cpp groupnorm.cpp getitem.cpp diff --git a/src/adaptiveavgpool.cpp b/src/adaptiveavgpool.cpp new file mode 100644 index 0000000000..fee382a4d1 --- /dev/null +++ b/src/adaptiveavgpool.cpp @@ -0,0 +1,94 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t AdaptiveAvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output) +{ + const auto problem = adaptiveavgpool::FwdProblemDescription{inputDesc, outputDesc}; + + const auto invoke_params = [&]() { + auto tmp = adaptiveavgpool::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + + tmp.input = input; + tmp.output = output; + + return tmp; + }(); + const auto algo = AlgorithmName{"AdaptiveAvgPoolForward"}; + const auto solvers = + solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t AdaptiveAvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad) +{ + const auto problem = adaptiveavgpool::BwdProblemDescription{outputGradDesc, inputGradDesc}; + + const auto invoke_params = [&]() { + auto tmp = adaptiveavgpool::BwdInvokeParams{}; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + + return tmp; + }(); + const auto algo = AlgorithmName{"AdaptiveAvgPoolBackward"}; + const auto solvers = + solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/avgpool/problem_description.cpp 
b/src/adaptiveavgpool/problem_description.cpp similarity index 90% rename from src/avgpool/problem_description.cpp rename to src/adaptiveavgpool/problem_description.cpp index 96ecb4bb72..ec3b9cf636 100644 --- a/src/avgpool/problem_description.cpp +++ b/src/adaptiveavgpool/problem_description.cpp @@ -24,12 +24,13 @@ * *******************************************************************************/ -#include +#include #include +#include namespace miopen { -namespace avgpool { +namespace adaptiveavgpool { inline std::ostream& operator<<(std::ostream& os, const std::vector& v) { @@ -55,14 +56,12 @@ NetworkConfig FwdProblemDescription::MakeNetworkConfig() const std::ostringstream ss; - ss << "avgpool_fwd"; + ss << "adaptiveavgpool_fwd"; ss << "-input_dtype" << input_dtype; ss << "-Is" << input_size; ss << "-Os" << output_size; ss << "-Si" << input_stride; ss << "-So" << output_stride; - ss << "-Cp " << count_include_pad; - ss << "-Do " << divisor_override; return NetworkConfig{ss.str()}; } @@ -78,18 +77,16 @@ NetworkConfig BwdProblemDescription::MakeNetworkConfig() const std::ostringstream ss; - ss << "avgpool_bwd"; + ss << "adaptiveavgpool_bwd"; ss << "-input_dtype" << input_dtype; ss << "-dIs" << input_grad_size; ss << "-dOs" << output_grad_size; ss << "-dSi" << input_grad_stride; ss << "-dSo" << output_grad_stride; - ss << "-Cp " << count_include_pad; - ss << "-Do " << divisor_override; return NetworkConfig{ss.str()}; } -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace miopen diff --git a/src/adaptiveavgpool_api.cpp b/src/adaptiveavgpool_api.cpp new file mode 100644 index 0000000000..a9159258f9 --- /dev/null +++ b/src/adaptiveavgpool_api.cpp @@ -0,0 +1,113 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +static void LogCmdAdaptiveAvgPool(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t oDesc, + const bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "adaptiveavgpoolfp16"; + } + else if(dtype == miopenFloat) + { + ss << "adaptiveavgpoolfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "adaptiveavgpoolbfp16"; + } + + MIOPEN_LOG_FUNCTION(xDesc, oDesc, is_fwd); + ss << " -Is " << miopen::deref(xDesc).GetLengths(); + ss << " -Os " << miopen::deref(oDesc).GetLengths(); + ss << " -Si " << miopen::deref(xDesc).GetStrides(); + ss << " -So " << miopen::deref(oDesc).GetStrides(); + ss << " -F " << ((is_fwd) ? 
"1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenAdaptiveAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output) +{ + MIOPEN_LOG_FUNCTION(handle, inputDesc, input, outputDesc, output); + + LogCmdAdaptiveAvgPool(inputDesc, outputDesc, true); + return miopen::try_([&] { + miopen::AdaptiveAvgPoolForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output)); + }); +} + +extern "C" miopenStatus_t +miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad) +{ + MIOPEN_LOG_FUNCTION(handle, outputGradDesc, output_grad, inputGradDesc, input_grad); + + LogCmdAdaptiveAvgPool(inputGradDesc, outputGradDesc, false); + return miopen::try_([&] { + miopen::AdaptiveAvgPoolBackward(miopen::deref(handle), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(inputGradDesc), + DataCast(input_grad)); + }); +} diff --git a/src/avgpool.cpp b/src/avgpool.cpp deleted file mode 100644 index 323f01c90e..0000000000 --- a/src/avgpool.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include - -namespace miopen { - -miopenStatus_t AvgPoolForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override) -{ - const auto problem = - avgpool::FwdProblemDescription{inputDesc, outputDesc, count_include_pad, divisor_override}; - - const auto invoke_params = [&]() { - auto tmp = avgpool::FwdInvokeParams{}; - tmp.inputDesc = &inputDesc; - tmp.outputDesc = &outputDesc; - - tmp.input = input; - tmp.output = output; - tmp.KD = KD; - tmp.KH = KH; - tmp.KW = KW; - tmp.SD = SD; - tmp.SH = SH; - tmp.SW = SW; - tmp.PD = PD; - tmp.PH = PH; - tmp.PW = PW; - tmp.count_include_pad = count_include_pad; - tmp.divisor_override = divisor_override; - - return tmp; - }(); - const auto algo = AlgorithmName{"AvgPoolForward"}; - const auto solvers = solver::SolverContainer{}; - - solvers.ExecutePrimitive(handle, problem, algo, invoke_params); - - return miopenStatusSuccess; -} - -miopenStatus_t AvgPoolBackward(Handle& handle, - const TensorDescriptor& outputGradDesc, - ConstData_t output_grad, - const TensorDescriptor& inputGradDesc, - Data_t input_grad, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override) -{ - const auto problem = avgpool::BwdProblemDescription{ - outputGradDesc, inputGradDesc, count_include_pad, divisor_override}; - - const auto invoke_params = [&]() { - auto tmp = avgpool::BwdInvokeParams{}; - 
tmp.outputGradDesc = &outputGradDesc; - tmp.inputGradDesc = &inputGradDesc; - - tmp.output_grad = output_grad; - tmp.input_grad = input_grad; - tmp.KD = KD; - tmp.KH = KH; - tmp.KW = KW; - tmp.SD = SD; - tmp.SH = SH; - tmp.SW = SW; - tmp.PD = PD; - tmp.PH = PH; - tmp.PW = PW; - tmp.count_include_pad = count_include_pad; - tmp.divisor_override = divisor_override; - - return tmp; - }(); - const auto algo = AlgorithmName{"AvgPoolBackward"}; - const auto solvers = solver::SolverContainer{}; - - solvers.ExecutePrimitive(handle, problem, algo, invoke_params); - - return miopenStatusSuccess; -} - -} // namespace miopen diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp deleted file mode 100644 index 32e1f12f92..0000000000 --- a/src/avgpool_api.cpp +++ /dev/null @@ -1,190 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#include -#include -#include -#include -#include - -inline std::ostream& operator<<(std::ostream& os, const std::vector& v) -{ - os << '{'; - for(int i = 0; i < v.size(); ++i) - { - if(i != 0) - os << ','; - os << v[i]; - } - os << '}'; - return os; -} - -static void LogCmdAvgPool(const miopenTensorDescriptor_t xDesc, - const miopenTensorDescriptor_t oDesc, - const bool count_include_pad, - const int32_t divisor_override, - const bool is_fwd) -{ - if(miopen::IsLoggingCmd()) - { - std::stringstream ss; - auto dtype = miopen::deref(xDesc).GetType(); - if(dtype == miopenHalf) - { - ss << "avgpoolfp16"; - } - else if(dtype == miopenFloat) - { - ss << "avgpoolfp32"; - } - else if(dtype == miopenBFloat16) - { - ss << "avgpoolbfp16"; - } - - MIOPEN_LOG_FUNCTION(xDesc, oDesc, count_include_pad, divisor_override); - ss << " -Is " << miopen::deref(xDesc).GetLengths(); - ss << " -Os " << miopen::deref(oDesc).GetLengths(); - ss << " -Si " << miopen::deref(xDesc).GetStrides(); - ss << " -So " << miopen::deref(oDesc).GetStrides(); - ss << " -Cp " << count_include_pad; - ss << " -Do " << divisor_override; - ss << " -F " << ((is_fwd) ? 
"1" : "2"); - - MIOPEN_LOG_DRIVER_CMD(ss.str()); - } -} - -extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override) -{ - MIOPEN_LOG_FUNCTION(handle, - inputDesc, - input, - outputDesc, - output, - KD, - KH, - KW, - SD, - SH, - SW, - PD, - PH, - PW, - count_include_pad, - divisor_override); - - LogCmdAvgPool(inputDesc, outputDesc, count_include_pad, divisor_override, true); - return miopen::try_([&] { - miopen::AvgPoolForward(miopen::deref(handle), - miopen::deref(inputDesc), - DataCast(input), - miopen::deref(outputDesc), - DataCast(output), - KD, - KH, - KW, - SD, - SH, - SW, - PD, - PH, - PW, - count_include_pad, - divisor_override); - }); -} - -extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t outputGradDesc, - const void* output_grad, - const miopenTensorDescriptor_t inputGradDesc, - void* input_grad, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override) -{ - MIOPEN_LOG_FUNCTION(handle, - outputGradDesc, - output_grad, - inputGradDesc, - input_grad, - KD, - KH, - KW, - SD, - SH, - SW, - PD, - PH, - PW, - count_include_pad, - divisor_override); - - LogCmdAvgPool(inputGradDesc, outputGradDesc, count_include_pad, divisor_override, false); - return miopen::try_([&] { - miopen::AvgPoolBackward(miopen::deref(handle), - miopen::deref(outputGradDesc), - DataCast(output_grad), - miopen::deref(inputGradDesc), - DataCast(input_grad), - KD, - KH, - KW, - 
SD, - SH, - SW, - PD, - PH, - PW, - count_include_pad, - divisor_override); - }); -} diff --git a/src/include/miopen/adaptiveavgpool.hpp b/src/include/miopen/adaptiveavgpool.hpp new file mode 100644 index 0000000000..9f38a62d94 --- /dev/null +++ b/src/include/miopen/adaptiveavgpool.hpp @@ -0,0 +1,50 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_ADAPTIVEAVGPOOL_HPP_ +#define MIOPEN_ADAPTIVEAVGPOOL_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AdaptiveAvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t +AdaptiveAvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad); +} // namespace miopen +#endif // _MIOPEN_ADAPTIVEAVGPOOL_HPP_ diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/adaptiveavgpool/invoke_params.hpp similarity index 71% rename from src/include/miopen/avgpool/invoke_params.hpp rename to src/include/miopen/adaptiveavgpool/invoke_params.hpp index e8bd9256ac..e97a66a427 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/adaptiveavgpool/invoke_params.hpp @@ -31,7 +31,7 @@ #include namespace miopen { -namespace avgpool { +namespace adaptiveavgpool { struct FwdInvokeParams : public miopen::InvokeParams { @@ -43,19 +43,6 @@ struct FwdInvokeParams : public miopen::InvokeParams ConstData_t input = nullptr; Data_t output = nullptr; - ConstData_t ksize = nullptr; - - int32_t KD = 0; - int32_t KH = 0; - int32_t KW = 0; - int32_t SD = 0; - int32_t SH = 0; - int32_t SW = 0; - int32_t PD = 0; - int32_t PH = 0; - int32_t PW = 0; - bool count_include_pad = false; - int32_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } @@ -71,23 +58,10 @@ struct BwdInvokeParams : public miopen::InvokeParams ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; - ConstData_t ksize = nullptr; - - int32_t KD = 0; - int32_t KH = 0; - int32_t KW = 0; - int32_t SD = 
0; - int32_t SH = 0; - int32_t SW = 0; - int32_t PD = 0; - int32_t PH = 0; - int32_t PW = 0; - bool count_include_pad = false; - int32_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } }; -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace miopen diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/adaptiveavgpool/problem_description.hpp similarity index 68% rename from src/include/miopen/avgpool/problem_description.hpp rename to src/include/miopen/adaptiveavgpool/problem_description.hpp index 2dee6a30ea..53be89cd89 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/adaptiveavgpool/problem_description.hpp @@ -34,33 +34,12 @@ namespace miopen { struct NetworkConfig; -namespace avgpool { +namespace adaptiveavgpool { -struct ProblemDescription : ProblemDescriptionBase +struct FwdProblemDescription : ProblemDescriptionBase { - ProblemDescription(const bool count_include_pad_, const int32_t divisor_override_) - : count_include_pad(count_include_pad_), divisor_override(divisor_override_) - { - if(divisor_override < 0) - { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: divisor_override must be non-negative."); - } - } - -protected: - bool count_include_pad; - int32_t divisor_override; -}; - -struct FwdProblemDescription : ProblemDescription -{ - FwdProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& outputDesc_, - const bool count_include_pad_, - const int32_t divisor_override_) - : ProblemDescription(count_include_pad_, divisor_override_), - inputDesc(inputDesc_), - outputDesc(outputDesc_) + FwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_) + : inputDesc(inputDesc_), outputDesc(outputDesc_) { IsValidLength(); } @@ -77,7 +56,7 @@ struct FwdProblemDescription : ProblemDescription outputDesc.GetLengths().size() != input_dims) { 
MIOPEN_THROW(miopenStatusBadParm, - "AvgPool: Input and output tensor sizes do not match."); + "AdaptiveAvgPool: Input and output tensor sizes do not match."); } return true; @@ -90,15 +69,11 @@ struct FwdProblemDescription : ProblemDescription TensorDescriptor outputDesc; }; -struct BwdProblemDescription : ProblemDescription +struct BwdProblemDescription : ProblemDescriptionBase { BwdProblemDescription(const TensorDescriptor& outputGradDesc_, - const TensorDescriptor& inputGradDesc_, - const bool count_include_pad_, - const int32_t divisor_override_) - : ProblemDescription(count_include_pad_, divisor_override_), - outputGradDesc(outputGradDesc_), - inputGradDesc(inputGradDesc_) + const TensorDescriptor& inputGradDesc_) + : outputGradDesc(outputGradDesc_), inputGradDesc(inputGradDesc_) { IsValidLength(); } @@ -115,7 +90,7 @@ struct BwdProblemDescription : ProblemDescription outputGradDesc.GetLengths().size() != input_dims) { MIOPEN_THROW(miopenStatusBadParm, - "AvgPool: Input grad and output grad tensor sizes do not match."); + "AdaptiveAvgPool: Input grad and output grad tensor sizes do not match."); } return true; @@ -128,6 +103,6 @@ struct BwdProblemDescription : ProblemDescription TensorDescriptor inputGradDesc; }; -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace miopen diff --git a/src/include/miopen/adaptiveavgpool/solvers.hpp b/src/include/miopen/adaptiveavgpool/solvers.hpp new file mode 100644 index 0000000000..25f08f3345 --- /dev/null +++ b/src/include/miopen/adaptiveavgpool/solvers.hpp @@ -0,0 +1,159 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include +#include +#include "miopen/kernel_build_params.hpp" +#include "miopen/kernel_info.hpp" + +namespace miopen { + +namespace solver { + +const auto make_hip_kernel = [](std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) { + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +}; + +namespace adaptiveavgpool { + +using AdaptiveAvgPoolForward = + NonTunableSolverBase; + +using AdaptiveAvgPoolBackward = + NonTunableSolverBase; + +// FORWARD +struct AdaptiveAvgPoolForward1d final : AdaptiveAvgPoolForward +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; +}; + +struct AdaptiveAvgPoolForward2d final : AdaptiveAvgPoolForward +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; +}; + +struct AdaptiveAvgPoolForward3d final : AdaptiveAvgPoolForward +{ + const std::string& SolverDbId() const override + { + return 
GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; +}; + +// BACKWARD +struct AdaptiveAvgPoolBackward1d final : AdaptiveAvgPoolBackward +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; +}; + +struct AdaptiveAvgPoolBackward2d final : AdaptiveAvgPoolBackward +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; +}; + +struct AdaptiveAvgPoolBackward3d final : AdaptiveAvgPoolBackward +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; +}; + +} // namespace adaptiveavgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp deleted file mode 100644 index 00a2717ff6..0000000000 --- a/src/include/miopen/avgpool.hpp +++ /dev/null @@ -1,71 +0,0 @@ 
-/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#include -#ifndef MIOPEN_AVGPOOL_HPP_ -#define MIOPEN_AVGPOOL_HPP_ - -#include - -namespace miopen { - -struct Handle; -struct TensorDescriptor; - -MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - int32_t KD, - int32_t KH, - int32_t KW, - int32_t SD, - int32_t SH, - int32_t SW, - int32_t PD, - int32_t PH, - int32_t PW, - bool count_include_pad, - int32_t divisor_override); - -MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, - const TensorDescriptor& outputGradDesc, - ConstData_t output_grad, - const TensorDescriptor& inputGradDesc, - Data_t input_grad, - int32_t KD, - int32_t KH, - int32_t KW, - int32_t SD, - int32_t SH, - int32_t SW, - int32_t PD, - int32_t PH, - int32_t PW, - bool count_include_pad, - int32_t divisor_override); -} // namespace miopen -#endif // _MIOPEN_AVGPOOL_HPP_ diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp deleted file mode 100644 index 5577b9fad6..0000000000 --- a/src/include/miopen/avgpool/solvers.hpp +++ /dev/null @@ -1,113 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#pragma once - -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include -#include -#include "miopen/kernel_build_params.hpp" -#include "miopen/kernel_info.hpp" - -namespace miopen { - -namespace solver { - -const auto make_hip_kernel = [](std::vector localsize, - std::vector gridsize, - std::string kernel_file, - std::string kernel_name, - KernelBuildParameters build_params) { - while(localsize.size() < 3) - localsize.push_back(1); - while(gridsize.size() < 3) - gridsize.push_back(1); - for(int i = 0; i < localsize.size(); ++i) - gridsize[i] = AlignUp(gridsize[i], localsize[i]); - return KernelInfo{ - build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; -}; - -namespace avgpool { - -using AvgPoolForward = - NonTunableSolverBase; - -using AvgPoolBackward = - NonTunableSolverBase; - -// FORWARD -struct AvgPoolForward2d final : AvgPoolForward -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const override; - - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const override; -}; - -struct AvgPoolForward3d final : AvgPoolForward -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const override; - - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const override; -}; - -// BACKWARD -struct AvgPoolBackward2d final : AvgPoolBackward -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& 
problem) const override; - - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& problem) const override; -}; - -struct AvgPoolBackward3d final : AvgPoolBackward -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& problem) const override; - - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& problem) const override; -}; - -} // namespace avgpool - -} // namespace solver - -} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 194afd79ac..25fc7aad16 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -60,7 +60,7 @@ enum class Primitive Softmax, Adam, Item, - AvgPool + AdaptiveAvgPool }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/kernels/MIOpenAdaptiveAvgPool.cpp b/src/kernels/MIOpenAdaptiveAvgPool.cpp new file mode 100644 index 0000000000..d29a03ab1d --- /dev/null +++ b/src/kernels/MIOpenAdaptiveAvgPool.cpp @@ -0,0 +1,404 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +#ifndef INPUT_TYPE +#define INPUT_TYPE float +#endif + +#ifndef OUTPUT_TYPE +#define OUTPUT_TYPE float +#endif + +template +__device__ void avgPoolForward1d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> input_tv, + tensor_view_t<3> output_tv) +{ + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t nc = gid / OH, oh = gid % OH; + size_t n = nc / C, c = nc % C; + if(n >= N) + return; + + int32_t h = (int32_t)floor((float)(oh * H) / OH); + int32_t kh = (int32_t)ceil((float)((oh + 1) * H) / OH) - h; + + DTYPE_ACCURATE sum = 0; + for(int ih = h; ih < (h + kh); ++ih) + { + sum += GET_3D_VAL_AT(input, n, c, ih); + } + + SET_3D_VAL_AT(output, n, c, oh, sum / kh); +} +extern "C" __global__ void AvgPoolForward1d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> input_tv, + tensor_view_t<3> output_tv) +{ + avgPoolForward1d(input, output, N, C, H, OH, input_tv, output_tv); +} + +template +__device__ void avgPoolBackward1d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> output_grad_tv, + tensor_view_t<3> 
input_grad_tv) +{ + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t nc = gid / H, h = gid % H; + size_t n = nc / C, c = nc % C; + if(n >= N) + return; + + int32_t oh = (int32_t)floor((float)(h * OH) / H); + int32_t koh = (int32_t)ceil((float)((h + 1) * OH) / H) - oh; + + DTYPE_ACCURATE grad = 0; + for(int ih = oh; ih < (oh + koh); ++ih) + { + int32_t kh = + (int32_t)ceil((float)((ih + 1) * H) / OH) - (int32_t)floor((float)(ih * H) / OH); + grad += GET_3D_VAL_AT(output_grad, n, c, ih) / kh; + } + + SET_3D_VAL_AT(input_grad, n, c, h, grad); +} +extern "C" __global__ void AvgPoolBackward1d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> output_grad_tv, + tensor_view_t<3> input_grad_tv) +{ + avgPoolBackward1d( + output_grad, input_grad, N, C, H, OH, output_grad_tv, input_grad_tv); +} + +template +__device__ void avgPoolForward2d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t h = (size_t)floor((float)(oh * H) / OH); + size_t kh = (size_t)ceil((float)((oh + 1) * H) / OH) - h; + + size_t w = (size_t)floor((float)(ow * W) / OW); + size_t kw = (size_t)ceil((float)((ow + 1) * W) / OW) - w; + + FSTYPE divider = (FSTYPE)(kh * kw); + FSTYPE sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) + { + for(size_t iw = w; iw < (w + kw); ++iw) + { + sum += GET_4D_VAL_AT(input, n, c, ih, iw); + } + } + + SET_4D_VAL_AT(output, n, c, oh, ow, sum / divider); + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* 
__restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + avgPoolForward2d( + input, output, N, C, H, W, OH, OW, input_tv, output_tv); +} + +template +__device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t oh = (size_t)floor((float)(h * OH) / H); + size_t koh = (size_t)ceil((float)((h + 1) * OH) / H) - oh; + + size_t ow = (size_t)floor((float)(w * OW) / W); + size_t kow = (size_t)ceil((float)((w + 1) * OW) / W) - ow; + + FLOAT_ACCUM grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = (size_t)ceil((float)((ih + 1) * H) / OH) - (size_t)floor((float)(ih * H) / OH); + for(size_t iw = ow; iw < (ow + kow); ++iw) + { + size_t kw = + (size_t)ceil((float)((iw + 1) * W) / OW) - (size_t)floor((float)(iw * W) / OW); + grad += (FSTYPE)(GET_4D_VAL_AT(output_grad, n, c, ih, iw)) / (kh * kw); + } + } + + SET_4D_VAL_AT(input_grad, n, c, h, w, grad); + + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + avgPoolBackward2d( + output_grad, input_grad, N, C, H, W, OH, OW, output_grad_tv, input_grad_tv); +} + +// __kernel void AdaptiveAvgpool2dBackward1x1OutputNHWC(const __global DTYPE_PTR output_grad, +// __global 
DTYPE_PTR input_grad, +// const int32_t N, +// const int32_t C, +// const int32_t HW, +// const int32_t output_grad_off, +// const int32_t input_grad_off) +// { +// /* VSIZE 2 and 16 is fastest but don't know why */ +// #define VSIZE 2 +// size_t gid = get_global_id(0) * VSIZE; +// size_t c = gid % C; +// size_t n = gid / C; +// if(n >= N) +// return; + +// __global DTYPE_VEC_PTR(VSIZE) output_grad_vec = +// (__global DTYPE_VEC_PTR(VSIZE))(output_grad + n * C + c + output_grad_off); + +// DTYPE_VEC(VSIZE) output_grad_v = GET(output_grad_vec, 0) / HW; + +// __global DTYPE_VEC_PTR(VSIZE) input_grad_vec = +// (__global DTYPE_VEC_PTR(VSIZE))(input_grad + n * C * HW + c + input_grad_off); + +// for(size_t i = 0; i < HW; ++i) +// { +// SET(input_grad_vec, i * C / VSIZE, output_grad_v); +// } +// #undef VSIZE +// } + +template +__device__ void avgPoolForward3d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + int32_t d = (int32_t)floor((float)(od * D) / OD); + int32_t kd = (int32_t)ceil((float)((od + 1) * D) / OD) - d; + + int32_t h = (int32_t)floor((float)(oh * H) / OH); + int32_t kh = (int32_t)ceil((float)((oh + 1) * H) / OH) - h; + + int32_t w = (int32_t)floor((float)(ow * W) / OW); + int32_t kw = (int32_t)ceil((float)((ow + 1) * W) / OW) - w; + + DTYPE_ACCURATE sum = 0; + for(int32_t id = d; id < (d + kd); ++id) + { + for(int32_t ih = h; ih < (h + kh); ++ih) + { + for(int32_t iw = w; iw < (w + kw); ++iw) + { + sum += GET_5D_VAL_AT(input, n, c, id, ih, iw); + } + } + } + + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + 
CVT_ACCUM2FLOAT(sum / (kd * kh * kw));
+    SET_5D_VAL_AT(output, n, c, od, oh, ow, sum / (kd * kh * kw));
+}
+
+extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input,
+                                            OUTPUT_TYPE* __restrict__ output,
+                                            size_t N,
+                                            size_t C,
+                                            size_t D,
+                                            size_t H,
+                                            size_t W,
+                                            size_t OD,
+                                            size_t OH,
+                                            size_t OW,
+                                            tensor_view_t<5> input_tv,
+                                            tensor_view_t<5> output_tv)
+{
+    avgPoolForward3d(
+        input, output, N, C, D, H, W, OD, OH, OW, input_tv, output_tv);
+}
+
+template <typename TI, typename TO>
+__device__ void avgPoolBackward3d(const TI* __restrict__ output_grad,
+                                  TO* __restrict__ input_grad,
+                                  size_t N,
+                                  size_t C,
+                                  size_t D,
+                                  size_t H,
+                                  size_t W,
+                                  size_t OD,
+                                  size_t OH,
+                                  size_t OW,
+                                  tensor_view_t<5> output_grad_tv,
+                                  tensor_view_t<5> input_grad_tv)
+{
+    int32_t gid = threadIdx.x + blockIdx.x * blockDim.x;
+    int32_t ncdh = gid / W, w = gid % W;
+    int32_t ncd = ncdh / H, h = ncdh % H;
+    int32_t nc = ncd / D, d = ncd % D;
+    int32_t n = nc / C, c = nc % C;
+
+    if(n >= N)
+        return;
+
+    int32_t od = (int32_t)floor((float)(d * OD) / D);
+    int32_t kod = (int32_t)ceil((float)((d + 1) * OD) / D) - od;
+
+    int32_t oh = (int32_t)floor((float)(h * OH) / H);
+    int32_t koh = (int32_t)ceil((float)((h + 1) * OH) / H) - oh;
+
+    int32_t ow = (int32_t)floor((float)(w * OW) / W);
+    int32_t kow = (int32_t)ceil((float)((w + 1) * OW) / W) - ow;
+
+    DTYPE_ACCURATE grad = 0;
+    for(int32_t id = od; id < (od + kod); ++id)
+    {
+        int32_t kd =
+            (int32_t)ceil((float)((id + 1) * D) / OD) - (int32_t)floor((float)(id * D) / OD);
+        for(int32_t ih = oh; ih < (oh + koh); ++ih)
+        {
+            int32_t kh =
+                (int32_t)ceil((float)((ih + 1) * H) / OH) - (int32_t)floor((float)(ih * H) / OH);
+            for(int32_t iw = ow; iw < (ow + kow); ++iw)
+            {
+                int32_t kw = (int32_t)ceil((float)((iw + 1) * W) / OW) -
+                             (int32_t)floor((float)(iw * W) / OW);
+                grad += GET_5D_VAL_AT(output_grad, n, c, id, ih, iw) / (kd * kh * kw);
+            }
+        }
+    }
+
+    input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] =
+        CVT_ACCUM2FLOAT(grad); 
+} + +extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + avgPoolBackward3d( + output_grad, input_grad, N, C, D, H, W, OD, OH, OW, output_grad_tv, input_grad_tv); +} diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp deleted file mode 100644 index 76355d5729..0000000000 --- a/src/kernels/MIOpenAvgPool.cpp +++ /dev/null @@ -1,574 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#include -#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS -#include -#include -#endif - -#include "float_types.h" -#include "tensor_view.hpp" - -#ifndef INPUT_TYPE -#define INPUT_TYPE float -#endif - -#ifndef OUTPUT_TYPE -#define OUTPUT_TYPE float -#endif - -template -__device__ void avgPoolForward2d(const TI* __restrict__ input, - TO* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<4> input_tv, - tensor_view_t<4> output_tv) -{ - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncoh = gid / OW, ow = gid % OW; - int32_t nc = ncoh / OH, oh = ncoh % OH; - int32_t n = nc / C, c = nc % C; - - if(n >= N) - return; - - FLOAT_ACCUM m = 0; - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, h, w) - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - m += CVT_FLOAT2ACCUM( - input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); - } - } - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - FLOAT_ACCUM val = m / divide_factor; - - output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, 
oh, ow))] = CVT_ACCUM2FLOAT(val); -} - -extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<4> input_tv, - tensor_view_t<4> output_tv) -{ - avgPoolForward2d(input, - output, - N, - C, - H, - W, - OH, - OW, - R, - S, - sh, - sw, - ph, - pw, - count_include_pad, - divisor_override, - input_tv, - output_tv); -} - -template -__device__ void avgPoolForward3d(const TI* __restrict__ input, - TO* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<5> input_tv, - tensor_view_t<5> output_tv) -{ - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncodoh = gid / OW, ow = gid % OW; - int32_t ncod = ncodoh / OH, oh = ncodoh % OH; - int32_t nc = ncod / OD, od = ncod % OD; - int32_t n = nc / C, c = nc % C; - - if(n >= N) - return; - FLOAT_ACCUM sum = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, d, h, w) - int32_t d = od * sd - pd + kd; - if(d < 0 || d >= D) - continue; - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - sum += CVT_FLOAT2ACCUM( - input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); - } - } - } - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = 
min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - FLOAT_ACCUM val = sum / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = - CVT_ACCUM2FLOAT(val); -} - -extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<5> input_tv, - tensor_view_t<5> output_tv) -{ - avgPoolForward3d(input, - output, - N, - C, - D, - H, - W, - OD, - OH, - OW, - KD, - R, - S, - sd, - sh, - sw, - pd, - ph, - pw, - count_include_pad, - divisor_override, - input_tv, - output_tv); -} - -template -__device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, - TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<4> output_grad_tv, - tensor_view_t<4> input_grad_tv) -{ - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t nch = gid / W, w = gid % W; - int32_t nc = nch / H, h = nch % H; - int32_t n = nc / C, c = nc % C; - - if(n >= N) - return; - - FLOAT_ACCUM grad = 0; - for(int32_t r = 
0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - - grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<4>(n, c, oh, ow))]) / - divide_factor; - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = - CVT_ACCUM2FLOAT(grad); -} - -extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, - OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<4> output_grad_tv, - tensor_view_t<4> input_grad_tv) -{ - avgPoolBackward2d(output_grad, - input_grad, - N, - C, - H, - W, - OH, - OW, - R, - S, - sh, - sw, - ph, - pw, - count_include_pad, - divisor_override, - output_grad_tv, - input_grad_tv); -} - -template -__device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, - TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - 
int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<5> output_grad_tv, - tensor_view_t<5> input_grad_tv) -{ - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncdh = gid / W, w = gid % W; - int32_t ncd = ncdh / H, h = ncdh % H; - int32_t nc = ncd / D, d = ncd % D; - int32_t n = nc / C, c = nc % C; - - if(n >= N) - return; - - FLOAT_ACCUM grad = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t odsd = d + pd - kd; - if(odsd % sd != 0) - continue; - int32_t od = odsd / sd; - if(od < 0 || od >= OD) - continue; - - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<5>(n, c, od, oh, ow))]) / - divide_factor; - } - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = - CVT_ACCUM2FLOAT(grad); -} - -extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* 
__restrict__ output_grad, - OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<5> output_grad_tv, - tensor_view_t<5> input_grad_tv) -{ - avgPoolBackward3d(output_grad, - input_grad, - N, - C, - D, - H, - W, - OD, - OH, - OW, - KD, - R, - S, - sd, - sh, - sw, - pd, - ph, - pw, - count_include_pad, - divisor_override, - output_grad_tv, - input_grad_tv); -} diff --git a/src/solver.cpp b/src/solver.cpp index 6b451ca498..a20ebd6b6e 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include "miopen/adaptiveavgpool/solvers.hpp" #include #include @@ -673,6 +674,30 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), miopenConvolutionAlgoWinograd); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolForward1d{}.SolverDbId()); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolForward2d{}.SolverDbId()); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolForward3d{}.SolverDbId()); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolBackward1d{}.SolverDbId()); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolBackward2d{}.SolverDbId()); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolBackward3d{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! 
} diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp similarity index 81% rename from src/solver/avgpool/backward_avgpool_2d.cpp rename to src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp index 73adabb8e7..1afb78de45 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp @@ -28,11 +28,11 @@ #include "miopen/execution_context.hpp" #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" -#include +#include -#include +#include #include -#include +#include #include #define LOCAL_SIZE_BWD_2D 256 @@ -41,9 +41,9 @@ namespace miopen { namespace solver { -namespace avgpool { +namespace adaptiveavgpool { -bool IsOverRocmBwd2d(const miopen::avgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { auto dtype = problem.GetInputGradDesc().GetType(); auto in_nelems = problem.GetInputGradDesc().GetElementSize(); @@ -73,8 +73,8 @@ bool IsOverRocmBwd2d(const miopen::avgpool::BwdProblemDescription& problem) return false; } -bool AvgPoolBackward2d::IsApplicable(const ExecutionContext&, - const miopen::avgpool::BwdProblemDescription& problem) const +bool AdaptiveAvgPoolBackward2d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { if(problem.GetInputGradDesc().GetNumDims() != 4 || problem.GetOutputGradDesc().GetNumDims() != 4) @@ -88,9 +88,9 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext&, return true; } -ConvSolution -AvgPoolBackward2d::GetSolution(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& problem) const +ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { std::ignore = context; @@ -108,12 +108,15 @@ 
AvgPoolBackward2d::GetSolution(const ExecutionContext& context, {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_BWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward2d", build_params)); + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_2D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolBackward2d", + build_params)); result.invoker_factory = [](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); decltype(auto) kernel = handle_.Run(kernels.front()); @@ -135,14 +138,6 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.KH, - params.KW, - params.SH, - params.SW, - params.PH, - params.PW, - params.count_include_pad, - params.divisor_override, output_grad_tv, input_grad_tv); }; @@ -151,7 +146,7 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, return result; } -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace solver diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp new file mode 100644 index 0000000000..1afb78de45 --- /dev/null +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp @@ -0,0 +1,153 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_2D 256 + +namespace miopen { + +namespace solver { + +namespace adaptiveavgpool { + +bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + return false; + } + else if(dtype == miopenHalf) + { + if(in_over_out < 2 && in_nelems >= 11075584) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + { + return true; + } + } + return false; +} + +bool AdaptiveAvgPoolBackward2d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const +{ + if(problem.GetInputGradDesc().GetNumDims() != 4 || + problem.GetOutputGradDesc().GetNumDims() != 4) + { + return false; + } + if(!IsOverRocmBwd2d(problem)) + { + return false; + } + return true; +} + +ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = 
problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_2D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolBackward2d", + build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto H = deref(params.inputGradDesc).GetLengths()[2]; + auto W = deref(params.inputGradDesc).GetLengths()[3]; + auto OH = deref(params.outputGradDesc).GetLengths()[2]; + auto OW = deref(params.outputGradDesc).GetLengths()[3]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + H, + W, + OH, + OW, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace adaptiveavgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp similarity index 81% rename from src/solver/avgpool/backward_avgpool_3d.cpp rename to src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp index 4815803ad3..51d815e281 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ 
b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp @@ -28,11 +28,11 @@ #include "miopen/execution_context.hpp" #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" -#include +#include -#include +#include #include -#include +#include #include #define LOCAL_SIZE_BWD_3D 256 @@ -41,9 +41,9 @@ namespace miopen { namespace solver { -namespace avgpool { +namespace adaptiveavgpool { -bool IsOverRocmBwd3d(const miopen::avgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd3d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { auto dtype = problem.GetInputGradDesc().GetType(); auto in_nelems = problem.GetInputGradDesc().GetElementSize(); @@ -79,8 +79,8 @@ bool IsOverRocmBwd3d(const miopen::avgpool::BwdProblemDescription& problem) return false; } -bool AvgPoolBackward3d::IsApplicable(const ExecutionContext&, - const miopen::avgpool::BwdProblemDescription& problem) const +bool AdaptiveAvgPoolBackward3d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { if(problem.GetInputGradDesc().GetNumDims() != 5 || problem.GetOutputGradDesc().GetNumDims() != 5) @@ -94,9 +94,9 @@ bool AvgPoolBackward3d::IsApplicable(const ExecutionContext&, return true; } -ConvSolution -AvgPoolBackward3d::GetSolution(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& problem) const +ConvSolution AdaptiveAvgPoolBackward3d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { std::ignore = context; @@ -114,12 +114,15 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_BWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward3d", build_params)); + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_3D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolBackward3d", + build_params)); result.invoker_factory = [](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); decltype(auto) kernel = handle_.Run(kernels.front()); @@ -145,17 +148,6 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.KD, - params.KH, - params.KW, - params.SD, - params.SH, - params.SW, - params.PD, - params.PH, - params.PW, - params.count_include_pad, - params.divisor_override, output_grad_tv, input_grad_tv); }; @@ -164,7 +156,7 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, return result; } -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace solver diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp new file mode 100644 index 0000000000..85bb5747f3 --- /dev/null +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp @@ -0,0 +1,145 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_FWD_2D 256 + +namespace miopen { + +namespace solver { + +namespace adaptiveavgpool { + +bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) +{ + auto dtype = problem.GetOutputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) + { + return true; + } + } + else if(dtype == miopenHalf) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) + { + return true; + } + } + return false; +} + +bool AdaptiveAvgPoolForward1d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const +{ + if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) + { + return false; + } + if(!IsOverRocmFwd2d(problem)) + { + return false; + } + return true; +} + +ConvSolution AdaptiveAvgPoolForward1d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = 
problem.GetOutputDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_2D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolForward1d", + build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + + size_t N = deref(params.inputDesc).GetLengths()[0]; + size_t C = deref(params.inputDesc).GetLengths()[1]; + size_t H = deref(params.inputDesc).GetLengths()[2]; + size_t W = deref(params.inputDesc).GetLengths()[3]; + size_t OH = deref(params.outputDesc).GetLengths()[2]; + size_t OW = deref(params.outputDesc).GetLengths()[3]; + + kernel(params.input, params.output, N, C, H, W, OH, OW, input_tv, output_tv); + }; + }; + + return result; +} + +} // namespace adaptiveavgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp similarity index 76% rename from src/solver/avgpool/forward_avgpool_2d.cpp rename to src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp index 1c51feb54b..d1afc40842 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp @@ -29,11 
+29,11 @@ #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" #include -#include +#include -#include +#include #include -#include +#include #include #define LOCAL_SIZE_FWD_2D 256 @@ -42,9 +42,9 @@ namespace miopen { namespace solver { -namespace avgpool { +namespace adaptiveavgpool { -bool IsOverRocmFwd2d(const miopen::avgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { auto dtype = problem.GetOutputDesc().GetType(); auto in_nelems = problem.GetInputDesc().GetElementSize(); @@ -76,8 +76,8 @@ bool IsOverRocmFwd2d(const miopen::avgpool::FwdProblemDescription& problem) return false; } -bool AvgPoolForward2d::IsApplicable(const ExecutionContext&, - const miopen::avgpool::FwdProblemDescription& problem) const +bool AdaptiveAvgPoolForward2d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) { @@ -90,9 +90,9 @@ bool AvgPoolForward2d::IsApplicable(const ExecutionContext&, return true; } -ConvSolution -AvgPoolForward2d::GetSolution(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const +ConvSolution AdaptiveAvgPoolForward2d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { std::ignore = context; @@ -110,12 +110,15 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_FWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward2d", build_params)); + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_2D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolForward2d", + build_params)); result.invoker_factory = [](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); decltype(auto) kernel = handle_.Run(kernels.front()); @@ -129,31 +132,14 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, size_t OH = deref(params.outputDesc).GetLengths()[2]; size_t OW = deref(params.outputDesc).GetLengths()[3]; - kernel(params.input, - params.output, - N, - C, - H, - W, - OH, - OW, - params.KH, - params.KW, - params.SH, - params.SW, - params.PH, - params.PW, - params.count_include_pad, - params.divisor_override, - input_tv, - output_tv); + kernel(params.input, params.output, N, C, H, W, OH, OW, input_tv, output_tv); }; }; return result; } -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace solver diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp similarity index 76% rename from src/solver/avgpool/forward_avgpool_3d.cpp rename to src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp index 6f70a07419..cf9bf5a9b9 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp @@ -28,11 +28,11 @@ #include "miopen/execution_context.hpp" #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" -#include +#include -#include +#include #include -#include +#include #include #define LOCAL_SIZE_FWD_3D 256 @@ -41,9 +41,9 @@ namespace miopen { namespace solver { -namespace avgpool { +namespace adaptiveavgpool { -bool 
IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd3d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { auto dtype = problem.GetOutputDesc().GetType(); auto in_nelems = problem.GetInputDesc().GetElementSize(); @@ -80,8 +80,8 @@ bool IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem) return false; } -bool AvgPoolForward3d::IsApplicable(const ExecutionContext&, - const miopen::avgpool::FwdProblemDescription& problem) const +bool AdaptiveAvgPoolForward3d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { if(problem.GetInputDesc().GetNumDims() != 5 || problem.GetOutputDesc().GetNumDims() != 5) { @@ -94,9 +94,9 @@ bool AvgPoolForward3d::IsApplicable(const ExecutionContext&, return true; } -ConvSolution -AvgPoolForward3d::GetSolution(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const +ConvSolution AdaptiveAvgPoolForward3d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { std::ignore = context; @@ -114,12 +114,15 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_FWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward3d", build_params)); + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_3D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolForward3d", + build_params)); result.invoker_factory = [](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); decltype(auto) kernel = handle_.Run(kernels.front()); @@ -135,36 +138,14 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, auto OH = deref(params.outputDesc).GetLengths()[3]; auto OW = deref(params.outputDesc).GetLengths()[4]; - kernel(params.input, - params.output, - N, - C, - D, - H, - W, - OD, - OH, - OW, - params.KD, - params.KH, - params.KW, - params.SD, - params.SH, - params.SW, - params.PD, - params.PH, - params.PW, - params.count_include_pad, - params.divisor_override, - input_tv, - output_tv); + kernel(params.input, params.output, N, C, D, H, W, OD, OH, OW, input_tv, output_tv); }; }; return result; } -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace solver From 2e131c7058cb58c25b470e2fd452b41335cf7e85 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 3 Oct 2024 16:13:11 +0700 Subject: [PATCH 11/38] small fix --- driver/CMakeLists.txt | 2 +- driver/adaptiveavgpool_driver.hpp | 490 +++++++++++++++ driver/avgpool_driver.hpp | 575 ------------------ ...{dm_avgpool.cpp => dm_adaptiveavgpool.cpp} | 14 +- driver/driver.hpp | 7 +- driver/mloAdaptiveAvgPoolHost.hpp | 337 ++++++++++ driver/mloAvgPoolHost.hpp | 438 ------------- include/miopen/miopen.h | 21 +- src/CMakeLists.txt | 2 +- .../adaptiveavgpool/problem_description.hpp | 120 ++++ src/kernels/MIOpenAdaptiveAvgPool.cpp | 461 +++++++------- src/kernels/tensor_view.hpp | 46 +- .../backward_adaptiveavgpool_1d.cpp | 62 
+- .../backward_adaptiveavgpool_2d.cpp | 36 +- .../forward_adaptiveavgpool_1d.cpp | 45 +- .../forward_adaptiveavgpool_2d.cpp | 26 +- test/cpu_adaptiveavgpool.hpp | 311 ++++++++++ test/cpu_avgpool.hpp | 426 ------------- .../{avgpool.cpp => adaptiveavgpool.cpp} | 56 +- test/gtest/adaptiveavgpool.hpp | 380 ++++++++++++ test/gtest/avgpool.hpp | 451 -------------- 21 files changed, 1969 insertions(+), 2337 deletions(-) create mode 100644 driver/adaptiveavgpool_driver.hpp delete mode 100644 driver/avgpool_driver.hpp rename driver/{dm_avgpool.cpp => dm_adaptiveavgpool.cpp} (81%) create mode 100644 driver/mloAdaptiveAvgPoolHost.hpp delete mode 100644 driver/mloAvgPoolHost.hpp create mode 100644 test/cpu_adaptiveavgpool.hpp delete mode 100644 test/cpu_avgpool.hpp rename test/gtest/{avgpool.cpp => adaptiveavgpool.cpp} (64%) create mode 100644 test/gtest/adaptiveavgpool.hpp delete mode 100644 test/gtest/avgpool.hpp diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 385580e2e1..4fd3c033db 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -32,7 +32,7 @@ add_executable(MIOpenDriver dm_activ.cpp dm_adam.cpp dm_addlayernorm.cpp - dm_avgpool.cpp + dm_adaptiveavgpool.cpp dm_bnorm.cpp dm_cat.cpp dm_conv.cpp diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp new file mode 100644 index 0000000000..fd86cf9eec --- /dev/null +++ b/driver/adaptiveavgpool_driver.hpp @@ -0,0 +1,490 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP +#define GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "mloAdaptiveAvgPoolHost.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#include +#include +#include +#include +#include + +template +class AdaptiveAvgPoolDriver : public Driver +{ +public: + AdaptiveAvgPoolDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&inputGradDesc); + miopenCreateTensorDescriptor(&outputGradDesc); + + data_type = miopen_type{}; + } + + std::vector ComputeStrides(std::vector input); + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + std::vector GetInputTensorDimsFromCmd(const char* param); + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~AdaptiveAvgPoolDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(inputGradDesc); + miopenDestroyTensorDescriptor(outputGradDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t inputGradDesc; + miopenTensorDescriptor_t outputGradDesc; + + std::unique_ptr input_dev; + std::unique_ptr output_dev; + std::unique_ptr input_grad_dev; + std::unique_ptr output_grad_dev; + + std::vector input; + std::vector output; + std::vector output_host; + 
std::vector input_grad; + std::vector input_grad_host; + std::vector output_grad; + + size_t N = 1, C = 1, D = 1, H = 1, W = 1, OD = 1, OH = 1, OW = 1; + + std::vector in_dim; + std::vector out_dim; + bool isContiguous; +}; + +template +int AdaptiveAvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + isContiguous = inflags.GetValueInt("is-contiguous") == 1 ? true : false; + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) +{ + std::string lengthsStr = inflags.GetValueStr(param); + + std::vector lengths; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = lengthsStr.find(',', pos); + while(new_pos != std::string::npos) + { + std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); + + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + return (lengths); +} + +template +int AdaptiveAvgPoolDriver::GetandSetData() +{ + in_dim = GetInputTensorDimsFromCmd("input_dims"); + std::vector in_stride = ComputeStrides(in_dim); + out_dim = GetInputTensorDimsFromCmd("output_dims"); + if(in_dim.size() != out_dim.size() + 2) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input and output tensor sizes do not match."); + } + N = in_dim[0]; + C = in_dim[1]; + std::vector out_dim_final = {N, C}; + if(in_dim.size() == 3) + { + H = in_dim[2]; + + OH = out_dim[0]; + out_dim_final.push_back(OH); + } + else if(in_dim.size() == 4) + { + H = in_dim[2]; + W = in_dim[3]; + + OH = out_dim[0]; + OW = out_dim[1]; + out_dim_final.push_back(OH); + out_dim_final.push_back(OW); + } + else if(in_dim.size() == 5) + { + D = in_dim[2]; + H = in_dim[3]; + W = in_dim[4]; + + OD = out_dim[0]; + 
OH = out_dim[1]; + OW = out_dim[2]; + out_dim_final.push_back(OD); + out_dim_final.push_back(OH); + out_dim_final.push_back(OW); + } + std::vector out_grad_stride = ComputeStrides(out_dim_final); + SetTensorNd(inputDesc, in_dim, in_stride, data_type); + SetTensorNd(outputDesc, out_dim_final, data_type); + SetTensorNd(outputGradDesc, out_dim_final, out_grad_stride, data_type); + SetTensorNd(inputGradDesc, in_dim, data_type); + + return miopenStatusSuccess; +} + +// Equivalent to: tensor.tranpose(0, -1).contiguous().tranpose(0, -1) incase contiguous = False +template +std::vector AdaptiveAvgPoolDriver::ComputeStrides(std::vector inputDim) +{ + if(!isContiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!isContiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +int AdaptiveAvgPoolDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AdaptiveAvgPool (Default=1)", "int"); + inflags.AddInputFlag( + "input_dims", + 'D', + "2,3,7,9,9", + "The dimensional lengths of the input tensor: N,C,D,H,W... Example: 2,3,7,9,9.", + "string"); + inflags.AddInputFlag( + "output_dims", + 'S', + "5,5,5", + "The dimensional lengths of the output tensor: OD,OH,OW,... 
Example: 5,5,5.", + "string"); + inflags.AddInputFlag("is-contiguous", 'c', "1", "is-contiguous (Default=1)", "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + input_grad_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + + input = std::vector(input_sz, static_cast(0)); + output = std::vector(output_sz, static_cast(0)); + output_host = std::vector(output_sz, static_cast(0)); + + input_grad = std::vector(input_sz, static_cast(0)); + input_grad_host = std::vector(input_sz, static_cast(0)); + output_grad = std::vector(output_sz, static_cast(0)); + + int status; + + for(int i = 0; i < input_sz; i++) + { + input[i] = prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + } + status = input_dev->ToGPU(q, input.data()); + + status |= output_dev->ToGPU(q, output.data()); + + status |= input_grad_dev->ToGPU(q, input_grad.data()); + + for(int i = 0; i < output_sz; i++) + { + output_grad[i] = prng::gen_A_to_B(static_cast(-1.0), static_cast(1.0)); + } + status |= output_grad_dev->ToGPU(q, output_grad.data()); + + if(status != 0) + std::cout << "Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float 
kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAdaptiveAvgPoolForward( + GetHandle(), inputDesc, input_dev->GetMem(), outputDesc, output_dev->GetMem()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Forward AdaptiveAvgPool Elapsed: %f ms\n", + t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward AdaptiveAvgPool Elapsed: %f ms\n", kernel_average_time); + } + + output_dev->FromGPU(GetStream(), output.data()); + + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::RunForwardCPU() +{ + if(in_dim.size() == 3) + { + mloAdaptiveAvgPoolForward1dRunHost( + inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, OH); + } + else if(in_dim.size() == 4) + { + mloAdaptiveAvgPoolForward2dRunHost( + inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, W, OH, OW); + } + else if(in_dim.size() == 5) + { + mloAdaptiveAvgPoolForward3dRunHost( + inputDesc, outputDesc, input.data(), output_host.data(), N, C, D, H, W, OD, OH, OW); + } + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::RunBackwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAdaptiveAvgPoolBackward(GetHandle(), + outputGradDesc, + output_grad_dev->GetMem(), + inputGradDesc, + input_grad_dev->GetMem()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = 
inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Backward AdaptiveAvgPool Elapsed: %f ms\n", + t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Backward AdaptiveAvgPool Elapsed: %f ms\n", kernel_average_time); + } + + input_grad_dev->FromGPU(GetStream(), input_grad.data()); + + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::RunBackwardCPU() +{ + if(in_dim.size() == 3) + { + mloAdaptiveAvgPoolBackward1dRunHost( + outputGradDesc, inputGradDesc, output_grad.data(), input_grad_host.data(), N, C, H, OH); + } + else if(in_dim.size() == 4) + { + mloAdaptiveAvgPoolBackward2dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + H, + W, + OH, + OW); + } + else if(in_dim.size() == 5) + { + mloAdaptiveAvgPoolBackward3dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW); + } + return miopenStatusSuccess; +} + +template +Tref AdaptiveAvgPoolDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int AdaptiveAvgPoolDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(output_host, output); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward AdaptiveAvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Forward AdaptiveAvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(input_grad_host, input_grad); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Backward AdaptiveAvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Backward AdaptiveAvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp deleted file mode 100644 index ff7d04edd5..0000000000 --- a/driver/avgpool_driver.hpp +++ /dev/null @@ -1,575 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_AVGPOOL_DRIVER_HPP -#define GUARD_MIOPEN_AVGPOOL_DRIVER_HPP - -#include "InputFlags.hpp" -#include "driver.hpp" -#include "mloAvgPoolHost.hpp" -#include "random.hpp" -#include "tensor_driver.hpp" -#include "timer.hpp" - -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> - -#include -#include -#include -#include -#include - -template -class AvgPoolDriver : public Driver -{ -public: - AvgPoolDriver() : Driver() - { - miopenCreateTensorDescriptor(&inputDesc); - miopenCreateTensorDescriptor(&outputDesc); - miopenCreateTensorDescriptor(&inputGradDesc); - miopenCreateTensorDescriptor(&outputGradDesc); - - data_type = miopen_type{}; - } - - int AddCmdLineArgs() override; - int ParseCmdLineArgs(int argc, char* argv[]) override; - InputFlags& GetInputFlags() override { return inflags; } - - std::vector GetInputTensorDimsFromCmd(const char* param); - int GetandSetData() override; - - int AllocateBuffersAndCopy() override; - - int RunForwardGPU() override; - int RunForwardCPU(); - - int RunBackwardGPU() override; - int RunBackwardCPU(); - - Tref GetTolerance(); - int VerifyBackward() override; - int VerifyForward() override; - ~AvgPoolDriver() override - { - miopenDestroyTensorDescriptor(inputDesc); - miopenDestroyTensorDescriptor(outputDesc); - miopenDestroyTensorDescriptor(inputGradDesc); - miopenDestroyTensorDescriptor(outputGradDesc); - } - -private: - InputFlags 
inflags; - - int forw; - - miopenTensorDescriptor_t inputDesc; - miopenTensorDescriptor_t outputDesc; - miopenTensorDescriptor_t inputGradDesc; - miopenTensorDescriptor_t outputGradDesc; - - std::unique_ptr input_dev; - std::unique_ptr output_dev; - std::unique_ptr input_grad_dev; - std::unique_ptr output_grad_dev; - - std::vector input; - std::vector output; - std::vector output_host; - std::vector input_grad; - std::vector input_grad_host; - std::vector output_grad; - std::vector ksize; - std::vector stride; - std::vector padding; - - bool ceil_mode; - bool count_include_pad; - int32_t divisor_override; - int32_t N, C, D, H, W, OD, OH, OW; - - std::vector in_dim; -}; - -template -int AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) -{ - inflags.Parse(argc, argv); - - if(inflags.GetValueInt("time") == 1) - { - miopenEnableProfiling(GetHandle(), true); - } - return miopenStatusSuccess; -} - -template -std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) -{ - std::string lengthsStr = inflags.GetValueStr(param); - - std::vector lengths; - std::size_t pos = 0; - std::size_t new_pos; - - new_pos = lengthsStr.find(',', pos); - while(new_pos != std::string::npos) - { - std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); - - int len = std::stoi(sliceStr); - - lengths.push_back(len); - - pos = new_pos + 1; - new_pos = lengthsStr.find(',', pos); - }; - - std::string sliceStr = lengthsStr.substr(pos); - int len = std::stoi(sliceStr); - - lengths.push_back(len); - - return (lengths); -} - -template -int AvgPoolDriver::GetandSetData() -{ - in_dim = GetInputTensorDimsFromCmd("input_dims"); - int ksp_dim = in_dim.size() - 2; - ksize = GetInputTensorDimsFromCmd("kernel_size"); - stride = GetInputTensorDimsFromCmd("stride"); - padding = GetInputTensorDimsFromCmd("padding"); - - if(ksize.size() != ksp_dim) - { - int ref = ksp_dim - ksize.size(); - while((ref--) != 0) - ksize.push_back(ksize[0]); - } - if(stride.size() != ksp_dim) - { - int ref = 
ksp_dim - stride.size(); - while((ref--) != 0) - stride.push_back(stride[0]); - } - if(padding.size() != ksp_dim) - { - int ref = ksp_dim - padding.size(); - while((ref--) != 0) - padding.push_back(padding[0]); - } - - ceil_mode = static_cast(inflags.GetValueInt("ceil_mode")); - count_include_pad = static_cast(inflags.GetValueInt("count_include_pad")); - divisor_override = inflags.GetValueInt("divisor_override"); - - N = in_dim[0]; - C = in_dim[1]; - D = in_dim.size() == 5 ? in_dim[2] : 1; - H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; - W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; - - std::vector out_dim; - if(in_dim.size() == 5) - { - if(ceil_mode) - { - OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - else - { - OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - out_dim = std::vector{N, C, OD, OH, OW}; - } - else - { - if(ceil_mode) - { - OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - else - { - OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - out_dim = std::vector{N, C, OH, OW}; - } - SetTensorNd(inputDesc, in_dim, data_type); - SetTensorNd(outputDesc, out_dim, data_type); - SetTensorNd(outputGradDesc, out_dim, data_type); - SetTensorNd(inputGradDesc, in_dim, data_type); - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::AddCmdLineArgs() -{ - inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AvgPool (Default=1)", "int"); - 
inflags.AddInputFlag( - "input_dims", - 'D', - "2,3,7,9", - "The dimensional lengths of the input tensor: N,C,D1,D2,... Example: 2,3,7,9.", - "string"); - inflags.AddInputFlag( - "kernel_size", 'k', "1,1", "The size of the window D1,D2,... Example: 1,1.", "string"); - inflags.AddInputFlag( - "stride", - 's', - "1,1", - "The stride of the window. Default value is kernel_size D1,D2,... Example: 1,1.", - "string"); - inflags.AddInputFlag("padding", - 'p', - "0,0", - "Implicit zero padding to be added on both sides D1,D2,... Example: 0,0.", - "string"); - inflags.AddInputFlag("ceil_mode", - 'c', - "1", - "When 1, will use ceil instead of floor to compute the output shape.", - "int"); - inflags.AddInputFlag("count_include_pad", - 'P', - "0", - "When 1, will include the zero-padding in the averaging calculation.", - "int"); - inflags.AddInputFlag("divisor_override", - 'd', - "0", - "If specified, it will be used as divisor, otherwise size of the pooling " - "region will be used.", - "int"); - - inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); - inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); - inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); - inflags.AddInputFlag( - "wall", 'w', "0", "Wall-clock Time, Requires time == 1 (Default=0)", "int"); - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::AllocateBuffersAndCopy() -{ - size_t input_sz = GetTensorSize(inputDesc); - size_t output_sz = GetTensorSize(outputDesc); - - uint32_t ctx = 0; - - input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); - output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); - input_grad_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); - output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); - - input = std::vector(input_sz, static_cast(0)); - output = std::vector(output_sz, static_cast(0)); - output_host = std::vector(output_sz, 
static_cast(0)); - - input_grad = std::vector(input_sz, static_cast(0)); - input_grad_host = std::vector(input_sz, static_cast(0)); - output_grad = std::vector(output_sz, static_cast(0)); - - int status; - - for(int i = 0; i < input_sz; i++) - { - input[i] = prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); - } - status = input_dev->ToGPU(q, input.data()); - - status |= output_dev->ToGPU(q, output.data()); - - status |= input_grad_dev->ToGPU(q, input_grad.data()); - - for(int i = 0; i < output_sz; i++) - { - output_grad[i] = prng::gen_A_to_B(static_cast(-1.0), static_cast(1.0)); - } - status |= output_grad_dev->ToGPU(q, output_grad.data()); - - if(status != 0) - std::cout << "Error copying data to GPU\n" << std::endl; - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::RunForwardGPU() -{ - float kernel_total_time = 0.0; - float kernel_first_time = 0.0; - - Timer t; - START_TIME - - for(int i = 0; i < inflags.GetValueInt("iter"); i++) - { - miopenAvgPoolForward(GetHandle(), - inputDesc, - input_dev->GetMem(), - outputDesc, - output_dev->GetMem(), - ksize.size() == 3 ? ksize[0] : 0, - ksize.size() == 3 ? ksize[1] : ksize[0], - ksize.size() == 3 ? ksize[2] : ksize[1], - stride.size() == 3 ? stride[0] : 0, - stride.size() == 3 ? stride[1] : stride[0], - stride.size() == 3 ? stride[2] : stride[1], - padding.size() == 3 ? padding[0] : 0, - padding.size() == 3 ? padding[1] : padding[0], - padding.size() == 3 ? padding[2] : padding[1], - count_include_pad, - divisor_override); - - float time = 0.0; - miopenGetKernelTime(GetHandle(), &time); - kernel_total_time += time; - if(i == 0) - kernel_first_time = time; - } - - if(inflags.GetValueInt("time") == 1) - { - STOP_TIME - int iter = inflags.GetValueInt("iter"); - if(WALL_CLOCK) - printf("Wall-clock Time Forward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); - - float kernel_average_time = - iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - printf("GPU Kernel Time Forward AvgPool Elapsed: %f ms\n", kernel_average_time); - } - - output_dev->FromGPU(GetStream(), output.data()); - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::RunForwardCPU() -{ - if(in_dim.size() == 4) - { - mloAvgPoolForward2dRunHost(inputDesc, - outputDesc, - input.data(), - output_host.data(), - N, - C, - H, - W, - OH, - OW, - ksize.data(), - stride.data(), - padding.data(), - count_include_pad, - divisor_override); - } - else if(in_dim.size() == 5) - { - mloAvgPoolForward3dRunHost(inputDesc, - outputDesc, - input.data(), - output_host.data(), - N, - C, - D, - H, - W, - OD, - OH, - OW, - ksize.data(), - stride.data(), - padding.data(), - count_include_pad, - divisor_override); - } - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::RunBackwardGPU() -{ - float kernel_total_time = 0.0; - float kernel_first_time = 0.0; - - Timer t; - START_TIME - - for(int i = 0; i < inflags.GetValueInt("iter"); i++) - { - miopenAvgPoolBackward(GetHandle(), - outputGradDesc, - output_grad_dev->GetMem(), - inputGradDesc, - input_grad_dev->GetMem(), - ksize.size() == 3 ? ksize[0] : 0, - ksize.size() == 3 ? ksize[1] : ksize[0], - ksize.size() == 3 ? ksize[2] : ksize[1], - stride.size() == 3 ? stride[0] : 0, - stride.size() == 3 ? stride[1] : stride[0], - stride.size() == 3 ? stride[2] : stride[1], - padding.size() == 3 ? padding[0] : 0, - padding.size() == 3 ? padding[1] : padding[0], - padding.size() == 3 ? 
padding[2] : padding[1], - count_include_pad, - divisor_override); - - float time = 0.0; - miopenGetKernelTime(GetHandle(), &time); - kernel_total_time += time; - if(i == 0) - kernel_first_time = time; - } - - if(inflags.GetValueInt("time") == 1) - { - STOP_TIME - int iter = inflags.GetValueInt("iter"); - if(WALL_CLOCK) - printf("Wall-clock Time Backward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); - - float kernel_average_time = - iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - printf("GPU Kernel Time Backward AvgPool Elapsed: %f ms\n", kernel_average_time); - } - - input_grad_dev->FromGPU(GetStream(), input_grad.data()); - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::RunBackwardCPU() -{ - if(in_dim.size() == 4) - { - mloAvgPoolBackward2dRunHost(outputGradDesc, - inputGradDesc, - output_grad.data(), - input_grad_host.data(), - N, - C, - H, - W, - OH, - OW, - ksize.data(), - stride.data(), - padding.data(), - count_include_pad, - divisor_override); - } - else if(in_dim.size() == 5) - { - mloAvgPoolBackward3dRunHost(outputGradDesc, - inputGradDesc, - output_grad.data(), - input_grad_host.data(), - N, - C, - D, - H, - W, - OD, - OH, - OW, - ksize.data(), - stride.data(), - padding.data(), - count_include_pad, - divisor_override); - } - return miopenStatusSuccess; -} - -template -Tref AvgPoolDriver::GetTolerance() -{ - // Computation error of fp16 is ~2^13 (=8192) bigger than - // the one of fp32 because mantissa is shorter by 13 bits. - auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; - - // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
- if(std::is_same::value) - tolerance *= 8.0; - return tolerance; -} - -template -int AvgPoolDriver::VerifyForward() -{ - RunForwardCPU(); - const Tref tolerance = GetTolerance(); - auto error = miopen::rms_range(output_host, output); - - if(!std::isfinite(error) || error > tolerance) - { - std::cout << "Forward AvgPool FAILED: " << error << std::endl; - return EC_VerifyFwd; - } - else - { - printf("Forward AvgPool Verifies on CPU and GPU (err=%f)\n", error); - } - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::VerifyBackward() -{ - RunBackwardCPU(); - const Tref tolerance = GetTolerance(); - auto error = miopen::rms_range(input_grad_host, input_grad); - - if(!std::isfinite(error) || error > tolerance) - { - std::cout << "Backward AvgPool FAILED: " << error << std::endl; - return EC_VerifyFwd; - } - else - { - printf("Backward AvgPool Verifies on CPU and GPU (err=%f)\n", error); - } - return miopenStatusSuccess; -} - -#endif // GUARD_MIOPEN_AVGPOOL_DRIVER_HPP diff --git a/driver/dm_avgpool.cpp b/driver/dm_adaptiveavgpool.cpp similarity index 81% rename from driver/dm_avgpool.cpp rename to driver/dm_adaptiveavgpool.cpp index ec0e457056..b6e53ba17e 100644 --- a/driver/dm_avgpool.cpp +++ b/driver/dm_adaptiveavgpool.cpp @@ -24,16 +24,16 @@ * *******************************************************************************/ #include "registry_driver_maker.hpp" -#include "avgpool_driver.hpp" +#include "adaptiveavgpool_driver.hpp" static Driver* makeDriver(const std::string& base_arg) { - if(base_arg == "avgpool") - return new AvgPoolDriver(); - if(base_arg == "avgpoolfp16") - return new AvgPoolDriver(); - if(base_arg == "avgpoolbfp16") - return new AvgPoolDriver(); + if(base_arg == "adaptiveavgpool") + return new AdaptiveAvgPoolDriver(); + if(base_arg == "adaptiveavgpoolfp16") + return new AdaptiveAvgPoolDriver(); + if(base_arg == "adaptiveavgpoolbfp16") + return new AdaptiveAvgPoolDriver(); return nullptr; } diff --git a/driver/driver.hpp 
b/driver/driver.hpp index bd42f6ee13..15e20ad55d 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], avgpool[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], adaptiveavgpool[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -206,8 +206,9 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && - arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "avgpool" && - arg != "avgpoolfp16" && arg != "avgpoolbfp16" && arg != "--version") + arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && + arg != "adaptiveavgpool" && arg != "adaptiveavgpoolfp16" && arg != "adaptiveavgpoolbfp16" && + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp new file mode 100644 index 0000000000..1c45f16213 --- /dev/null +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -0,0 +1,337 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MLO_ADAPTIVEAVGPOOLHOST_H_ +#define MLO_ADAPTIVEAVGPOOLHOST_H_ + +#include +#include +#include + +template +int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t H, + size_t OH) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(outputDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nc = gid / OH, oh = gid % OH; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + + float sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) + { + sum += static_cast(input[input_tv.get_tensor_view_idx({n, c, ih})]); + } + + output[output_tv.get_tensor_view_idx({n, c, oh})] = static_cast(sum / kh); + } + return 0; +} + +template +int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncoh = gid / OW, ow = gid % OW; + size_t nc = ncoh / OH, oh = ncoh % OH; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + 
size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + + size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + + float divider = static_cast(kh * kw); + float sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) + { + for(size_t iw = w; iw < (w + kw); ++iw) + { + sum += static_cast(input[input_tv.get_tensor_view_idx({n, c, ih, iw})]); + } + } + + output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(sum / divider); + } + return 0; +} + +template +int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncodoh = gid / OW, ow = gid % OW; + size_t ncod = ncodoh / OH, oh = ncodoh % OH; + size_t nc = ncod / OD, od = ncod % OD; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t d = static_cast(std::floor(static_cast(od * D) / OD)); + size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + + size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + + float sum = 0; + for(size_t id = d; id < (d + kd); ++id) + { + for(size_t ih = h; ih < (h + kh); ++ih) + { + for(size_t iw = w; iw < (w + kw); ++iw) + { + sum += + static_cast(input[input_tv.get_tensor_view_idx({n, c, id, ih, iw})]); 
+ } + } + } + + output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = + static_cast(sum / (kd * kh * kw)); + } + return 0; +} + +template +int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t H, + size_t OH) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nc = gid / H, h = gid % H; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + float grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(std::floor(static_cast(ih * H) / OH)); + grad += + static_cast(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / + kh; + } + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = static_cast(grad); + } + return 0; +} + +template +int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nch = gid / W, w = gid 
% W; + size_t nc = nch / H, h = nch % H; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); + size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + + float grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(std::floor(static_cast(ih * H) / OH)); + for(size_t iw = ow; iw < (ow + kow); ++iw) + { + size_t kw = static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(std::floor(static_cast(iw * W) / OW)); + grad += static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / + (kh * kw); + } + } + + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); + } + return 0; +} + +template +int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncdh = gid / W, w = gid % W; + size_t ncd = ncdh / H, h = ncdh % H; + size_t nc = ncd / D, d = ncd % D; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t od = static_cast(std::floor(static_cast(d * OD) / D)); + size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t 
koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); + size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + + float grad = 0; + for(size_t id = od; id < (od + kod); ++id) + { + size_t kd = static_cast(std::ceil(static_cast((id + 1) * D) / OD)) - + static_cast(std::floor(static_cast(id * D) / OD)); + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(std::floor(static_cast(ih * H) / OH)); + for(size_t iw = ow; iw < (ow + kow); ++iw) + { + size_t kw = + static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(std::floor(static_cast(iw * W) / OW)); + grad += + static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / + (kd * kh * kw); + } + } + } + + input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); + } + return 0; +} + +#endif // MLO_ADAPTIVEAVGPOOLHOST_H_ diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp deleted file mode 100644 index 6980ce968e..0000000000 --- a/driver/mloAvgPoolHost.hpp +++ /dev/null @@ -1,438 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef MLO_AVGPOOLHOST_H_ -#define MLO_AVGPOOLHOST_H_ - -#include -#include - -template -int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, - const miopenTensorDescriptor_t outputDesc, - Tgpu* input, - Tcheck* output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = miopen::deref(inputDesc).GetLengths(); - auto numel = miopen::deref(outputDesc).GetElementSize(); - - auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); - auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); - - for(int32_t gid = 0; gid < numel; gid++) - { - int32_t ncoh = gid / OW, ow = gid % OW; - int32_t nc = ncoh / OH, oh = ncoh % OH; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; - - if(n >= N) - return 0; - - float m = 0; - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, h, w) - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - m += static_cast( - 
input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); - } - } - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - float val = m / divide_factor; - - output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = - static_cast(val); - } - return 0; -} - -template -int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, - const miopenTensorDescriptor_t outputDesc, - Tgpu* input, - Tcheck* output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = miopen::deref(inputDesc).GetLengths(); - auto numel = miopen::deref(outputDesc).GetElementSize(); - - auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); - auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); - - for(int32_t gid = 0; gid < numel; gid++) - { - int32_t ncodoh = gid / OW, ow = gid % OW; - int32_t ncod = ncodoh / OH, oh = ncodoh % OH; - int32_t nc = ncod / OD, od = ncod % OD; - int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; - - if(n >= N) - return 0; - float sum = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - 
for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, d, h, w) - int32_t d = od * sd - pd + kd; - if(d < 0 || d >= D) - continue; - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - sum += static_cast( - input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); - } - } - } - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - float val = sum / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = - static_cast(val); - } - return 0; -} - -template -int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, - const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, - Tcheck* input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = miopen::deref(inputGradDesc).GetLengths(); - auto numel = miopen::deref(inputGradDesc).GetElementSize(); - - auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); - auto input_grad_tv = 
miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); - - for(size_t gid = 0; gid < numel; gid++) - { - int32_t nch = gid / W, w = gid % W; - int32_t nc = nch / H, h = nch % H; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; - - if(n >= N) - return 0; - - float grad = 0; - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - - grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<4>(n, c, oh, ow))]) / - divide_factor; - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = - static_cast(grad); - } - return 0; -} - -template -int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, - const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, - Tcheck* input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = 
miopen::deref(inputGradDesc).GetLengths(); - auto numel = miopen::deref(inputGradDesc).GetElementSize(); - - auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); - auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); - - for(size_t gid = 0; gid < numel; gid++) - { - int32_t ncdh = gid / W, w = gid % W; - int32_t ncd = ncdh / H, h = ncdh % H; - int32_t nc = ncd / D, d = ncd % D; - int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; - - if(n >= N) - return 0; - - float grad = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t odsd = d + pd - kd; - if(odsd % sd != 0) - continue; - int32_t od = odsd / sd; - if(od < 0 || od >= OD) - continue; - - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - grad += 
static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<5>(n, c, od, oh, ow))]) / - divide_factor; - } - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = - static_cast(grad); - } - return 0; -} - -#endif // MLO_AVGPOOLHOST_H_ diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 57aeeb5d3b..2e44b62588 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7638,11 +7638,11 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, * @param output Data tensor output (output) * @return miopenStatus_t */ -MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output); +MIOPEN_EXPORT miopenStatus_t miopenAdaptiveAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output); /*! 
@brief Execute an adaptiveavgpool backward layer * @@ -7653,11 +7653,12 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, * @param input_grad Data tensor input grad (output) * @return miopenStatus_t */ -MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t outputGradDesc, - const void* output_grad, - const miopenTensorDescriptor_t inputGradDesc, - void* input_grad); +MIOPEN_EXPORT miopenStatus_t +miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad); /** @} */ // CLOSEOUT adaptiveavgpool DOXYGEN GROUP #endif // MIOPEN_BETA_API diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ae621b28ad..f46579e007 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -490,7 +490,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ${GPU_BATCHED_TRANSPOSE_KERNEL_HIP} ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_SOURCE} kernels/MIOpenAdam.cpp - kernels/MIOpenAvgPool.cpp + kernels/MIOpenAdaptiveAvgPool.cpp kernels/MIOpenCat.cpp kernels/MIOpenCheckNumerics.cpp kernels/MIOpenBatchNormActivBwdPerAct.cl diff --git a/src/include/miopen/adaptiveavgpool/problem_description.hpp b/src/include/miopen/adaptiveavgpool/problem_description.hpp index 53be89cd89..adec5759e7 100644 --- a/src/include/miopen/adaptiveavgpool/problem_description.hpp +++ b/src/include/miopen/adaptiveavgpool/problem_description.hpp @@ -42,6 +42,7 @@ struct FwdProblemDescription : ProblemDescriptionBase : inputDesc(inputDesc_), outputDesc(outputDesc_) { IsValidLength(); + IsValidDims(); } auto GetInputDesc() const { return inputDesc; } @@ -59,9 +60,68 @@ struct FwdProblemDescription : ProblemDescriptionBase "AdaptiveAvgPool: Input and output tensor sizes do not match."); } + if(input_dims == 3) + { + if(outputDesc.GetLengths()[2] > inputDesc.GetLengths()[2]) + { + 
MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input tensor sizes are too small compare to output " + "tensor sizes."); + } + } + else if(input_dims == 4) + { + if(outputDesc.GetLengths()[2] > inputDesc.GetLengths()[2] || + outputDesc.GetLengths()[3] > inputDesc.GetLengths()[3]) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input tensor sizes are too small compare to output " + "tensor sizes."); + } + } + else if(input_dims == 5) + { + if(outputDesc.GetLengths()[2] > inputDesc.GetLengths()[2] || + outputDesc.GetLengths()[3] > inputDesc.GetLengths()[3] || + outputDesc.GetLengths()[4] > inputDesc.GetLengths()[4]) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input tensor sizes are too small compare to output " + "tensor sizes."); + } + } + + return true; + } + + bool IsValidDims() const + { + if(inputDesc.GetLengths().size() > 5 || inputDesc.GetLengths().size() < 3) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Only 3D, 4D and 5D tensors are supported."); + } + return true; } + bool IsAllContiguous() const + { + auto isContiguous = [](TensorDescriptor td) { + size_t s = 1; + for(int i = td.GetNumDims() - 1; i >= 0; --i) + { + if(s != td.GetStrides()[i]) + { + return false; + } + s *= td.GetLengths()[i]; + } + return true; + }; + return isContiguous(inputDesc) && isContiguous(outputDesc); + } + NetworkConfig MakeNetworkConfig() const override; protected: @@ -76,6 +136,7 @@ struct BwdProblemDescription : ProblemDescriptionBase : outputGradDesc(outputGradDesc_), inputGradDesc(inputGradDesc_) { IsValidLength(); + IsValidDims(); } auto GetOutputGradDesc() const { return outputGradDesc; } @@ -93,9 +154,68 @@ struct BwdProblemDescription : ProblemDescriptionBase "AdaptiveAvgPool: Input grad and output grad tensor sizes do not match."); } + if(input_dims == 3) + { + if(outputGradDesc.GetLengths()[2] > inputGradDesc.GetLengths()[2]) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input grad tensor sizes are too 
small compare to " + "output grad tensor sizes."); + } + } + else if(input_dims == 4) + { + if(outputGradDesc.GetLengths()[2] > inputGradDesc.GetLengths()[2] || + outputGradDesc.GetLengths()[3] > inputGradDesc.GetLengths()[3]) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input grad tensor sizes are too small compare to " + "output grad tensor sizes."); + } + } + else if(input_dims == 5) + { + if(outputGradDesc.GetLengths()[2] > inputGradDesc.GetLengths()[2] || + outputGradDesc.GetLengths()[3] > inputGradDesc.GetLengths()[3] || + outputGradDesc.GetLengths()[4] > inputGradDesc.GetLengths()[4]) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input grad tensor sizes are too small compare to " + "output grad tensor sizes."); + } + } + + return true; + } + + bool IsValidDims() const + { + if(inputGradDesc.GetLengths().size() > 5 || inputGradDesc.GetLengths().size() < 3) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Only 3D, 4D and 5D tensors are supported."); + } + return true; } + bool IsAllContiguous() const + { + auto isContiguous = [](TensorDescriptor td) { + size_t s = 1; + for(int i = td.GetNumDims() - 1; i >= 0; --i) + { + if(s != td.GetStrides()[i]) + { + return false; + } + s *= td.GetLengths()[i]; + } + return true; + }; + return isContiguous(inputGradDesc) && isContiguous(outputGradDesc); + } + NetworkConfig MakeNetworkConfig() const override; protected: diff --git a/src/kernels/MIOpenAdaptiveAvgPool.cpp b/src/kernels/MIOpenAdaptiveAvgPool.cpp index d29a03ab1d..17877fdf0c 100644 --- a/src/kernels/MIOpenAdaptiveAvgPool.cpp +++ b/src/kernels/MIOpenAdaptiveAvgPool.cpp @@ -23,7 +23,6 @@ * SOFTWARE. 
* *******************************************************************************/ -#include #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS #include #include @@ -41,14 +40,14 @@ #endif template -__device__ void avgPoolForward1d(const TI* __restrict__ input, - TO* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t OH, - tensor_view_t<3> input_tv, - tensor_view_t<3> output_tv) +__device__ void adaptiveAvgPoolForward1d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> input_tv, + tensor_view_t<3> output_tv) { size_t gid = threadIdx.x + blockIdx.x * blockDim.x; size_t nc = gid / OH, oh = gid % OH; @@ -56,38 +55,38 @@ __device__ void avgPoolForward1d(const TI* __restrict__ input, if(n >= N) return; - int32_t h = (int32_t)floor((float)(oh * H) / OH); - int32_t kh = (int32_t)ceil((float)((oh + 1) * H) / OH) - h; + size_t h = static_cast(floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; - DTYPE_ACCURATE sum = 0; - for(int ih = h; ih < (h + kh); ++ih) + FLOAT_ACCUM sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) { - sum += GET_3D_VAL_AT(input, n, c, ih); + sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, ih})]); } - - SET_3D_VAL_AT(output, n, c, oh, sum / kh); + output[output_tv.get_tensor_view_idx({n, c, oh})] = CVT_ACCUM2FLOAT(sum / kh); } -extern "C" __global__ void AvgPoolForward1d(const INPUT_TYPE* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t OH, - tensor_view_t<3> input_tv, - tensor_view_t<3> output_tv) +extern "C" __global__ void AdaptiveAvgPoolForward1d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> input_tv, + tensor_view_t<3> output_tv) { - avgPoolForward1d(input, output, N, C, H, OH, input_tv, output_tv); + adaptiveAvgPoolForward1d( + input, output, N, C, H, 
OH, input_tv, output_tv); } template -__device__ void avgPoolBackward1d(const TI* __restrict__ output_grad, - TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t OH, - tensor_view_t<3> output_grad_tv, - tensor_view_t<3> input_grad_tv) +__device__ void adaptiveAvgPoolBackward1d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> output_grad_tv, + tensor_view_t<3> input_grad_tv) { size_t gid = threadIdx.x + blockIdx.x * blockDim.x; size_t nc = gid / H, h = gid % H; @@ -95,310 +94,276 @@ __device__ void avgPoolBackward1d(const TI* __restrict__ output_grad, if(n >= N) return; - int32_t oh = (int32_t)floor((float)(h * OH) / H); - int32_t koh = (int32_t)ceil((float)((h + 1) * OH) / H) - oh; + size_t oh = static_cast(floor(static_cast(h * OH) / H)); + size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; - DTYPE_ACCURATE grad = 0; - for(int ih = oh; ih < (oh + koh); ++ih) + FLOAT_ACCUM grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) { - int32_t kh = - (int32_t)ceil((float)((ih + 1) * H) / OH) - (int32_t)floor((float)(ih * H) / OH); - grad += GET_3D_VAL_AT(output_grad, n, c, ih) / kh; + size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / kh; } - - SET_3D_VAL_AT(input_grad, n, c, h, grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = CVT_ACCUM2FLOAT(grad); } -extern "C" __global__ void AvgPoolBackward1d(const INPUT_TYPE* __restrict__ output_grad, - OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t OH, - tensor_view_t<3> output_grad_tv, - tensor_view_t<3> input_grad_tv) + +extern "C" __global__ void AdaptiveAvgPoolBackward1d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t OH, + 
tensor_view_t<3> output_grad_tv, + tensor_view_t<3> input_grad_tv) { - avgPoolBackward1d( + adaptiveAvgPoolBackward1d( output_grad, input_grad, N, C, H, OH, output_grad_tv, input_grad_tv); } template -__device__ void avgPoolForward2d(const TI* __restrict__ input, - TO* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor_view_t<4> input_tv, - tensor_view_t<4> output_tv) +__device__ void adaptiveAvgPoolForward2d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncoh = gid / OW, ow = gid % OW; - int32_t nc = ncoh / OH, oh = ncoh % OH; - int32_t n = nc / C, c = nc % C; + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t ncoh = gid / OW, ow = gid % OW; + size_t nc = ncoh / OH, oh = ncoh % OH; + size_t n = nc / C, c = nc % C; if(n >= N) return; - size_t h = (size_t)floor((float)(oh * H) / OH); - size_t kh = (size_t)ceil((float)((oh + 1) * H) / OH) - h; + size_t h = static_cast(floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; - size_t w = (size_t)floor((float)(ow * W) / OW); - size_t kw = (size_t)ceil((float)((ow + 1) * W) / OW) - w; + size_t w = static_cast(floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; - FSTYPE divider = (FSTYPE)(kh * kw); - FSTYPE sum = 0; + FLOAT_ACCUM divider = static_cast(kh * kw); + FLOAT_ACCUM sum = 0; for(size_t ih = h; ih < (h + kh); ++ih) { for(size_t iw = w; iw < (w + kw); ++iw) { - sum += GET_4D_VAL_AT(input, n, c, ih, iw); + sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, ih, iw})]); } } - - SET_4D_VAL_AT(output, n, c, oh, ow, sum / divider); - - output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = CVT_ACCUM2FLOAT(val); + 
output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = CVT_ACCUM2FLOAT(sum / divider); } -extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor_view_t<4> input_tv, - tensor_view_t<4> output_tv) +extern "C" __global__ void AdaptiveAvgPoolForward2d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) { - avgPoolForward2d( + adaptiveAvgPoolForward2d( input, output, N, C, H, W, OH, OW, input_tv, output_tv); } template -__device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, - TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor_view_t<4> output_grad_tv, - tensor_view_t<4> input_grad_tv) +__device__ void adaptiveAvgPoolBackward2d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t nch = gid / W, w = gid % W; - int32_t nc = nch / H, h = nch % H; - int32_t n = nc / C, c = nc % C; + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t nch = gid / W, w = gid % W; + size_t nc = nch / H, h = nch % H; + size_t n = nc / C, c = nc % C; if(n >= N) return; - size_t oh = (size_t)floor((float)(h * OH) / H); - size_t koh = (size_t)ceil((float)((h + 1) * OH) / H) - oh; + size_t oh = static_cast(floor(static_cast(h * OH) / H)); + size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; - size_t ow = (size_t)floor((float)(w * OW) / W); - size_t kow = (size_t)ceil((float)((w + 1) * OW) / W) - ow; + size_t ow = static_cast(floor(static_cast(w * OW) / W)); + size_t kow = 
static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; FLOAT_ACCUM grad = 0; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = (size_t)ceil((float)((ih + 1) * H) / OH) - (size_t)floor((float)(ih * H) / OH); + size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); for(size_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = - (size_t)ceil((float)((iw + 1) * W) / OW) - (size_t)floor((float)(iw * W) / OW); - grad += (FSTYPE)(GET_4D_VAL_AT(output_grad, n, c, ih, iw)) / (kh * kw); + size_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(floor(static_cast(iw * W) / OW)); + grad += + CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / + (kh * kw); } } - SET_4D_VAL_AT(input_grad, n, c, h, w, grad); - - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = - CVT_ACCUM2FLOAT(grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = CVT_ACCUM2FLOAT(grad); } -extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, - OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor_view_t<4> output_grad_tv, - tensor_view_t<4> input_grad_tv) +extern "C" __global__ void AdaptiveAvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) { - avgPoolBackward2d( + adaptiveAvgPoolBackward2d( output_grad, input_grad, N, C, H, W, OH, OW, output_grad_tv, input_grad_tv); } -// __kernel void AdaptiveAvgpool2dBackward1x1OutputNHWC(const __global DTYPE_PTR output_grad, -// __global DTYPE_PTR input_grad, -// const int32_t N, -// const int32_t C, -// const int32_t HW, -// const int32_t output_grad_off, -// const int32_t input_grad_off) -// { -// /* VSIZE 2 and 16 is 
fastest but don't know why */ -// #define VSIZE 2 -// size_t gid = get_global_id(0) * VSIZE; -// size_t c = gid % C; -// size_t n = gid / C; -// if(n >= N) -// return; - -// __global DTYPE_VEC_PTR(VSIZE) output_grad_vec = -// (__global DTYPE_VEC_PTR(VSIZE))(output_grad + n * C + c + output_grad_off); - -// DTYPE_VEC(VSIZE) output_grad_v = GET(output_grad_vec, 0) / HW; - -// __global DTYPE_VEC_PTR(VSIZE) input_grad_vec = -// (__global DTYPE_VEC_PTR(VSIZE))(input_grad + n * C * HW + c + input_grad_off); - -// for(size_t i = 0; i < HW; ++i) -// { -// SET(input_grad_vec, i * C / VSIZE, output_grad_v); -// } -// #undef VSIZE -// } - template -__device__ void avgPoolForward3d(const TI* __restrict__ input, - TO* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor_view_t<5> input_tv, - tensor_view_t<5> output_tv) +__device__ void adaptiveAvgPoolForward3d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncodoh = gid / OW, ow = gid % OW; - int32_t ncod = ncodoh / OH, oh = ncodoh % OH; - int32_t nc = ncod / OD, od = ncod % OD; - int32_t n = nc / C, c = nc % C; + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t ncodoh = gid / OW, ow = gid % OW; + size_t ncod = ncodoh / OH, oh = ncodoh % OH; + size_t nc = ncod / OD, od = ncod % OD; + size_t n = nc / C, c = nc % C; if(n >= N) return; - int32_t d = (int32_t)floor((float)(od * D) / OD); - int32_t kd = (int32_t)ceil((float)((od + 1) * D) / OD) - d; + size_t d = static_cast(floor(static_cast(od * D) / OD)); + size_t kd = static_cast(ceil(static_cast((od + 1) * D) / OD)) - d; - int32_t h = (int32_t)floor((float)(oh * H) / OH); - int32_t kh = (int32_t)ceil((float)((oh + 1) * H) / OH) - h; + size_t h = 
static_cast(floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; - int32_t w = (int32_t)floor((float)(ow * W) / OW); - int32_t kw = (int32_t)ceil((float)((ow + 1) * W) / OW) - w; + size_t w = static_cast(floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; - DTYPE_ACCURATE sum = 0; - for(int32_t id = d; id < (d + kd); ++id) + FLOAT_ACCUM sum = 0; + for(size_t id = d; id < (d + kd); ++id) { - for(int32_t ih = h; ih < (h + kh); ++ih) + for(size_t ih = h; ih < (h + kh); ++ih) { - for(int32_t iw = w; iw < (w + kw); ++iw) + for(size_t iw = w; iw < (w + kw); ++iw) { - sum += GET_5D_VAL_AT(input, n, c, id, ih, iw); + sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, id, ih, iw})]); } } } - output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = - CVT_ACCUM2FLOAT(val); - SET_5D_VAL_AT(output, n, c, od, oh, ow, sum / (kd * kh * kw)); + output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = + CVT_ACCUM2FLOAT(sum / (kd * kh * kw)); } -extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor_view_t<5> input_tv, - tensor_view_t<5> output_tv) +extern "C" __global__ void AdaptiveAvgPoolForward3d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) { - avgPoolForward3d( + adaptiveAvgPoolForward3d( input, output, N, C, D, H, W, OD, OH, OW, input_tv, output_tv); } template -__device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, - TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor_view_t<5> 
output_grad_tv, - tensor_view_t<5> input_grad_tv) +__device__ void adaptiveAvgPoolBackward3d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncdh = gid / W, w = gid % W; - int32_t ncd = ncdh / H, h = ncdh % H; - int32_t nc = ncd / D, d = ncd % D; - int32_t n = nc / C, c = nc % C; + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t ncdh = gid / W, w = gid % W; + size_t ncd = ncdh / H, h = ncdh % H; + size_t nc = ncd / D, d = ncd % D; + size_t n = nc / C, c = nc % C; if(n >= N) return; - int32_t od = (int32_t)floor((float)(d * OD) / D); - int32_t kod = (int32_t)ceil((float)((d + 1) * OD) / D) - od; + size_t od = static_cast(floor(static_cast(d * OD) / D)); + size_t kod = static_cast(ceil(static_cast((d + 1) * OD) / D)) - od; - int32_t oh = (int32_t)floor((float)(h * OH) / H); - int32_t koh = (int32_t)ceil((float)((h + 1) * OH) / H) - oh; + size_t oh = static_cast(floor(static_cast(h * OH) / H)); + size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; - int32_t ow = (int32_t)floor((float)(w * OW) / W); - int32_t kow = (int32_t)ceil((float)((w + 1) * OW) / W) - ow; + size_t ow = static_cast(floor(static_cast(w * OW) / W)); + size_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; - DTYPE_ACCURATE grad = 0; - for(int32_t id = od; id < (od + kod); ++id) + FLOAT_ACCUM grad = 0; + for(size_t id = od; id < (od + kod); ++id) { - int32_t kd = - (int32_t)ceil((float)((id + 1) * D) / OD) - (int32_t)floor((float)(id * D) / OD); - for(int32_t ih = oh; ih < (oh + koh); ++ih) + size_t kd = static_cast(ceil(static_cast((id + 1) * D) / OD)) - + static_cast(floor(static_cast(id * D) / OD)); + for(size_t ih = oh; ih < (oh + koh); ++ih) { - int32_t kh = - (int32_t)ceil((float)((ih + 1) * H) / OH) - 
(int32_t)floor((float)(ih * H) / OH); - for(int32_t iw = ow; iw < (ow + kow); ++iw) + size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); + for(size_t iw = ow; iw < (ow + kow); ++iw) { - int32_t kw = (int32_t)ceil((float)((iw + 1) * W) / OW) - - (int32_t)floor((float)(iw * W) / OW); - grad += GET_5D_VAL_AT(output_grad, n, c, id, ih, iw) / (kd * kh * kw); + size_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(floor(static_cast(iw * W) / OW)); + grad += CVT_FLOAT2ACCUM( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / + (kd * kh * kw); } } } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = - CVT_ACCUM2FLOAT(grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = CVT_ACCUM2FLOAT(grad); } -extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, - OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor_view_t<5> output_grad_tv, - tensor_view_t<5> input_grad_tv) +extern "C" __global__ void AdaptiveAvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) { - avgPoolBackward3d( + adaptiveAvgPoolBackward3d( output_grad, input_grad, N, C, D, H, W, OD, OH, OW, output_grad_tv, input_grad_tv); } diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d64dbf21f9..12394dbde6 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -27,6 +27,8 @@ #ifndef GUARD_TENSOR_VIEW_HPP #define GUARD_TENSOR_VIEW_HPP +#include + template struct tensor_layout_t; @@ -47,7 +49,6 @@ struct tensor_view_t uint64_t stride[N]; uint64_t size[N]; }; - template struct 
tensor_layout_t { @@ -72,44 +73,13 @@ struct tensor_layout_t } } - constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) - { - static_assert(N == 5); - layout[0] = n; - layout[1] = c; - layout[2] = d; - layout[3] = h; - layout[4] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w) - { - static_assert(N == 4); - layout[0] = n; - layout[1] = c; - layout[2] = h; - layout[3] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) - { - static_assert(N == 3); - layout[0] = n; - layout[1] = h; - layout[2] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t w) - { - static_assert(N == 2); - layout[0] = n; - layout[1] = w; - } - - constexpr tensor_layout_t(uint64_t n) + constexpr tensor_layout_t(std::initializer_list layout_) { - static_assert(N == 1); - layout[0] = n; + static_assert(N > 0); + for(auto i = 0; i < N; ++i) + { + layout[i] = layout_.begin()[i]; + } } uint64_t layout[N]; diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp index 1afb78de45..e97c9ec0a9 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_BWD_2D 256 +#define LOCAL_SIZE_BWD_1D 256 namespace miopen { @@ -43,29 +43,17 @@ namespace solver { namespace adaptiveavgpool { -bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd1d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { - auto dtype = problem.GetInputGradDesc().GetType(); - auto in_nelems = problem.GetInputGradDesc().GetElementSize(); - auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); - auto mul_nc = - problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; - auto in_over_out = static_cast(in_nelems) / out_nelems; - - 
if(dtype == miopenFloat) + if(!problem.IsAllContiguous()) { - return false; + return true; } - else if(dtype == miopenHalf) + else { - if(in_over_out < 2 && in_nelems >= 11075584) - { - return true; - } - } - else if(dtype == miopenBFloat16) - { - if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + auto mul_nc = problem.GetOutputGradDesc().GetLengths()[0] * + problem.GetOutputGradDesc().GetLengths()[1]; + if(mul_nc < 141312) { return true; } @@ -73,22 +61,22 @@ bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& probl return false; } -bool AdaptiveAvgPoolBackward2d::IsApplicable( +bool AdaptiveAvgPoolBackward1d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { - if(problem.GetInputGradDesc().GetNumDims() != 4 || - problem.GetOutputGradDesc().GetNumDims() != 4) + if(problem.GetInputGradDesc().GetNumDims() != 3 || + problem.GetOutputGradDesc().GetNumDims() != 3) { return false; } - if(!IsOverRocmBwd2d(problem)) + if(!IsOverRocmBwd1d(problem)) { return false; } return true; } -ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( +ConvSolution AdaptiveAvgPoolBackward1d::GetSolution( const ExecutionContext& context, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { @@ -108,10 +96,10 @@ ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_2D}, + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_1D}, {N_total}, "MIOpenAdaptiveAvgPool.cpp", - "AdaptiveAvgPoolBackward2d", + "AdaptiveAvgPoolBackward1d", build_params)); result.invoker_factory = [](const std::vector& kernels) { @@ -120,26 +108,16 @@ ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( decltype(auto) kernel = handle_.Run(kernels.front()); - auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); - auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); + auto input_grad_tv = get_inner_expanded_tv<3>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<3>(deref(params.outputGradDesc)); auto N = deref(params.inputGradDesc).GetLengths()[0]; auto C = deref(params.inputGradDesc).GetLengths()[1]; auto H = deref(params.inputGradDesc).GetLengths()[2]; - auto W = deref(params.inputGradDesc).GetLengths()[3]; auto OH = deref(params.outputGradDesc).GetLengths()[2]; - auto OW = deref(params.outputGradDesc).GetLengths()[3]; - - kernel(params.output_grad, - params.input_grad, - N, - C, - H, - W, - OH, - OW, - output_grad_tv, - input_grad_tv); + + kernel( + params.output_grad, params.input_grad, N, C, H, OH, output_grad_tv, input_grad_tv); }; }; diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp index 1afb78de45..dd8aeb9902 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp @@ -45,29 +45,33 @@ namespace adaptiveavgpool { bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { - auto dtype = problem.GetInputGradDesc().GetType(); - auto in_nelems = problem.GetInputGradDesc().GetElementSize(); - auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); - auto mul_nc = - 
problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; - auto in_over_out = static_cast(in_nelems) / out_nelems; - - if(dtype == miopenFloat) + if(problem.IsAllContiguous()) { return false; } - else if(dtype == miopenHalf) + else { - if(in_over_out < 2 && in_nelems >= 11075584) + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) { - return true; + if(in_nelems > 3801600) + return true; } - } - else if(dtype == miopenBFloat16) - { - if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + else if(dtype == miopenHalf) + { + if(in_over_out == 1 || (in_over_out >= 1024 && in_over_out <= 4096)) + return true; + } + else if(dtype == miopenBFloat16) { - return true; + if(in_over_out < 13 || (in_over_out >= 1024 && in_over_out <= 4096)) + { + return true; + } } } return false; diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp index 85bb5747f3..3ad93574de 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_FWD_2D 256 +#define LOCAL_SIZE_FWD_1D 256 namespace miopen { @@ -43,34 +43,15 @@ namespace solver { namespace adaptiveavgpool { -bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd1d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { - auto dtype = problem.GetOutputDesc().GetType(); - auto in_nelems = problem.GetInputDesc().GetElementSize(); - auto out_nelems = problem.GetOutputDesc().GetElementSize(); - auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_nelems = 
problem.GetInputDesc().GetLengths()[-1]; + auto out_nelems = problem.GetOutputDesc().GetLengths()[-1]; auto in_over_out = static_cast(in_nelems) / out_nelems; - if(dtype == miopenFloat) + if(in_over_out < 56) { - if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) - { - return true; - } - } - else if(dtype == miopenHalf) - { - if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) - { - return true; - } - } - else if(dtype == miopenBFloat16) - { - if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) - { - return true; - } + return true; } return false; } @@ -78,11 +59,11 @@ bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& probl bool AdaptiveAvgPoolForward1d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { - if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) + if(problem.GetInputDesc().GetNumDims() != 3 || problem.GetOutputDesc().GetNumDims() != 3) { return false; } - if(!IsOverRocmFwd2d(problem)) + if(!IsOverRocmFwd1d(problem)) { return false; } @@ -109,7 +90,7 @@ ConvSolution AdaptiveAvgPoolForward1d::GetSolution( {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_2D}, + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_1D}, {N_total}, "MIOpenAdaptiveAvgPool.cpp", "AdaptiveAvgPoolForward1d", @@ -121,17 +102,15 @@ ConvSolution AdaptiveAvgPoolForward1d::GetSolution( decltype(auto) kernel = handle_.Run(kernels.front()); - auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); - auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc)); size_t N = deref(params.inputDesc).GetLengths()[0]; size_t C = deref(params.inputDesc).GetLengths()[1]; size_t H = deref(params.inputDesc).GetLengths()[2]; - size_t W = deref(params.inputDesc).GetLengths()[3]; size_t OH = deref(params.outputDesc).GetLengths()[2]; - size_t OW = deref(params.outputDesc).GetLengths()[3]; - kernel(params.input, params.output, N, C, H, W, OH, OW, input_tv, output_tv); + kernel(params.input, params.output, N, C, H, OH, input_tv, output_tv); }; }; diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp index d1afc40842..92c120494e 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp @@ -28,7 +28,6 @@ #include "miopen/execution_context.hpp" #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" -#include #include #include @@ -46,32 +45,19 @@ namespace adaptiveavgpool { bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { - auto dtype = problem.GetOutputDesc().GetType(); - auto in_nelems = problem.GetInputDesc().GetElementSize(); - auto out_nelems = problem.GetOutputDesc().GetElementSize(); - auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + 
auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); auto in_over_out = static_cast(in_nelems) / out_nelems; - if(dtype == miopenFloat) + if(problem.IsAllContiguous()) { - if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) - { + if((in_over_out < 13) || (in_over_out >= 100 && in_over_out <= 112)) return true; - } } - else if(dtype == miopenHalf) + else { - if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) - { + if(in_over_out < 248) return true; - } - } - else if(dtype == miopenBFloat16) - { - if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) - { - return true; - } } return false; } diff --git a/test/cpu_adaptiveavgpool.hpp b/test/cpu_adaptiveavgpool.hpp new file mode 100644 index 0000000000..4b6dd99dda --- /dev/null +++ b/test/cpu_adaptiveavgpool.hpp @@ -0,0 +1,311 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_CPU_AVGPOOL_HPP +#define GUARD_CPU_AVGPOOL_HPP + +#include "tensor_holder.hpp" +#include + +template +void cpu_adaptiveavgpool_forward_1d( + tensor input, tensor& output, size_t N, size_t C, size_t H, size_t OH) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<3>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<3>(output.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nc = gid / OH, oh = gid % OH; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + + float sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) + { + sum += static_cast(input[input_tv.get_tensor_view_idx({n, c, ih})]); + } + + output[output_tv.get_tensor_view_idx({n, c, oh})] = static_cast(sum / kh); + } +} + +template +void cpu_adaptiveavgpool_forward_2d(tensor input, + tensor& output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncoh = gid / OW, ow = gid % OW; + size_t nc = ncoh / OH, oh = ncoh % OH; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(std::ceil(static_cast((oh 
+ 1) * H) / OH)) - h; + + size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + + float divider = static_cast(kh * kw); + float sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) + { + for(size_t iw = w; iw < (w + kw); ++iw) + { + sum += static_cast(input[input_tv.get_tensor_view_idx({n, c, ih, iw})]); + } + } + + output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(sum / divider); + } +} + +template +void cpu_adaptiveavgpool_forward_3d(tensor input, + tensor& output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncodoh = gid / OW, ow = gid % OW; + size_t ncod = ncodoh / OH, oh = ncodoh % OH; + size_t nc = ncod / OD, od = ncod % OD; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t d = static_cast(std::floor(static_cast(od * D) / OD)); + size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + + size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + + float sum = 0; + for(size_t id = d; id < (d + kd); ++id) + { + for(size_t ih = h; ih < (h + kh); ++ih) + { + for(size_t iw = w; iw < (w + kw); ++iw) + { + sum += + static_cast(input[input_tv.get_tensor_view_idx({n, c, id, ih, iw})]); + } + } + } + + output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = + static_cast(sum / (kd * kh * kw)); + } +} + +template +void cpu_adaptiveavgpool_backward_1d( + tensor output_grad, tensor& input_grad, 
size_t N, size_t C, size_t H, size_t OH) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<3>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<3>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nc = gid / H, h = gid % H; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + float grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(std::floor(static_cast(ih * H) / OH)); + grad += + static_cast(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / + kh; + } + + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = static_cast(grad); + } +} + +template +void cpu_adaptiveavgpool_backward_2d(tensor output_grad, + tensor& input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nch = gid / W, w = gid % W; + size_t nc = nch / H, h = nch % H; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); + size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + + float grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + 
static_cast(std::floor(static_cast(ih * H) / OH)); + for(size_t iw = ow; iw < (ow + kow); ++iw) + { + size_t kw = static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(std::floor(static_cast(iw * W) / OW)); + grad += static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / + (kh * kw); + } + } + + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); + } +} + +template +void cpu_adaptiveavgpool_backward_3d(tensor output_grad, + tensor& input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncdh = gid / W, w = gid % W; + size_t ncd = ncdh / H, h = ncdh % H; + size_t nc = ncd / D, d = ncd % D; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t od = static_cast(std::floor(static_cast(d * OD) / D)); + size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); + size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + + float grad = 0; + for(size_t id = od; id < (od + kod); ++id) + { + size_t kd = static_cast(std::ceil(static_cast((id + 1) * D) / OD)) - + static_cast(std::floor(static_cast(id * D) / OD)); + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(std::floor(static_cast(ih * H) / OH)); + for(size_t iw = ow; iw < (ow + kow); ++iw) + { + size_t kw = + static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - + 
static_cast(std::floor(static_cast(iw * W) / OW)); + grad += + static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / + (kd * kh * kw); + } + } + } + + input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); + } +} + +#endif diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp deleted file mode 100644 index 5b91033633..0000000000 --- a/test/cpu_avgpool.hpp +++ /dev/null @@ -1,426 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_CPU_AVGPOOL_HPP -#define GUARD_CPU_AVGPOOL_HPP - -#include "tensor_holder.hpp" -#include - -template -void cpu_avgpool_forward_2d(tensor input, - tensor& output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = input.desc.GetLengths(); - auto numel = output.desc.GetElementSize(); - - auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); - auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); - - for(int32_t gid = 0; gid < numel; gid++) - { - int32_t ncoh = gid / OW, ow = gid % OW; - int32_t nc = ncoh / OH, oh = ncoh % OH; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; - - if(n >= N) - return; - - float m = 0; - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, h, w) - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - m += static_cast( - input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); - } - } - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - float val = m / 
divide_factor; - - output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = - static_cast(val); - } -} - -template -void cpu_avgpool_forward_3d(tensor input, - tensor& output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = input.desc.GetLengths(); - auto numel = output.desc.GetElementSize(); - - auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); - auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); - - for(int32_t gid = 0; gid < numel; gid++) - { - int32_t ncodoh = gid / OW, ow = gid % OW; - int32_t ncod = ncodoh / OH, oh = ncodoh % OH; - int32_t nc = ncod / OD, od = ncod % OD; - int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; - - if(n >= N) - return; - float sum = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, d, h, w) - int32_t d = od * sd - pd + kd; - if(d < 0 || d >= D) - continue; - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - sum += static_cast( - input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); - } - } - } - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = 
max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - float val = sum / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = - static_cast(val); - } -} - -template -void cpu_avgpool_backward_2d(tensor output_grad, - tensor& input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = input_grad.desc.GetLengths(); - auto numel = input_grad.desc.GetElementSize(); - - auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); - auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); - - for(size_t gid = 0; gid < numel; gid++) - { - int32_t nch = gid / W, w = gid % W; - int32_t nc = nch / H, h = nch % H; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; - - if(n >= N) - return; - - float grad = 0; - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - 
- int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - - grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<4>(n, c, oh, ow))]) / - divide_factor; - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = - static_cast(grad); - } -} - -template -void cpu_avgpool_backward_3d(tensor output_grad, - tensor& input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = input_grad.desc.GetLengths(); - auto numel = input_grad.desc.GetElementSize(); - - auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); - auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); - - for(size_t gid = 0; gid < numel; gid++) - { - int32_t ncdh = gid / W, w = gid % W; - int32_t ncd = ncdh / H, h = ncdh % H; - int32_t nc = ncd / D, d = ncd % D; - int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; - - if(n >= N) - return; - - float grad = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t odsd = d + pd - kd; - if(odsd % sd != 0) - continue; - int32_t od = odsd / sd; - if(od < 0 || od >= OD) - continue; - - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - 
int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<5>(n, c, od, oh, ow))]) / - divide_factor; - } - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = - static_cast(grad); - } -} - -#endif diff --git a/test/gtest/avgpool.cpp b/test/gtest/adaptiveavgpool.cpp similarity index 64% rename from test/gtest/avgpool.cpp rename to test/gtest/adaptiveavgpool.cpp index 3ab32be510..a548ada4cd 100644 --- a/test/gtest/avgpool.cpp +++ b/test/gtest/adaptiveavgpool.cpp @@ -23,13 +23,13 @@ * SOFTWARE. 
* *******************************************************************************/ -#include "avgpool.hpp" +#include "adaptiveavgpool.hpp" #include MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) -namespace avgpool { +namespace adaptiveavgpool { std::string GetFloatArg() { @@ -41,35 +41,35 @@ std::string GetFloatArg() return tmp; } -struct GPU_Avgpool_fwd_FP32 : AvgPoolTestFwd +struct GPU_AdaptiveAvgpool_fwd_FP32 : AdaptiveAvgPoolTestFwd { }; -struct GPU_Avgpool_fwd_FP16 : AvgPoolTestFwd +struct GPU_AdaptiveAvgpool_fwd_FP16 : AdaptiveAvgPoolTestFwd { }; -struct GPU_Avgpool_fwd_BFP16 : AvgPoolTestFwd +struct GPU_AdaptiveAvgpool_fwd_BFP16 : AdaptiveAvgPoolTestFwd { }; -struct GPU_Avgpool_bwd_FP32 : AvgPoolTestBwd +struct GPU_AdaptiveAvgpool_bwd_FP32 : AdaptiveAvgPoolTestBwd { }; -struct GPU_Avgpool_bwd_FP16 : AvgPoolTestBwd +struct GPU_AdaptiveAvgpool_bwd_FP16 : AdaptiveAvgPoolTestBwd { }; -struct GPU_Avgpool_bwd_BFP16 : AvgPoolTestBwd +struct GPU_AdaptiveAvgpool_bwd_BFP16 : AdaptiveAvgPoolTestBwd { }; -} // namespace avgpool -using namespace avgpool; +} // namespace adaptiveavgpool +using namespace adaptiveavgpool; // FORWARD TEST -TEST_P(GPU_Avgpool_fwd_FP32, AvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgpool_fwd_FP32, AdaptiveAvgPoolTestFwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -83,7 +83,7 @@ TEST_P(GPU_Avgpool_fwd_FP32, AvgPoolTestFwd) } }; -TEST_P(GPU_Avgpool_fwd_FP16, AvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgpool_fwd_FP16, AdaptiveAvgPoolTestFwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -97,7 +97,7 @@ TEST_P(GPU_Avgpool_fwd_FP16, AvgPoolTestFwd) } }; -TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgpool_fwd_BFP16, AdaptiveAvgPoolTestFwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -112,17 
+112,17 @@ TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) }; INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_fwd_FP32, - testing::ValuesIn(AvgPoolTestConfigsFwdFp32())); + GPU_AdaptiveAvgpool_fwd_FP32, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp32())); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_fwd_FP16, - testing::ValuesIn(AvgPoolTestConfigsFwdFp16())); + GPU_AdaptiveAvgpool_fwd_FP16, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp16())); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_fwd_BFP16, - testing::ValuesIn(AvgPoolTestConfigsFwdBfp16())); + GPU_AdaptiveAvgpool_fwd_BFP16, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdBfp16())); // BACKWARD TEST -TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgpool_bwd_FP32, AdaptiveAvgPoolTestBwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -136,7 +136,7 @@ TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) } }; -TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgpool_bwd_FP16, AdaptiveAvgPoolTestBwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -150,7 +150,7 @@ TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) } }; -TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgpool_bwd_BFP16, AdaptiveAvgPoolTestBwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -165,11 +165,11 @@ TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) }; INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_bwd_FP32, - testing::ValuesIn(AvgPoolTestConfigsBwdFp32())); + GPU_AdaptiveAvgpool_bwd_FP32, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp32())); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_bwd_FP16, - testing::ValuesIn(AvgPoolTestConfigsBwdFp16())); + GPU_AdaptiveAvgpool_bwd_FP16, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp16())); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_bwd_BFP16, - 
testing::ValuesIn(AvgPoolTestConfigsBwdBfp16())); + GPU_AdaptiveAvgpool_bwd_BFP16, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdBfp16())); diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp new file mode 100644 index 0000000000..8c172e4494 --- /dev/null +++ b/test/gtest/adaptiveavgpool.hpp @@ -0,0 +1,380 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "cpu_adaptiveavgpool.hpp" +#include "get_handle.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include +#include +#include + +template +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct AdaptiveAvgPoolTestCase +{ + std::vector input_dims; + std::vector output_dims; + bool is_contiguous = true; + + friend std::ostream& operator<<(std::ostream& os, const AdaptiveAvgPoolTestCase& tc) + { + return os << " input_dims:" << tc.input_dims << " output_dims:" << tc.output_dims + << "is_contiguous:" << tc.is_contiguous; + } + + std::vector GetInput() const { return input_dims; } + std::vector GetOutput() const { return output_dims; } + + std::vector ComputeStrides(std::vector inputDim) const + { + if(!is_contiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!is_contiguous) + std::swap(strides.front(), strides.back()); + return strides; + } +}; + +inline std::vector AdaptiveAvgPoolTestConfigsFwdFp32() +{ + return { + {{64, 768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 78, 17, 17}, {10, 10}, false}, + {{64, 78, 17, 17}, {10, 10}, true}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, false}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, true}, + }; +} + +inline std::vector AdaptiveAvgPoolTestConfigsFwdFp16() +{ + return { + {{64, 768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 78, 17, 17}, {10, 10}, false}, + {{64, 78, 17, 17}, {10, 10}, true}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, false}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, true}, + }; +} + +inline std::vector AdaptiveAvgPoolTestConfigsFwdBfp16() +{ + return { + {{64, 
768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 78, 17, 17}, {10, 10}, false}, + {{64, 78, 17, 17}, {10, 10}, true}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, false}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, true}, + }; +} + +inline std::vector AdaptiveAvgPoolTestConfigsBwdFp32() +{ + return { + {{64, 768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 78, 17, 17}, {10, 10}, false}, + {{64, 78, 17, 17}, {10, 10}, true}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, false}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, true}, + }; +} + +inline std::vector AdaptiveAvgPoolTestConfigsBwdFp16() +{ + return { + {{64, 768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 28, 35, 35}, {7, 7}, false}, + {{64, 28, 35, 35}, {7, 7}, true}, + {{6, 28, 35, 35, 35}, {10, 10, 10}, false}, + {{6, 28, 35, 35, 35}, {10, 10, 10}, true}, + }; +} + +inline std::vector AdaptiveAvgPoolTestConfigsBwdBfp16() +{ + return { + {{64, 768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 208, 9, 9}, {7, 7}, false}, + {{64, 208, 9, 9}, {7, 7}, true}, + {{6, 18, 12, 12, 12}, {5, 5, 5}, false}, + {{6, 18, 12, 12, 12}, {5, 5, 5}, true}, + }; +} + +// FORWARD TEST +template +struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + adaptiveavgpool_config = GetParam(); + auto in_dim = adaptiveavgpool_config.GetInput(); + auto in_strides = adaptiveavgpool_config.ComputeStrides(in_dim); + auto out_dim = adaptiveavgpool_config.GetOutput(); + N = in_dim[0]; + C = in_dim[1]; + std::vector out_dim_final = {N, C}; + if(in_dim.size() == 3) + { + D = 1; + H = in_dim[2]; + W = 1; + + OD = 1; + OH = out_dim[0]; + OW = 1; + out_dim_final.push_back(OH); + } + else if(in_dim.size() == 4) + { + D = 1; + H = in_dim[2]; + W = in_dim[3]; + + OD = 1; + OH = out_dim[0]; + OW = out_dim[1]; + out_dim_final.push_back(OH); + out_dim_final.push_back(OW); + } + else if(in_dim.size() == 5) + { + D = in_dim[2]; + H = in_dim[3]; + W = 
in_dim[4]; + + OD = out_dim[0]; + OH = out_dim[1]; + OW = out_dim[2]; + out_dim_final.push_back(OD); + out_dim_final.push_back(OH); + out_dim_final.push_back(OW); + } + + auto gen_input_value = [](auto...) { + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + input = tensor{in_dim, in_strides}.generate(gen_input_value); + + output = tensor{out_dim_final}; + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + ref_output = tensor{out_dim_final}; + std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); + + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + auto dims = input.desc.GetNumDims(); + if(dims == 3) + { + cpu_adaptiveavgpool_forward_1d(input, ref_output, N, C, H, OH); + } + else if(dims == 4) + { + cpu_adaptiveavgpool_forward_2d(input, ref_output, N, C, H, W, OH, OW); + } + else if(dims == 5) + { + cpu_adaptiveavgpool_forward_3d(input, ref_output, N, C, D, H, W, OD, OH, OW); + } + status = miopen::AdaptiveAvgPoolForward( + handle, input.desc, input_dev.get(), output.desc, output_dev.get()); + fflush(stdout); + ASSERT_EQ(status, miopenStatusSuccess); + + output.data = handle.Read(output_dev, output.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + + auto error = miopen::rms_range(ref_output, output); + + ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); + EXPECT_LT(error, threshold * 10); + } + AdaptiveAvgPoolTestCase adaptiveavgpool_config; + + tensor input; + tensor output; + tensor ref_output; + + size_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; +}; + +// BACKWARD TEST +template +struct AdaptiveAvgPoolTestBwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = 
get_handle(); + adaptiveavgpool_config = GetParam(); + auto in_grad_dim = adaptiveavgpool_config.GetInput(); + auto out_grad_dim = adaptiveavgpool_config.GetOutput(); + N = in_grad_dim[0]; + C = in_grad_dim[1]; + std::vector out_grad_dim_final = {N, C}; + + if(in_grad_dim.size() == 3) + { + D = 1; + H = in_grad_dim[2]; + W = 1; + + OD = 1; + OH = out_grad_dim[0]; + OW = 1; + out_grad_dim_final.push_back(OH); + } + else if(in_grad_dim.size() == 4) + { + D = 1; + H = in_grad_dim[2]; + W = in_grad_dim[3]; + + OD = 1; + OH = out_grad_dim[0]; + OW = out_grad_dim[1]; + out_grad_dim_final.push_back(OH); + out_grad_dim_final.push_back(OW); + } + else if(in_grad_dim.size() == 5) + { + D = in_grad_dim[2]; + H = in_grad_dim[3]; + W = in_grad_dim[4]; + + OD = out_grad_dim[0]; + OH = out_grad_dim[1]; + OW = out_grad_dim[2]; + out_grad_dim_final.push_back(OD); + out_grad_dim_final.push_back(OH); + out_grad_dim_final.push_back(OW); + } + auto out_grad_strides = adaptiveavgpool_config.ComputeStrides(out_grad_dim_final); + + auto gen_output_grad_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + output_grad = + tensor{out_grad_dim_final, out_grad_strides}.generate(gen_output_grad_value); + + input_grad = tensor{in_grad_dim}; + std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits::quiet_NaN()); + + ref_input_grad = tensor{in_grad_dim}; + std::fill( + ref_input_grad.begin(), ref_input_grad.end(), std::numeric_limits::quiet_NaN()); + + output_grad_dev = handle.Write(output_grad.data); + input_grad_dev = handle.Write(input_grad.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + auto dims = input_grad.desc.GetNumDims(); + if(dims == 3) + { + cpu_adaptiveavgpool_backward_1d(output_grad, ref_input_grad, N, C, H, OH); + } + else if(dims == 4) + { + cpu_adaptiveavgpool_backward_2d(output_grad, ref_input_grad, N, C, H, W, OH, OW); + } + else if(dims == 5) + { + cpu_adaptiveavgpool_backward_3d( + output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); + } + status = miopen::AdaptiveAvgPoolBackward( + handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get()); + + ASSERT_EQ(status, miopenStatusSuccess); + + input_grad.data = handle.Read(input_grad_dev, input_grad.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + auto error = miopen::rms_range(ref_input_grad, input_grad); + ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); + EXPECT_LT(error, threshold * 10); + } + AdaptiveAvgPoolTestCase adaptiveavgpool_config; + + tensor output_grad; + tensor input_grad; + tensor ref_input_grad; + + size_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr output_grad_dev; + miopen::Allocator::ManageDataPtr input_grad_dev; +}; diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp deleted file mode 100644 index 94898d32b6..0000000000 --- a/test/gtest/avgpool.hpp +++ /dev/null @@ -1,451 +0,0 @@ 
-/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#include "../driver/tensor_driver.hpp" -#include "cpu_avgpool.hpp" -#include "get_handle.hpp" -#include "random.hpp" -#include "tensor_holder.hpp" -#include "verify.hpp" -#include -#include -#include -#include -#include - -template -inline std::ostream& operator<<(std::ostream& os, const std::vector& v) -{ - os << '{'; - for(int i = 0; i < v.size(); ++i) - { - if(i != 0) - os << ','; - os << v[i]; - } - os << '}'; - return os; -} - -struct AvgPoolTestCase -{ - std::vector input_dims; - std::vector kernel_size; - std::vector stride; - std::vector padding; - bool ceil_mode; - bool count_include_pad; - int32_t divisor_override; - - friend std::ostream& operator<<(std::ostream& os, const AvgPoolTestCase& tc) - { - return os << " input_dims:" << tc.input_dims << " kernel_size:" << tc.kernel_size - << " stride:" << tc.stride << " padding:" << tc.padding - << " ceil_mode:" << tc.ceil_mode << " count_include_pad:" << tc.count_include_pad - << " divisor_override:" << tc.divisor_override; - } - - std::vector GetInput() const { return input_dims; } -}; - -inline std::vector AvgPoolTestConfigsFwdFp32() -{ - return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - }; -} - -inline std::vector AvgPoolTestConfigsFwdFp16() -{ - return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - }; -} - -inline std::vector AvgPoolTestConfigsFwdBfp16() -{ - return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - }; -} - -inline std::vector AvgPoolTestConfigsBwdFp32() -{ - return { - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - }; -} - -inline std::vector AvgPoolTestConfigsBwdFp16() -{ - 
return { - {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 288, 35, 35, 35}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0}, - }; -} - -inline std::vector AvgPoolTestConfigsBwdBfp16() -{ - return { - {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - }; -} - -// FORWARD TEST -template -struct AvgPoolTestFwd : public ::testing::TestWithParam -{ -protected: - void SetUp() override - { - auto&& handle = get_handle(); - avgpool_config = GetParam(); - auto in_dim = avgpool_config.GetInput(); - N = in_dim[0]; - C = in_dim[1]; - D = in_dim.size() == 5 ? in_dim[2] : 1; - H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; - W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; - ksize = tensor{in_dim.size() - 2}; - ksize.data = avgpool_config.kernel_size; - stride = tensor{in_dim.size() - 2}; - stride.data = avgpool_config.stride; - padding = tensor{in_dim.size() - 2}; - padding.data = avgpool_config.padding; - ceil_mode = avgpool_config.ceil_mode; - count_include_pad = avgpool_config.count_include_pad; - divisor_override = avgpool_config.divisor_override; - - auto gen_input_value = [](auto...) 
{ - return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); - }; - input = tensor{in_dim}.generate(gen_input_value); - - std::vector out_dim; - if(in_dim.size() == 5) - { - if(ceil_mode) - { - OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - else - { - OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - out_dim = {N, C, OD, OH, OW}; - } - else - { - if(ceil_mode) - { - OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - else - { - OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - out_dim = {N, C, OH, OW}; - } - - output = tensor{out_dim}; - std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); - - ref_output = tensor{out_dim}; - std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); - - input_dev = handle.Write(input.data); - output_dev = handle.Write(output.data); - ksize_dev = handle.Write(ksize.data); - stride_dev = handle.Write(stride.data); - padding_dev = handle.Write(padding.data); - } - - void RunTest() - { - auto&& handle = get_handle(); - miopenStatus_t status; - - auto dims = input.desc.GetNumDims(); - if(dims == 4) - { - cpu_avgpool_forward_2d(input, - ref_output, - N, - C, - H, - W, - OH, - OW, - ksize, - stride, - padding, - count_include_pad, - divisor_override); - } - else if(dims == 5) - { - cpu_avgpool_forward_3d(input, - ref_output, - N, - C, - D, - H, - W, - OD, - OH, - OW, - ksize, - stride, - 
padding, - count_include_pad, - divisor_override); - } - status = miopen::AvgPoolForward(handle, - input.desc, - input_dev.get(), - output.desc, - output_dev.get(), - ksize.GetSize() == 3 ? ksize[0] : 0, - ksize.GetSize() == 3 ? ksize[1] : ksize[0], - ksize.GetSize() == 3 ? ksize[2] : ksize[1], - stride.GetSize() == 3 ? stride[0] : 0, - stride.GetSize() == 3 ? stride[1] : stride[0], - stride.GetSize() == 3 ? stride[2] : stride[1], - padding.GetSize() == 3 ? padding[0] : 0, - padding.GetSize() == 3 ? padding[1] : padding[0], - padding.GetSize() == 3 ? padding[2] : padding[1], - count_include_pad, - divisor_override); - fflush(stdout); - ASSERT_EQ(status, miopenStatusSuccess); - - output.data = handle.Read(output_dev, output.data.size()); - } - - void Verify() - { - double threshold = std::numeric_limits::epsilon(); - - auto error = miopen::rms_range(ref_output, output); - - ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); - EXPECT_LT(error, threshold * 10); - } - AvgPoolTestCase avgpool_config; - - tensor input; - tensor output; - tensor ref_output; - tensor ksize; - tensor stride; - tensor padding; - - bool ceil_mode; - bool count_include_pad; - int32_t divisor_override; - int32_t N, C, D, H, W, OD, OH, OW; - - miopen::Allocator::ManageDataPtr input_dev; - miopen::Allocator::ManageDataPtr output_dev; - miopen::Allocator::ManageDataPtr ksize_dev; - miopen::Allocator::ManageDataPtr stride_dev; - miopen::Allocator::ManageDataPtr padding_dev; -}; - -// BACKWARD TEST -template -struct AvgPoolTestBwd : public ::testing::TestWithParam -{ -protected: - void SetUp() override - { - auto&& handle = get_handle(); - avgpool_config = GetParam(); - auto in_grad_dim = avgpool_config.GetInput(); - N = in_grad_dim[0]; - C = in_grad_dim[1]; - D = in_grad_dim.size() == 5 ? in_grad_dim[2] : 1; - H = in_grad_dim.size() == 5 ? in_grad_dim[3] : in_grad_dim[2]; - W = in_grad_dim.size() == 5 ? 
in_grad_dim[4] : in_grad_dim[3]; - ksize = tensor{in_grad_dim.size() - 2}; - ksize.data = avgpool_config.kernel_size; - stride = tensor{in_grad_dim.size() - 2}; - stride.data = avgpool_config.stride; - padding = tensor{in_grad_dim.size() - 2}; - padding.data = avgpool_config.padding; - ceil_mode = avgpool_config.ceil_mode; - count_include_pad = avgpool_config.count_include_pad; - divisor_override = avgpool_config.divisor_override; - - std::vector out_grad_dim; - if(in_grad_dim.size() == 5) - { - if(ceil_mode) - { - OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - else - { - OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - out_grad_dim = {N, C, OD, OH, OW}; - } - else - { - if(ceil_mode) - { - OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - else - { - OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - out_grad_dim = {N, C, OH, OW}; - } - auto gen_output_grad_value = [](auto...) 
{ - return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); - }; - output_grad = tensor{out_grad_dim}.generate(gen_output_grad_value); - - input_grad = tensor{in_grad_dim}; - std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits::quiet_NaN()); - - ref_input_grad = tensor{in_grad_dim}; - std::fill( - ref_input_grad.begin(), ref_input_grad.end(), std::numeric_limits::quiet_NaN()); - - output_grad_dev = handle.Write(output_grad.data); - input_grad_dev = handle.Write(input_grad.data); - ksize_dev = handle.Write(ksize.data); - stride_dev = handle.Write(stride.data); - padding_dev = handle.Write(padding.data); - } - - void RunTest() - { - auto&& handle = get_handle(); - - miopenStatus_t status; - - auto dims = input_grad.desc.GetNumDims(); - if(dims == 4) - { - cpu_avgpool_backward_2d(output_grad, - ref_input_grad, - N, - C, - H, - W, - OH, - OW, - ksize, - stride, - padding, - count_include_pad, - divisor_override); - } - else if(dims == 5) - { - cpu_avgpool_backward_3d(output_grad, - ref_input_grad, - N, - C, - D, - H, - W, - OD, - OH, - OW, - ksize, - stride, - padding, - count_include_pad, - divisor_override); - } - status = miopen::AvgPoolBackward(handle, - output_grad.desc, - output_grad_dev.get(), - input_grad.desc, - input_grad_dev.get(), - ksize.GetSize() == 3 ? ksize[0] : 0, - ksize.GetSize() == 3 ? ksize[1] : ksize[0], - ksize.GetSize() == 3 ? ksize[2] : ksize[1], - stride.GetSize() == 3 ? stride[0] : 0, - stride.GetSize() == 3 ? stride[1] : stride[0], - stride.GetSize() == 3 ? stride[2] : stride[1], - padding.GetSize() == 3 ? padding[0] : 0, - padding.GetSize() == 3 ? padding[1] : padding[0], - padding.GetSize() == 3 ? 
padding[2] : padding[1], - count_include_pad, - divisor_override); - - ASSERT_EQ(status, miopenStatusSuccess); - - input_grad.data = handle.Read(input_grad_dev, input_grad.data.size()); - } - - void Verify() - { - double threshold = std::numeric_limits::epsilon(); - auto error = miopen::rms_range(ref_input_grad, input_grad); - ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); - EXPECT_LT(error, threshold * 10); - } - AvgPoolTestCase avgpool_config; - - tensor output_grad; - tensor input_grad; - tensor ref_input_grad; - tensor ksize; - tensor stride; - tensor padding; - - bool ceil_mode; - bool count_include_pad; - int32_t divisor_override; - int32_t N, C, D, H, W, OD, OH, OW; - - miopen::Allocator::ManageDataPtr output_grad_dev; - miopen::Allocator::ManageDataPtr input_grad_dev; - miopen::Allocator::ManageDataPtr ksize_dev; - miopen::Allocator::ManageDataPtr stride_dev; - miopen::Allocator::ManageDataPtr padding_dev; -}; From 7d3a0a69e7dd3a566939e3a0ad9daa086d460169 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 3 Oct 2024 17:04:52 +0700 Subject: [PATCH 12/38] add isOverRocm3d --- driver/adaptiveavgpool_driver.hpp | 26 ++++++++-------- .../miopen/adaptiveavgpool/solvers.hpp | 1 + .../backward_adaptiveavgpool_3d.cpp | 31 +++---------------- .../forward_adaptiveavgpool_3d.cpp | 30 ++++-------------- test/gtest/adaptiveavgpool.hpp | 13 +++----- 5 files changed, 30 insertions(+), 71 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index fd86cf9eec..1a9b1b6242 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -56,12 +56,12 @@ class AdaptiveAvgPoolDriver : public Driver data_type = miopen_type{}; } - std::vector ComputeStrides(std::vector input); + std::vector ComputeStrides(std::vector input); int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; InputFlags& GetInputFlags() override { return inflags; } - 
std::vector GetInputTensorDimsFromCmd(const char* param); + std::vector GetInputTensorDimsFromCmd(const char* param); int GetandSetData() override; int AllocateBuffersAndCopy() override; @@ -107,8 +107,8 @@ class AdaptiveAvgPoolDriver : public Driver size_t N = 1, C = 1, D = 1, H = 1, W = 1, OD = 1, OH = 1, OW = 1; - std::vector in_dim; - std::vector out_dim; + std::vector in_dim; + std::vector out_dim; bool isContiguous; }; @@ -126,11 +126,11 @@ int AdaptiveAvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) } template -std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) +std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) { std::string lengthsStr = inflags.GetValueStr(param); - std::vector lengths; + std::vector lengths; std::size_t pos = 0; std::size_t new_pos; @@ -150,7 +150,7 @@ std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(co std::string sliceStr = lengthsStr.substr(pos); int len = std::stoi(sliceStr); - lengths.push_back(len); + lengths.push_back(static_cast(len)); return (lengths); } @@ -158,9 +158,9 @@ std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(co template int AdaptiveAvgPoolDriver::GetandSetData() { - in_dim = GetInputTensorDimsFromCmd("input_dims"); - std::vector in_stride = ComputeStrides(in_dim); - out_dim = GetInputTensorDimsFromCmd("output_dims"); + in_dim = GetInputTensorDimsFromCmd("input_dims"); + std::vector in_stride = ComputeStrides(in_dim); + out_dim = GetInputTensorDimsFromCmd("output_dims"); if(in_dim.size() != out_dim.size() + 2) { MIOPEN_THROW(miopenStatusBadParm, @@ -199,7 +199,7 @@ int AdaptiveAvgPoolDriver::GetandSetData() out_dim_final.push_back(OH); out_dim_final.push_back(OW); } - std::vector out_grad_stride = ComputeStrides(out_dim_final); + std::vector out_grad_stride = ComputeStrides(out_dim_final); SetTensorNd(inputDesc, in_dim, in_stride, data_type); SetTensorNd(outputDesc, out_dim_final, data_type); SetTensorNd(outputGradDesc, 
out_dim_final, out_grad_stride, data_type); @@ -210,11 +210,11 @@ int AdaptiveAvgPoolDriver::GetandSetData() // Equivalent to: tensor.tranpose(0, -1).contiguous().tranpose(0, -1) incase contiguous = False template -std::vector AdaptiveAvgPoolDriver::ComputeStrides(std::vector inputDim) +std::vector AdaptiveAvgPoolDriver::ComputeStrides(std::vector inputDim) { if(!isContiguous) std::swap(inputDim.front(), inputDim.back()); - std::vector strides(inputDim.size()); + std::vector strides(inputDim.size()); strides.back() = 1; for(int i = inputDim.size() - 2; i >= 0; --i) strides[i] = strides[i + 1] * inputDim[i + 1]; diff --git a/src/include/miopen/adaptiveavgpool/solvers.hpp b/src/include/miopen/adaptiveavgpool/solvers.hpp index 25f08f3345..ce2419527a 100644 --- a/src/include/miopen/adaptiveavgpool/solvers.hpp +++ b/src/include/miopen/adaptiveavgpool/solvers.hpp @@ -32,6 +32,7 @@ #include #include "miopen/kernel_build_params.hpp" #include "miopen/kernel_info.hpp" +#include "miopen/mlo_internal.hpp" namespace miopen { diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp index 51d815e281..b45d024c0b 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp @@ -45,36 +45,15 @@ namespace adaptiveavgpool { bool IsOverRocmBwd3d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { - auto dtype = problem.GetInputGradDesc().GetType(); - auto in_nelems = problem.GetInputGradDesc().GetElementSize(); - auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); - auto mul_nc = - problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; - auto N = problem.GetOutputGradDesc().GetLengths()[0]; - auto in_over_out = static_cast(in_nelems) / out_nelems; - - if(dtype == miopenFloat) + if(!problem.IsAllContiguous()) { - if((in_over_out < 2 && out_nelems <= 12582912) || (in_over_out <= 
8 && N >= 6)) - { - return true; - } - return false; - } - else if(dtype == miopenHalf) - { - if((in_over_out < 2 && mul_nc < 8192) || (8 > in_over_out && out_nelems >= 29052108)) - { - return true; - } + return true; } - else if(dtype == miopenBFloat16) + else { - if((1 <= in_over_out && in_over_out < 2 && in_nelems >= 4194304) || - (in_over_out <= 8 && in_nelems >= 944111616)) - { + if((problem.GetInputGradDesc().GetElementSize() / + problem.GetOutputGradDesc().GetElementSize()) == 1) return true; - } } return false; } diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp index cf9bf5a9b9..481805cfa4 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp @@ -45,37 +45,19 @@ namespace adaptiveavgpool { bool IsOverRocmFwd3d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { - auto dtype = problem.GetOutputDesc().GetType(); - auto in_nelems = problem.GetInputDesc().GetElementSize(); - auto out_nelems = problem.GetOutputDesc().GetElementSize(); - auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; - auto N = problem.GetOutputDesc().GetLengths()[0]; + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); auto in_over_out = static_cast(in_nelems) / out_nelems; - std::cout << "in_over_out: " << in_over_out << std::endl; - std::cout << "in_nelems: " << in_nelems << std::endl; - std::cout << "out_nelems: " << out_nelems << std::endl; - - if(dtype == miopenFloat) - { - if(in_over_out < 2 || in_over_out >= 262144 || (out_nelems >= 10125000 && N > 4)) - { - return true; - } - } - else if(dtype == miopenHalf) + if(problem.IsAllContiguous()) { - if(in_nelems >= 201326592 || (in_over_out < 2 && mul_nc < 8192)) - { + if(in_over_out <= 98) return true; - } } - else if(dtype == miopenBFloat16) + else { - 
if((out_nelems >= 5971968 && in_over_out < 2) || out_nelems >= 74088000) - { + if(in_over_out < 8000) return true; - } } return false; } diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index 8c172e4494..8e2213dbf8 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -117,10 +117,9 @@ inline std::vector AdaptiveAvgPoolTestConfigsBwdFp32() return { {{64, 768, 17}, {10}, false}, {{64, 768, 17}, {10}, true}, - {{64, 78, 17, 17}, {10, 10}, false}, - {{64, 78, 17, 17}, {10, 10}, true}, + {{64, 206, 17, 17}, {10, 10}, false}, {{6, 18, 18, 18, 18}, {5, 5, 5}, false}, - {{6, 18, 18, 18, 18}, {5, 5, 5}, true}, + {{6, 18, 18, 18, 18}, {18, 18, 18}, true}, }; } @@ -129,10 +128,9 @@ inline std::vector AdaptiveAvgPoolTestConfigsBwdFp16() return { {{64, 768, 17}, {10}, false}, {{64, 768, 17}, {10}, true}, - {{64, 28, 35, 35}, {7, 7}, false}, - {{64, 28, 35, 35}, {7, 7}, true}, + {{64, 28, 35, 35}, {35, 35}, false}, {{6, 28, 35, 35, 35}, {10, 10, 10}, false}, - {{6, 28, 35, 35, 35}, {10, 10, 10}, true}, + {{6, 28, 35, 35, 35}, {35, 35, 35}, true}, }; } @@ -142,9 +140,8 @@ inline std::vector AdaptiveAvgPoolTestConfigsBwdBfp16() {{64, 768, 17}, {10}, false}, {{64, 768, 17}, {10}, true}, {{64, 208, 9, 9}, {7, 7}, false}, - {{64, 208, 9, 9}, {7, 7}, true}, {{6, 18, 12, 12, 12}, {5, 5, 5}, false}, - {{6, 18, 12, 12, 12}, {5, 5, 5}, true}, + {{6, 18, 12, 12, 12}, {12, 12, 12}, true}, }; } From dfbb6c71b58e1cbedb789238c9568f8f4afde770 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 3 Oct 2024 17:10:52 +0700 Subject: [PATCH 13/38] add const Tgpu --- driver/mloAdaptiveAvgPoolHost.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 1c45f16213..8bd435f415 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -33,7 +33,7 @@ template int32_t mloAdaptiveAvgPoolForward1dRunHost(const 
miopenTensorDescriptor_t inputDesc, const miopenTensorDescriptor_t outputDesc, - Tgpu* input, + const Tgpu* input, Tcheck* output, size_t N, size_t C, @@ -71,7 +71,7 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD template int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, const miopenTensorDescriptor_t outputDesc, - Tgpu* input, + const Tgpu* input, Tcheck* output, size_t N, size_t C, @@ -119,7 +119,7 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD template int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, const miopenTensorDescriptor_t outputDesc, - Tgpu* input, + const Tgpu* input, Tcheck* output, size_t N, size_t C, @@ -177,7 +177,7 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD template int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outputGradDesc, const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, + const Tgpu* output_grad, Tcheck* input_grad, size_t N, size_t C, @@ -218,7 +218,7 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu template int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, + const Tgpu* output_grad, Tcheck* input_grad, size_t N, size_t C, @@ -271,7 +271,7 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu template int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, + const Tgpu* output_grad, Tcheck* input_grad, size_t N, size_t C, From 88179b049d2840c3a084a09f3ce96c8a2a0aed0f Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 13:32:44 +0700 Subject: [PATCH 14/38] add resolve comment Rocm PR --- src/adaptiveavgpool.cpp | 4 ++++ 
src/adaptiveavgpool_api.cpp | 20 +++++++++---------- src/include/miopen/adaptiveavgpool.hpp | 5 +++++ .../backward_adaptiveavgpool_1d.cpp | 4 ++++ .../backward_adaptiveavgpool_2d.cpp | 4 ++++ .../backward_adaptiveavgpool_3d.cpp | 4 ++++ .../forward_adaptiveavgpool_1d.cpp | 4 ++++ .../forward_adaptiveavgpool_2d.cpp | 4 ++++ .../forward_adaptiveavgpool_3d.cpp | 4 ++++ test/gtest/adaptiveavgpool.hpp | 4 ++-- 10 files changed, 45 insertions(+), 12 deletions(-) diff --git a/src/adaptiveavgpool.cpp b/src/adaptiveavgpool.cpp index fee382a4d1..f5ff954740 100644 --- a/src/adaptiveavgpool.cpp +++ b/src/adaptiveavgpool.cpp @@ -33,6 +33,8 @@ namespace miopen { +namespace adaptiveavgpool { + miopenStatus_t AdaptiveAvgPoolForward(Handle& handle, const TensorDescriptor& inputDesc, ConstData_t input, @@ -91,4 +93,6 @@ miopenStatus_t AdaptiveAvgPoolBackward(Handle& handle, return miopenStatusSuccess; } +} // namespace adaptiveavgpool + } // namespace miopen diff --git a/src/adaptiveavgpool_api.cpp b/src/adaptiveavgpool_api.cpp index a9159258f9..c183386a6a 100644 --- a/src/adaptiveavgpool_api.cpp +++ b/src/adaptiveavgpool_api.cpp @@ -85,11 +85,11 @@ extern "C" miopenStatus_t miopenAdaptiveAvgPoolForward(miopenHandle_t handle, LogCmdAdaptiveAvgPool(inputDesc, outputDesc, true); return miopen::try_([&] { - miopen::AdaptiveAvgPoolForward(miopen::deref(handle), - miopen::deref(inputDesc), - DataCast(input), - miopen::deref(outputDesc), - DataCast(output)); + miopen::adaptiveavgpool::AdaptiveAvgPoolForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output)); }); } @@ -104,10 +104,10 @@ miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, LogCmdAdaptiveAvgPool(inputGradDesc, outputGradDesc, false); return miopen::try_([&] { - miopen::AdaptiveAvgPoolBackward(miopen::deref(handle), - miopen::deref(outputGradDesc), - DataCast(output_grad), - miopen::deref(inputGradDesc), - DataCast(input_grad)); + 
miopen::adaptiveavgpool::AdaptiveAvgPoolBackward(miopen::deref(handle), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(inputGradDesc), + DataCast(input_grad)); }); } diff --git a/src/include/miopen/adaptiveavgpool.hpp b/src/include/miopen/adaptiveavgpool.hpp index 9f38a62d94..9902befb99 100644 --- a/src/include/miopen/adaptiveavgpool.hpp +++ b/src/include/miopen/adaptiveavgpool.hpp @@ -34,6 +34,8 @@ namespace miopen { struct Handle; struct TensorDescriptor; +namespace adaptiveavgpool { + MIOPEN_INTERNALS_EXPORT miopenStatus_t AdaptiveAvgPoolForward(Handle& handle, const TensorDescriptor& inputDesc, ConstData_t input, @@ -46,5 +48,8 @@ AdaptiveAvgPoolBackward(Handle& handle, ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad); + +} // namespace adaptiveavgpool + } // namespace miopen #endif // _MIOPEN_ADAPTIVEAVGPOOL_HPP_ diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp index e97c9ec0a9..19dfa7d5f9 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp @@ -73,6 +73,10 @@ bool AdaptiveAvgPoolBackward1d::IsApplicable( { return false; } + if(!(problem.GetInputGradDesc().GetType() == miopenFloat || + problem.GetInputGradDesc().GetType() == miopenHalf || + problem.GetInputGradDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp index dd8aeb9902..bc813dd7bf 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp @@ -89,6 +89,10 @@ bool AdaptiveAvgPoolBackward2d::IsApplicable( { return false; } + if(!(problem.GetInputGradDesc().GetType() == miopenFloat || + problem.GetInputGradDesc().GetType() == miopenHalf || + 
problem.GetInputGradDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp index b45d024c0b..d2073f4304 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp @@ -70,6 +70,10 @@ bool AdaptiveAvgPoolBackward3d::IsApplicable( { return false; } + if(!(problem.GetInputGradDesc().GetType() == miopenFloat || + problem.GetInputGradDesc().GetType() == miopenHalf || + problem.GetInputGradDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp index 3ad93574de..1dc63c5858 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp @@ -67,6 +67,10 @@ bool AdaptiveAvgPoolForward1d::IsApplicable( { return false; } + if(!(problem.GetInputDesc().GetType() == miopenFloat || + problem.GetInputDesc().GetType() == miopenHalf || + problem.GetInputDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp index 92c120494e..623485634a 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp @@ -73,6 +73,10 @@ bool AdaptiveAvgPoolForward2d::IsApplicable( { return false; } + if(!(problem.GetInputDesc().GetType() == miopenFloat || + problem.GetInputDesc().GetType() == miopenHalf || + problem.GetInputDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp index 481805cfa4..b4081849eb 100644 --- 
a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp @@ -73,6 +73,10 @@ bool AdaptiveAvgPoolForward3d::IsApplicable( { return false; } + if(!(problem.GetInputDesc().GetType() == miopenFloat || + problem.GetInputDesc().GetType() == miopenHalf || + problem.GetInputDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index 8e2213dbf8..7f01813331 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -230,7 +230,7 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam(input, ref_output, N, C, D, H, W, OD, OH, OW); } - status = miopen::AdaptiveAvgPoolForward( + status = miopen::adaptiveavgpool::AdaptiveAvgPoolForward( handle, input.desc, input_dev.get(), output.desc, output_dev.get()); fflush(stdout); ASSERT_EQ(status, miopenStatusSuccess); @@ -349,7 +349,7 @@ struct AdaptiveAvgPoolTestBwd : public ::testing::TestWithParam( output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); } - status = miopen::AdaptiveAvgPoolBackward( + status = miopen::adaptiveavgpool::AdaptiveAvgPoolBackward( handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get()); ASSERT_EQ(status, miopenStatusSuccess); From d2b2d1f3c5353d3882b0c8f0feba89524b79220c Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 13:38:28 +0700 Subject: [PATCH 15/38] add issametype --- .../adaptiveavgpool/problem_description.hpp | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/include/miopen/adaptiveavgpool/problem_description.hpp b/src/include/miopen/adaptiveavgpool/problem_description.hpp index adec5759e7..2fda5f111b 100644 --- a/src/include/miopen/adaptiveavgpool/problem_description.hpp +++ b/src/include/miopen/adaptiveavgpool/problem_description.hpp @@ -43,6 +43,7 @@ struct FwdProblemDescription : ProblemDescriptionBase { IsValidLength(); IsValidDims(); 
+ IsSameType(); } auto GetInputDesc() const { return inputDesc; } @@ -122,6 +123,17 @@ struct FwdProblemDescription : ProblemDescriptionBase return isContiguous(inputDesc) && isContiguous(outputDesc); } + bool IsSameType() const + { + if(inputDesc.GetType() != outputDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input and output tensor types do not match."); + } + + return true; + } + NetworkConfig MakeNetworkConfig() const override; protected: @@ -137,6 +149,7 @@ struct BwdProblemDescription : ProblemDescriptionBase { IsValidLength(); IsValidDims(); + IsSameType(); } auto GetOutputGradDesc() const { return outputGradDesc; } @@ -216,6 +229,17 @@ struct BwdProblemDescription : ProblemDescriptionBase return isContiguous(inputGradDesc) && isContiguous(outputGradDesc); } + bool IsSameType() const + { + if(inputGradDesc.GetType() != outputGradDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input grad and output grad tensor types do not match."); + } + + return true; + } + NetworkConfig MakeNetworkConfig() const override; protected: From 0dc61de31075ddebd4712e95f00869ff877aaec7 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 13:46:28 +0700 Subject: [PATCH 16/38] rm ENV gtest --- test/gtest/adaptiveavgpool.cpp | 125 ++++++--------------------------- 1 file changed, 22 insertions(+), 103 deletions(-) diff --git a/test/gtest/adaptiveavgpool.cpp b/test/gtest/adaptiveavgpool.cpp index a548ada4cd..e12e327500 100644 --- a/test/gtest/adaptiveavgpool.cpp +++ b/test/gtest/adaptiveavgpool.cpp @@ -24,91 +24,30 @@ * *******************************************************************************/ #include "adaptiveavgpool.hpp" -#include - -MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) -MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) - -namespace adaptiveavgpool { - -std::string GetFloatArg() -{ - const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); - if(tmp.empty()) - { - return ""; - } - return tmp; -} - -struct 
GPU_AdaptiveAvgpool_fwd_FP32 : AdaptiveAvgPoolTestFwd -{ -}; - -struct GPU_AdaptiveAvgpool_fwd_FP16 : AdaptiveAvgPoolTestFwd -{ -}; - -struct GPU_AdaptiveAvgpool_fwd_BFP16 : AdaptiveAvgPoolTestFwd -{ -}; - -struct GPU_AdaptiveAvgpool_bwd_FP32 : AdaptiveAvgPoolTestBwd -{ -}; - -struct GPU_AdaptiveAvgpool_bwd_FP16 : AdaptiveAvgPoolTestBwd -{ -}; - -struct GPU_AdaptiveAvgpool_bwd_BFP16 : AdaptiveAvgPoolTestBwd -{ -}; - -} // namespace adaptiveavgpool -using namespace adaptiveavgpool; +#include "gtest/gtest.h" +using float16 = half_float::half; // FORWARD TEST +using GPU_AdaptiveAvgpool_fwd_FP32 = AdaptiveAvgPoolTestFwd; +using GPU_AdaptiveAvgpool_fwd_FP16 = AdaptiveAvgPoolTestFwd; +using GPU_AdaptiveAvgpool_fwd_BFP16 = AdaptiveAvgPoolTestFwd; + TEST_P(GPU_AdaptiveAvgpool_fwd_FP32, AdaptiveAvgPoolTestFwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_AdaptiveAvgpool_fwd_FP16, AdaptiveAvgPoolTestFwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_AdaptiveAvgpool_fwd_BFP16, AdaptiveAvgPoolTestFwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Smoke, @@ -122,46 +61,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdBfp16())); // BACKWARD TEST +using GPU_AdaptiveAvgpool_bwd_FP32 = AdaptiveAvgPoolTestBwd; +using GPU_AdaptiveAvgpool_bwd_FP16 = AdaptiveAvgPoolTestBwd; +using GPU_AdaptiveAvgpool_bwd_BFP16 = AdaptiveAvgPoolTestBwd; + TEST_P(GPU_AdaptiveAvgpool_bwd_FP32, AdaptiveAvgPoolTestBwd) { - if(!MIOPEN_TEST_ALL || - 
(env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_AdaptiveAvgpool_bwd_FP16, AdaptiveAvgPoolTestBwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_AdaptiveAvgpool_bwd_BFP16, AdaptiveAvgPoolTestBwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Smoke, From 194ab403b47bf314044c90169a70e68956c33237 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 13:57:34 +0700 Subject: [PATCH 17/38] rm magic number in driver test --- driver/adaptiveavgpool_driver.hpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index 1a9b1b6242..3a74a0aef5 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -438,13 +438,7 @@ int AdaptiveAvgPoolDriver::RunBackwardCPU() template Tref AdaptiveAvgPoolDriver::GetTolerance() { - // Computation error of fp16 is ~2^13 (=8192) bigger than - // the one of fp32 because mantissa is shorter by 13 bits. - auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; - - // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
- if(std::is_same::value) - tolerance *= 8.0; + Tref tolerance = std::numeric_limits::epsilon() * 10; return tolerance; } From 36b0662ebb17441223b77cbcbfafe4b02ff5b513 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 16:25:10 +0700 Subject: [PATCH 18/38] change to AddTensorFlag --- driver/adaptiveavgpool_driver.hpp | 53 ++++++------------------------- 1 file changed, 10 insertions(+), 43 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index 3a74a0aef5..afe2161720 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -61,7 +61,6 @@ class AdaptiveAvgPoolDriver : public Driver int ParseCmdLineArgs(int argc, char* argv[]) override; InputFlags& GetInputFlags() override { return inflags; } - std::vector GetInputTensorDimsFromCmd(const char* param); int GetandSetData() override; int AllocateBuffersAndCopy() override; @@ -107,8 +106,8 @@ class AdaptiveAvgPoolDriver : public Driver size_t N = 1, C = 1, D = 1, H = 1, W = 1, OD = 1, OH = 1, OW = 1; - std::vector in_dim; - std::vector out_dim; + std::vector in_dim; + std::vector out_dim; bool isContiguous; }; @@ -125,42 +124,12 @@ int AdaptiveAvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) return miopenStatusSuccess; } -template -std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) -{ - std::string lengthsStr = inflags.GetValueStr(param); - - std::vector lengths; - std::size_t pos = 0; - std::size_t new_pos; - - new_pos = lengthsStr.find(',', pos); - while(new_pos != std::string::npos) - { - std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); - - int len = std::stoi(sliceStr); - - lengths.push_back(len); - - pos = new_pos + 1; - new_pos = lengthsStr.find(',', pos); - }; - - std::string sliceStr = lengthsStr.substr(pos); - int len = std::stoi(sliceStr); - - lengths.push_back(static_cast(len)); - - return (lengths); -} - template int AdaptiveAvgPoolDriver::GetandSetData() { - in_dim = 
GetInputTensorDimsFromCmd("input_dims"); + in_dim = inflags.GetValueTensor("input_dims").lengths; std::vector in_stride = ComputeStrides(in_dim); - out_dim = GetInputTensorDimsFromCmd("output_dims"); + out_dim = inflags.GetValueTensor("output_dims").lengths; if(in_dim.size() != out_dim.size() + 2) { MIOPEN_THROW(miopenStatusBadParm, @@ -227,18 +196,16 @@ template int AdaptiveAvgPoolDriver::AddCmdLineArgs() { inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AdaptiveAvgPool (Default=1)", "int"); - inflags.AddInputFlag( + inflags.AddTensorFlag( "input_dims", 'D', - "2,3,7,9,9", - "The dimensional lengths of the input tensor: N,C,D,H,W... Example: 2,3,7,9,9.", - "string"); - inflags.AddInputFlag( + "2x3x7x9x9", + "The dimensional lengths of the input tensor: N,C,D,H,W... Example: 2,3,7,9,9."); + inflags.AddTensorFlag( "output_dims", 'S', - "5,5,5", - "The dimensional lengths of the output tensor: OD,OH,OW,... Example: 5,5,5.", - "string"); + "5x5x5", + "The dimensional lengths of the output tensor: OD,OH,OW,... 
Example: 5,5,5."); inflags.AddInputFlag("is-contiguous", 'c', "1", "is-contiguous (Default=1)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); From 99ff4bb5996cf21aa7ccc706130f3761d4332aac Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 17:23:41 +0700 Subject: [PATCH 19/38] small fix --- driver/adaptiveavgpool_driver.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index afe2161720..724397a9e6 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -56,7 +56,7 @@ class AdaptiveAvgPoolDriver : public Driver data_type = miopen_type{}; } - std::vector ComputeStrides(std::vector input); + std::vector ComputeStrides(std::vector input); int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; InputFlags& GetInputFlags() override { return inflags; } @@ -127,17 +127,17 @@ int AdaptiveAvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) template int AdaptiveAvgPoolDriver::GetandSetData() { - in_dim = inflags.GetValueTensor("input_dims").lengths; - std::vector in_stride = ComputeStrides(in_dim); - out_dim = inflags.GetValueTensor("output_dims").lengths; + in_dim = inflags.GetValueTensor("input_dims").lengths; + std::vector in_stride = ComputeStrides(in_dim); + out_dim = inflags.GetValueTensor("output_dims").lengths; if(in_dim.size() != out_dim.size() + 2) { MIOPEN_THROW(miopenStatusBadParm, "AdaptiveAvgPool: Input and output tensor sizes do not match."); } - N = in_dim[0]; - C = in_dim[1]; - std::vector out_dim_final = {N, C}; + N = in_dim[0]; + C = in_dim[1]; + std::vector out_dim_final = {N, C}; if(in_dim.size() == 3) { H = in_dim[2]; @@ -168,7 +168,7 @@ int AdaptiveAvgPoolDriver::GetandSetData() out_dim_final.push_back(OH); out_dim_final.push_back(OW); } - 
std::vector out_grad_stride = ComputeStrides(out_dim_final); + std::vector out_grad_stride = ComputeStrides(out_dim_final); SetTensorNd(inputDesc, in_dim, in_stride, data_type); SetTensorNd(outputDesc, out_dim_final, data_type); SetTensorNd(outputGradDesc, out_dim_final, out_grad_stride, data_type); @@ -179,11 +179,11 @@ int AdaptiveAvgPoolDriver::GetandSetData() // Equivalent to: tensor.tranpose(0, -1).contiguous().tranpose(0, -1) incase contiguous = False template -std::vector AdaptiveAvgPoolDriver::ComputeStrides(std::vector inputDim) +std::vector AdaptiveAvgPoolDriver::ComputeStrides(std::vector inputDim) { if(!isContiguous) std::swap(inputDim.front(), inputDim.back()); - std::vector strides(inputDim.size()); + std::vector strides(inputDim.size()); strides.back() = 1; for(int i = inputDim.size() - 2; i >= 0; --i) strides[i] = strides[i + 1] * inputDim[i + 1]; From 42b54a7b97166d6d9bc48f94f8f8b3306d273d01 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 10:28:12 +0700 Subject: [PATCH 20/38] fix driver as comments --- driver/CMakeLists.txt | 2 +- driver/adaptiveavgpool_driver.hpp | 145 ++++++++++++++--------- driver/mloAdaptiveAvgPoolHost.hpp | 29 ++--- src/include/miopen/tensor_view_utils.hpp | 1 - 4 files changed, 104 insertions(+), 73 deletions(-) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 3eeb7d4d42..ea309e87ed 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -31,8 +31,8 @@ add_executable(MIOpenDriver conv_common.cpp dm_activ.cpp dm_adam.cpp - dm_addlayernorm.cpp dm_adaptiveavgpool.cpp + dm_addlayernorm.cpp dm_bnorm.cpp dm_cat.cpp dm_conv.cpp diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index 724397a9e6..65badc5fa2 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -23,8 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -#ifndef GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP -#define GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP +#pragma once #include "InputFlags.hpp" #include "driver.hpp" @@ -169,10 +168,16 @@ int AdaptiveAvgPoolDriver::GetandSetData() out_dim_final.push_back(OW); } std::vector out_grad_stride = ComputeStrides(out_dim_final); - SetTensorNd(inputDesc, in_dim, in_stride, data_type); - SetTensorNd(outputDesc, out_dim_final, data_type); - SetTensorNd(outputGradDesc, out_dim_final, out_grad_stride, data_type); - SetTensorNd(inputGradDesc, in_dim, data_type); + if(SetTensorNd(inputDesc, in_dim, in_stride, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input_dims") + "."); + if(SetTensorNd(outputDesc, out_dim_final, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing output tensor: " + inflags.GetValueStr("output_dims") + "."); + if(SetTensorNd(outputGradDesc, out_dim_final, out_grad_stride, data_type) != + miopenStatusSuccess) + MIOPEN_THROW("Error parsing output grad tensor: " + inflags.GetValueStr("output_dims") + + "."); + if(SetTensorNd(inputGradDesc, in_dim, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input grad tensor: " + inflags.GetValueStr("input_dims") + "."); return miopenStatusSuccess; } @@ -200,12 +205,12 @@ int AdaptiveAvgPoolDriver::AddCmdLineArgs() "input_dims", 'D', "2x3x7x9x9", - "The dimensional lengths of the input tensor: N,C,D,H,W... Example: 2,3,7,9,9."); + "The dimensional lengths of the input tensor: N,C,D,H,W... Example: 2x3x7x9x9."); inflags.AddTensorFlag( "output_dims", 'S', "5x5x5", - "The dimensional lengths of the output tensor: OD,OH,OW,... Example: 5,5,5."); + "The dimensional lengths of the output tensor: OD,OH,OW,... 
Example: 5x5x5."); inflags.AddInputFlag("is-contiguous", 'c', "1", "is-contiguous (Default=1)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); @@ -272,8 +277,9 @@ int AdaptiveAvgPoolDriver::RunForwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenAdaptiveAvgPoolForward( + auto status = miopenAdaptiveAvgPoolForward( GetHandle(), inputDesc, input_dev->GetMem(), outputDesc, output_dev->GetMem()); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenAdaptiveAvgPoolForward"); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -287,15 +293,21 @@ int AdaptiveAvgPoolDriver::RunForwardGPU() STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) - printf("Wall-clock Time Forward AdaptiveAvgPool Elapsed: %f ms\n", - t.gettime_ms() / iter); + std::cout << "Wall-clock Time Forward AdaptiveAvgPool Elapsed: " + << t.gettime_ms() / iter << " ms" << std::endl; float kernel_average_time = iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - printf("GPU Kernel Time Forward AdaptiveAvgPool Elapsed: %f ms\n", kernel_average_time); + std::cout << "GPU Kernel Time Forward AdaptiveAvgPool Elapsed: " << kernel_average_time + << " ms" << std::endl; } - output_dev->FromGPU(GetStream(), output.data()); + if(output_dev->FromGPU(GetStream(), output.data()) != 0) + { + std::cerr << "Error copying (output_dev) from GPU, size: " << output_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } return miopenStatusSuccess; } @@ -303,22 +315,30 @@ int AdaptiveAvgPoolDriver::RunForwardGPU() template int AdaptiveAvgPoolDriver::RunForwardCPU() { + int status = miopenStatusSuccess; + if(in_dim.size() == 3) { - mloAdaptiveAvgPoolForward1dRunHost( + status = mloAdaptiveAvgPoolForward1dRunHost( inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, OH); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolForward1dRunHost"); } else if(in_dim.size() == 4) { - mloAdaptiveAvgPoolForward2dRunHost( + status = mloAdaptiveAvgPoolForward2dRunHost( inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, W, OH, OW); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolForward2dRunHost"); } else if(in_dim.size() == 5) { - mloAdaptiveAvgPoolForward3dRunHost( + status = mloAdaptiveAvgPoolForward3dRunHost( inputDesc, outputDesc, input.data(), output_host.data(), N, C, D, H, W, OD, OH, OW); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolForward3dRunHost"); } - return miopenStatusSuccess; + return status; } template @@ -332,11 +352,12 @@ int AdaptiveAvgPoolDriver::RunBackwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenAdaptiveAvgPoolBackward(GetHandle(), - outputGradDesc, - output_grad_dev->GetMem(), - inputGradDesc, - input_grad_dev->GetMem()); + auto status = miopenAdaptiveAvgPoolBackward(GetHandle(), + outputGradDesc, + 
output_grad_dev->GetMem(), + inputGradDesc, + input_grad_dev->GetMem()); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenAdaptiveAvgPoolBackward"); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -350,15 +371,21 @@ int AdaptiveAvgPoolDriver::RunBackwardGPU() STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) - printf("Wall-clock Time Backward AdaptiveAvgPool Elapsed: %f ms\n", - t.gettime_ms() / iter); + std::cout << "Wall-clock Time Backward AdaptiveAvgPool Elapsed: " + << t.gettime_ms() / iter << " ms" << std::endl; float kernel_average_time = iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - printf("GPU Kernel Time Backward AdaptiveAvgPool Elapsed: %f ms\n", kernel_average_time); + std::cout << "GPU Kernel Time Backward AdaptiveAvgPool Elapsed: " << kernel_average_time + << " ms" << std::endl; } - input_grad_dev->FromGPU(GetStream(), input_grad.data()); + if(input_grad_dev->FromGPU(GetStream(), input_grad.data()) != 0) + { + std::cerr << "Error copying (input_grad_dev) from GPU, size: " << input_grad_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } return miopenStatusSuccess; } @@ -366,40 +393,48 @@ int AdaptiveAvgPoolDriver::RunBackwardGPU() template int AdaptiveAvgPoolDriver::RunBackwardCPU() { + int status = miopenStatusSuccess; + if(in_dim.size() == 3) { - mloAdaptiveAvgPoolBackward1dRunHost( + status = mloAdaptiveAvgPoolBackward1dRunHost( outputGradDesc, inputGradDesc, output_grad.data(), input_grad_host.data(), N, C, H, OH); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolBackward1dRunHost"); } else if(in_dim.size() == 4) { - mloAdaptiveAvgPoolBackward2dRunHost(outputGradDesc, - inputGradDesc, - output_grad.data(), - input_grad_host.data(), - N, - C, - H, - W, - OH, - OW); + status = mloAdaptiveAvgPoolBackward2dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + H, + W, + OH, 
+ OW); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolBackward2dRunHost"); } else if(in_dim.size() == 5) { - mloAdaptiveAvgPoolBackward3dRunHost(outputGradDesc, - inputGradDesc, - output_grad.data(), - input_grad_host.data(), - N, - C, - D, - H, - W, - OD, - OH, - OW); + status = mloAdaptiveAvgPoolBackward3dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolBackward3dRunHost"); } - return miopenStatusSuccess; + return status; } template @@ -423,7 +458,8 @@ int AdaptiveAvgPoolDriver::VerifyForward() } else { - printf("Forward AdaptiveAvgPool Verifies on CPU and GPU (err=%f)\n", error); + std::cout << "Forward AdaptiveAvgPool Verifies on CPU and GPU (err=" << error << ")" + << std::endl; } return miopenStatusSuccess; @@ -439,13 +475,12 @@ int AdaptiveAvgPoolDriver::VerifyBackward() if(!std::isfinite(error) || error > tolerance) { std::cout << "Backward AdaptiveAvgPool FAILED: " << error << std::endl; - return EC_VerifyFwd; + return EC_VerifyBwd; } else { - printf("Backward AdaptiveAvgPool Verifies on CPU and GPU (err=%f)\n", error); + std::cout << "Backward AdaptiveAvgPool Verifies on CPU and GPU (err=" << error << ")" + << std::endl; } return miopenStatusSuccess; } - -#endif // GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 8bd435f415..21e758f494 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -23,8 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -#ifndef MLO_ADAPTIVEAVGPOOLHOST_H_ -#define MLO_ADAPTIVEAVGPOOLHOST_H_ +#pragma once #include #include @@ -52,7 +51,7 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD size_t n = nc / C, c = nc % C; if(n >= N) - return 0; + return miopenStatusSuccess; size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; @@ -65,7 +64,7 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD output[output_tv.get_tensor_view_idx({n, c, oh})] = static_cast(sum / kh); } - return 0; + return miopenStatusSuccess; } template @@ -93,7 +92,7 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD size_t n = nc / C, c = nc % C; if(n >= N) - return 0; + return miopenStatusSuccess; size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; @@ -113,7 +112,7 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(sum / divider); } - return 0; + return miopenStatusSuccess; } template @@ -144,7 +143,7 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD size_t n = nc / C, c = nc % C; if(n >= N) - return 0; + return miopenStatusSuccess; size_t d = static_cast(std::floor(static_cast(od * D) / OD)); size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; @@ -171,7 +170,7 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = static_cast(sum / (kd * kh * kw)); } - return 0; + return miopenStatusSuccess; } template @@ -196,7 +195,7 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu size_t n = nc / C, c = nc % C; if(n >= 
N) - return 0; + return miopenStatusSuccess; size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -212,7 +211,7 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = static_cast(grad); } - return 0; + return miopenStatusSuccess; } template @@ -240,7 +239,7 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu size_t n = nc / C, c = nc % C; if(n >= N) - return 0; + return miopenStatusSuccess; size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -265,7 +264,7 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); } - return 0; + return miopenStatusSuccess; } template @@ -296,7 +295,7 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu size_t n = nc / C, c = nc % C; if(n >= N) - return 0; + return miopenStatusSuccess; size_t od = static_cast(std::floor(static_cast(d * OD) / D)); size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; @@ -331,7 +330,5 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); } - return 0; + return miopenStatusSuccess; } - -#endif // MLO_ADAPTIVEAVGPOOLHOST_H_ diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index d4f3aa4163..1b095affb7 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -30,7 +30,6 @@ #include #include #include "../../kernels/tensor_view.hpp" -#include "miopen/tensor.hpp" namespace miopen { From 22ae98aa8e06d6c82bbf977ea69ba4ab7a1534f9 Mon Sep 17 00:00:00 2001 From: 
hieule88 Date: Mon, 7 Oct 2024 10:55:35 +0700 Subject: [PATCH 21/38] for to parford --- driver/mloAdaptiveAvgPoolHost.hpp | 49 ++++++++----------------------- 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 21e758f494..38088cf09e 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -28,6 +28,7 @@ #include #include #include +#include <../test/ford.hpp> template int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputDesc, @@ -45,14 +46,10 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD auto input_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(inputDesc)); auto output_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(outputDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nc = gid / OH, oh = gid % OH; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; @@ -63,7 +60,7 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD } output[output_tv.get_tensor_view_idx({n, c, oh})] = static_cast(sum / kh); - } + }); return miopenStatusSuccess; } @@ -85,15 +82,11 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncoh = gid / OW, ow = gid % OW; size_t nc = ncoh / OH, oh = ncoh % OH; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - 
h; @@ -111,7 +104,7 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD } output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(sum / divider); - } + }); return miopenStatusSuccess; } @@ -135,16 +128,12 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncodoh = gid / OW, ow = gid % OW; size_t ncod = ncodoh / OH, oh = ncodoh % OH; size_t nc = ncod / OD, od = ncod % OD; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t d = static_cast(std::floor(static_cast(od * D) / OD)); size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; @@ -169,7 +158,7 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = static_cast(sum / (kd * kh * kw)); - } + }); return miopenStatusSuccess; } @@ -189,14 +178,10 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu auto output_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(outputGradDesc)); auto input_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(inputGradDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nc = gid / H, h = gid % H; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -210,7 +195,7 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu kh; } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = static_cast(grad); - } + }); return miopenStatusSuccess; } @@ -232,15 +217,11 @@ int32_t 
mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nch = gid / W, w = gid % W; size_t nc = nch / H, h = nch % H; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -263,7 +244,7 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); - } + }); return miopenStatusSuccess; } @@ -287,16 +268,12 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncdh = gid / W, w = gid % W; size_t ncd = ncdh / H, h = ncdh % H; size_t nc = ncd / D, d = ncd % D; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t od = static_cast(std::floor(static_cast(d * OD) / D)); size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; @@ -329,6 +306,6 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu } input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); - } + }); return miopenStatusSuccess; } From 6ef45f6f51a90b120c15d5d84684c3894c4f31c7 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 14:02:38 +0700 Subject: [PATCH 22/38] resolved comments --- src/CMakeLists.txt | 6 +- src/include/miopen/adaptiveavgpool.hpp | 6 +- 
.../miopen/adaptiveavgpool/invoke_params.hpp | 2 +- .../adaptiveavgpool/problem_description.hpp | 31 +- .../miopen/adaptiveavgpool/solvers.hpp | 14 +- src/kernels/MIOpenAdaptiveAvgPool.cpp | 297 +++++++++--------- src/kernels/tensor_view.hpp | 1 + src/solver.cpp | 2 +- .../backward_adaptiveavgpool_1d.cpp | 18 +- .../backward_adaptiveavgpool_2d.cpp | 22 +- .../backward_adaptiveavgpool_3d.cpp | 26 +- .../forward_adaptiveavgpool_1d.cpp | 18 +- .../forward_adaptiveavgpool_2d.cpp | 22 +- .../forward_adaptiveavgpool_3d.cpp | 26 +- test/cpu_adaptiveavgpool.hpp | 54 +--- test/gtest/adaptiveavgpool.hpp | 8 +- 16 files changed, 246 insertions(+), 307 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 416f5ef4e3..4c219e3d2c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -87,10 +87,10 @@ set( MIOpen_Source activ_api.cpp adam/problem_description.cpp adam_api.cpp - addlayernorm_api.cpp - api/find2_0_commons.cpp adaptiveavgpool_api.cpp adaptiveavgpool/problem_description.cpp + addlayernorm_api.cpp + api/find2_0_commons.cpp batch_norm.cpp batch_norm_api.cpp batchnorm/problem_description.cpp @@ -659,8 +659,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN list(APPEND MIOpen_Source activ.cpp adam.cpp - addlayernorm.cpp adaptiveavgpool.cpp + addlayernorm.cpp cat.cpp groupnorm.cpp getitem.cpp diff --git a/src/include/miopen/adaptiveavgpool.hpp b/src/include/miopen/adaptiveavgpool.hpp index 9902befb99..7f04af7b8d 100644 --- a/src/include/miopen/adaptiveavgpool.hpp +++ b/src/include/miopen/adaptiveavgpool.hpp @@ -23,10 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -#include -#ifndef MIOPEN_ADAPTIVEAVGPOOL_HPP_ -#define MIOPEN_ADAPTIVEAVGPOOL_HPP_ - +#pragma once #include namespace miopen { @@ -52,4 +49,3 @@ AdaptiveAvgPoolBackward(Handle& handle, } // namespace adaptiveavgpool } // namespace miopen -#endif // _MIOPEN_ADAPTIVEAVGPOOL_HPP_ diff --git a/src/include/miopen/adaptiveavgpool/invoke_params.hpp b/src/include/miopen/adaptiveavgpool/invoke_params.hpp index e97a66a427..b9a30f7236 100644 --- a/src/include/miopen/adaptiveavgpool/invoke_params.hpp +++ b/src/include/miopen/adaptiveavgpool/invoke_params.hpp @@ -26,7 +26,7 @@ #pragma once -#include "miopen/common.hpp" +#include #include #include diff --git a/src/include/miopen/adaptiveavgpool/problem_description.hpp b/src/include/miopen/adaptiveavgpool/problem_description.hpp index 2fda5f111b..d8b112e46e 100644 --- a/src/include/miopen/adaptiveavgpool/problem_description.hpp +++ b/src/include/miopen/adaptiveavgpool/problem_description.hpp @@ -106,22 +106,7 @@ struct FwdProblemDescription : ProblemDescriptionBase return true; } - bool IsAllContiguous() const - { - auto isContiguous = [](TensorDescriptor td) { - size_t s = 1; - for(int i = td.GetNumDims() - 1; i >= 0; --i) - { - if(s != td.GetStrides()[i]) - { - return false; - } - s *= td.GetLengths()[i]; - } - return true; - }; - return isContiguous(inputDesc) && isContiguous(outputDesc); - } + bool IsAllContiguous() const { return inputDesc.IsContiguous() && outputDesc.IsContiguous(); } bool IsSameType() const { @@ -214,19 +199,7 @@ struct BwdProblemDescription : ProblemDescriptionBase bool IsAllContiguous() const { - auto isContiguous = [](TensorDescriptor td) { - size_t s = 1; - for(int i = td.GetNumDims() - 1; i >= 0; --i) - { - if(s != td.GetStrides()[i]) - { - return false; - } - s *= td.GetLengths()[i]; - } - return true; - }; - return isContiguous(inputGradDesc) && isContiguous(outputGradDesc); + return inputGradDesc.IsContiguous() && 
outputGradDesc.IsContiguous(); } bool IsSameType() const diff --git a/src/include/miopen/adaptiveavgpool/solvers.hpp b/src/include/miopen/adaptiveavgpool/solvers.hpp index ce2419527a..980bb1a330 100644 --- a/src/include/miopen/adaptiveavgpool/solvers.hpp +++ b/src/include/miopen/adaptiveavgpool/solvers.hpp @@ -26,18 +26,20 @@ #pragma once -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" +#include +#include #include #include -#include "miopen/kernel_build_params.hpp" -#include "miopen/kernel_info.hpp" -#include "miopen/mlo_internal.hpp" +#include +#include +#include namespace miopen { namespace solver { +namespace adaptiveavgpool { + const auto make_hip_kernel = [](std::vector localsize, std::vector gridsize, std::string kernel_file, @@ -53,8 +55,6 @@ const auto make_hip_kernel = [](std::vector localsize, build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; }; -namespace adaptiveavgpool { - using AdaptiveAvgPoolForward = NonTunableSolverBase; diff --git a/src/kernels/MIOpenAdaptiveAvgPool.cpp b/src/kernels/MIOpenAdaptiveAvgPool.cpp index 17877fdf0c..8d26ea0301 100644 --- a/src/kernels/MIOpenAdaptiveAvgPool.cpp +++ b/src/kernels/MIOpenAdaptiveAvgPool.cpp @@ -31,35 +31,27 @@ #include "float_types.h" #include "tensor_view.hpp" -#ifndef INPUT_TYPE -#define INPUT_TYPE float -#endif - -#ifndef OUTPUT_TYPE -#define OUTPUT_TYPE float -#endif - template __device__ void adaptiveAvgPoolForward1d(const TI* __restrict__ input, TO* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t OH, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t OH, tensor_view_t<3> input_tv, tensor_view_t<3> output_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t nc = gid / OH, oh = gid % OH; - size_t n = nc / C, c = nc % C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t nc = gid / OH, oh = gid % OH; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t h = 
static_cast(floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); + uint64_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; FLOAT_ACCUM sum = 0; - for(size_t ih = h; ih < (h + kh); ++ih) + for(uint64_t ih = h; ih < (h + kh); ++ih) { sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, ih})]); } @@ -67,10 +59,10 @@ __device__ void adaptiveAvgPoolForward1d(const TI* __restrict__ input, } extern "C" __global__ void AdaptiveAvgPoolForward1d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t OH, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t OH, tensor_view_t<3> input_tv, tensor_view_t<3> output_tv) { @@ -81,27 +73,27 @@ extern "C" __global__ void AdaptiveAvgPoolForward1d(const INPUT_TYPE* __restrict template __device__ void adaptiveAvgPoolBackward1d(const TI* __restrict__ output_grad, TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t OH, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t OH, tensor_view_t<3> output_grad_tv, tensor_view_t<3> input_grad_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t nc = gid / H, h = gid % H; - size_t n = nc / C, c = nc % C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t nc = gid / H, h = gid % H; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t oh = static_cast(floor(static_cast(h * OH) / H)); - size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); + uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; FLOAT_ACCUM grad = 0; - for(size_t ih = oh; ih < (oh + koh); ++ih) + for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); + uint64_t kh = 
static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / kh; } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = CVT_ACCUM2FLOAT(grad); @@ -109,10 +101,10 @@ __device__ void adaptiveAvgPoolBackward1d(const TI* __restrict__ output_grad, extern "C" __global__ void AdaptiveAvgPoolBackward1d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t OH, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t OH, tensor_view_t<3> output_grad_tv, tensor_view_t<3> input_grad_tv) { @@ -123,34 +115,34 @@ extern "C" __global__ void AdaptiveAvgPoolBackward1d(const INPUT_TYPE* __restric template __device__ void adaptiveAvgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t W, + uint64_t OH, + uint64_t OW, tensor_view_t<4> input_tv, tensor_view_t<4> output_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t ncoh = gid / OW, ow = gid % OW; - size_t nc = ncoh / OH, oh = ncoh % OH; - size_t n = nc / C, c = nc % C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t ncoh = gid / OW, ow = gid % OW; + uint64_t nc = ncoh / OH, oh = ncoh % OH; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t h = static_cast(floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); + uint64_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; - size_t w = static_cast(floor(static_cast(ow * W) / OW)); - size_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; + uint64_t w = static_cast(floor(static_cast(ow * W) / OW)); + uint64_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; 
FLOAT_ACCUM divider = static_cast(kh * kw); FLOAT_ACCUM sum = 0; - for(size_t ih = h; ih < (h + kh); ++ih) + for(uint64_t ih = h; ih < (h + kh); ++ih) { - for(size_t iw = w; iw < (w + kw); ++iw) + for(uint64_t iw = w; iw < (w + kw); ++iw) { sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, ih, iw})]); } @@ -160,12 +152,12 @@ __device__ void adaptiveAvgPoolForward2d(const TI* __restrict__ input, extern "C" __global__ void AdaptiveAvgPoolForward2d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t W, + uint64_t OH, + uint64_t OW, tensor_view_t<4> input_tv, tensor_view_t<4> output_tv) { @@ -176,38 +168,38 @@ extern "C" __global__ void AdaptiveAvgPoolForward2d(const INPUT_TYPE* __restrict template __device__ void adaptiveAvgPoolBackward2d(const TI* __restrict__ output_grad, TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t W, + uint64_t OH, + uint64_t OW, tensor_view_t<4> output_grad_tv, tensor_view_t<4> input_grad_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t nch = gid / W, w = gid % W; - size_t nc = nch / H, h = nch % H; - size_t n = nc / C, c = nc % C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t nch = gid / W, w = gid % W; + uint64_t nc = nch / H, h = nch % H; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t oh = static_cast(floor(static_cast(h * OH) / H)); - size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); + uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; - size_t ow = static_cast(floor(static_cast(w * OW) / W)); - size_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; + uint64_t ow = static_cast(floor(static_cast(w * OW) / W)); 
+ uint64_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; FLOAT_ACCUM grad = 0; - for(size_t ih = oh; ih < (oh + koh); ++ih) + for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); - for(size_t iw = ow; iw < (ow + kow); ++iw) + uint64_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); + for(uint64_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(floor(static_cast(iw * W) / OW)); + uint64_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(floor(static_cast(iw * W) / OW)); grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / (kh * kw); @@ -219,12 +211,12 @@ __device__ void adaptiveAvgPoolBackward2d(const TI* __restrict__ output_grad, extern "C" __global__ void AdaptiveAvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t W, + uint64_t OH, + uint64_t OW, tensor_view_t<4> output_grad_tv, tensor_view_t<4> input_grad_tv) { @@ -235,40 +227,40 @@ extern "C" __global__ void AdaptiveAvgPoolBackward2d(const INPUT_TYPE* __restric template __device__ void adaptiveAvgPoolForward3d(const TI* __restrict__ input, TO* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t D, + uint64_t H, + uint64_t W, + uint64_t OD, + uint64_t OH, + uint64_t OW, tensor_view_t<5> input_tv, tensor_view_t<5> output_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t ncodoh = gid / OW, ow = gid % OW; - size_t ncod = ncodoh / OH, oh = ncodoh % OH; - size_t nc = ncod / OD, od = ncod % OD; - size_t n = nc / C, c = nc % 
C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t ncodoh = gid / OW, ow = gid % OW; + uint64_t ncod = ncodoh / OH, oh = ncodoh % OH; + uint64_t nc = ncod / OD, od = ncod % OD; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t d = static_cast(floor(static_cast(od * D) / OD)); - size_t kd = static_cast(ceil(static_cast((od + 1) * D) / OD)) - d; + uint64_t d = static_cast(floor(static_cast(od * D) / OD)); + uint64_t kd = static_cast(ceil(static_cast((od + 1) * D) / OD)) - d; - size_t h = static_cast(floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); + uint64_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; - size_t w = static_cast(floor(static_cast(ow * W) / OW)); - size_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; + uint64_t w = static_cast(floor(static_cast(ow * W) / OW)); + uint64_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; FLOAT_ACCUM sum = 0; - for(size_t id = d; id < (d + kd); ++id) + for(uint64_t id = d; id < (d + kd); ++id) { - for(size_t ih = h; ih < (h + kh); ++ih) + for(uint64_t ih = h; ih < (h + kh); ++ih) { - for(size_t iw = w; iw < (w + kw); ++iw) + for(uint64_t iw = w; iw < (w + kw); ++iw) { sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, id, ih, iw})]); } @@ -281,14 +273,14 @@ __device__ void adaptiveAvgPoolForward3d(const TI* __restrict__ input, extern "C" __global__ void AdaptiveAvgPoolForward3d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t D, + uint64_t H, + uint64_t W, + uint64_t OD, + uint64_t OH, + uint64_t OW, tensor_view_t<5> input_tv, tensor_view_t<5> output_tv) { @@ -299,48 +291,49 @@ extern "C" __global__ void AdaptiveAvgPoolForward3d(const INPUT_TYPE* __restrict template 
__device__ void adaptiveAvgPoolBackward3d(const TI* __restrict__ output_grad, TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t D, + uint64_t H, + uint64_t W, + uint64_t OD, + uint64_t OH, + uint64_t OW, tensor_view_t<5> output_grad_tv, tensor_view_t<5> input_grad_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t ncdh = gid / W, w = gid % W; - size_t ncd = ncdh / H, h = ncdh % H; - size_t nc = ncd / D, d = ncd % D; - size_t n = nc / C, c = nc % C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t ncdh = gid / W, w = gid % W; + uint64_t ncd = ncdh / H, h = ncdh % H; + uint64_t nc = ncd / D, d = ncd % D; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t od = static_cast(floor(static_cast(d * OD) / D)); - size_t kod = static_cast(ceil(static_cast((d + 1) * OD) / D)) - od; + uint64_t od = static_cast(floor(static_cast(d * OD) / D)); + uint64_t kod = static_cast(ceil(static_cast((d + 1) * OD) / D)) - od; - size_t oh = static_cast(floor(static_cast(h * OH) / H)); - size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); + uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; - size_t ow = static_cast(floor(static_cast(w * OW) / W)); - size_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; + uint64_t ow = static_cast(floor(static_cast(w * OW) / W)); + uint64_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; FLOAT_ACCUM grad = 0; - for(size_t id = od; id < (od + kod); ++id) + for(uint64_t id = od; id < (od + kod); ++id) { - size_t kd = static_cast(ceil(static_cast((id + 1) * D) / OD)) - - static_cast(floor(static_cast(id * D) / OD)); - for(size_t ih = oh; ih < (oh + koh); ++ih) + uint64_t kd = static_cast(ceil(static_cast((id + 1) * D) / OD)) - + static_cast(floor(static_cast(id * D) / OD)); + 
for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); - for(size_t iw = ow; iw < (ow + kow); ++iw) + uint64_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); + for(uint64_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(floor(static_cast(iw * W) / OW)); + uint64_t kw = + static_cast(ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(floor(static_cast(iw * W) / OW)); grad += CVT_FLOAT2ACCUM( output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / (kd * kh * kw); @@ -353,14 +346,14 @@ __device__ void adaptiveAvgPoolBackward3d(const TI* __restrict__ output_grad, extern "C" __global__ void AdaptiveAvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t D, + uint64_t H, + uint64_t W, + uint64_t OD, + uint64_t OH, + uint64_t OW, tensor_view_t<5> output_grad_tv, tensor_view_t<5> input_grad_tv) { diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index 12394dbde6..c9357dd729 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -49,6 +49,7 @@ struct tensor_view_t uint64_t stride[N]; uint64_t size[N]; }; + template struct tensor_layout_t { diff --git a/src/solver.cpp b/src/solver.cpp index 90ea7c263e..62ba83cda2 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -24,9 +24,9 @@ * *******************************************************************************/ -#include "miopen/adaptiveavgpool/solvers.hpp" #include #include +#include #include #include #include diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp index 
19dfa7d5f9..700029db10 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -90,7 +90,7 @@ ConvSolution AdaptiveAvgPoolBackward1d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); auto dtype = problem.GetInputGradDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -115,10 +115,10 @@ ConvSolution AdaptiveAvgPoolBackward1d::GetSolution( auto input_grad_tv = get_inner_expanded_tv<3>(deref(params.inputGradDesc)); auto output_grad_tv = get_inner_expanded_tv<3>(deref(params.outputGradDesc)); - auto N = deref(params.inputGradDesc).GetLengths()[0]; - auto C = deref(params.inputGradDesc).GetLengths()[1]; - auto H = deref(params.inputGradDesc).GetLengths()[2]; - auto OH = deref(params.outputGradDesc).GetLengths()[2]; + uint64_t N = deref(params.inputGradDesc).GetLengths()[0]; + uint64_t C = deref(params.inputGradDesc).GetLengths()[1]; + uint64_t H = deref(params.inputGradDesc).GetLengths()[2]; + uint64_t OH = deref(params.outputGradDesc).GetLengths()[2]; kernel( params.output_grad, params.input_grad, N, C, H, OH, output_grad_tv, input_grad_tv); diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp index bc813dd7bf..8d3e78eb27 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp +++ 
b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -106,7 +106,7 @@ ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); auto dtype = problem.GetInputGradDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -131,12 +131,12 @@ ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); - auto N = deref(params.inputGradDesc).GetLengths()[0]; - auto C = deref(params.inputGradDesc).GetLengths()[1]; - auto H = deref(params.inputGradDesc).GetLengths()[2]; - auto W = deref(params.inputGradDesc).GetLengths()[3]; - auto OH = deref(params.outputGradDesc).GetLengths()[2]; - auto OW = deref(params.outputGradDesc).GetLengths()[3]; + uint64_t N = deref(params.inputGradDesc).GetLengths()[0]; + uint64_t C = deref(params.inputGradDesc).GetLengths()[1]; + uint64_t H = deref(params.inputGradDesc).GetLengths()[2]; + uint64_t W = deref(params.inputGradDesc).GetLengths()[3]; + uint64_t OH = deref(params.outputGradDesc).GetLengths()[2]; + uint64_t OW = deref(params.outputGradDesc).GetLengths()[3]; kernel(params.output_grad, params.input_grad, diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp index d2073f4304..4918f2c970 100644 --- 
a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -87,7 +87,7 @@ ConvSolution AdaptiveAvgPoolBackward3d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); auto dtype = problem.GetInputGradDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -112,14 +112,14 @@ ConvSolution AdaptiveAvgPoolBackward3d::GetSolution( auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc)); auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc)); - auto N = deref(params.inputGradDesc).GetLengths()[0]; - auto C = deref(params.inputGradDesc).GetLengths()[1]; - auto D = deref(params.inputGradDesc).GetLengths()[2]; - auto H = deref(params.inputGradDesc).GetLengths()[3]; - auto W = deref(params.inputGradDesc).GetLengths()[4]; - auto OD = deref(params.outputGradDesc).GetLengths()[2]; - auto OH = deref(params.outputGradDesc).GetLengths()[3]; - auto OW = deref(params.outputGradDesc).GetLengths()[4]; + uint64_t N = deref(params.inputGradDesc).GetLengths()[0]; + uint64_t C = deref(params.inputGradDesc).GetLengths()[1]; + uint64_t D = deref(params.inputGradDesc).GetLengths()[2]; + uint64_t H = deref(params.inputGradDesc).GetLengths()[3]; + uint64_t W = deref(params.inputGradDesc).GetLengths()[4]; + uint64_t OD = deref(params.outputGradDesc).GetLengths()[2]; + uint64_t OH = 
deref(params.outputGradDesc).GetLengths()[3]; + uint64_t OW = deref(params.outputGradDesc).GetLengths()[4]; kernel(params.output_grad, params.input_grad, diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp index 1dc63c5858..f50bd5a56f 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -84,7 +84,7 @@ ConvSolution AdaptiveAvgPoolForward1d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); auto dtype = problem.GetOutputDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -109,10 +109,10 @@ ConvSolution AdaptiveAvgPoolForward1d::GetSolution( auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc)); auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc)); - size_t N = deref(params.inputDesc).GetLengths()[0]; - size_t C = deref(params.inputDesc).GetLengths()[1]; - size_t H = deref(params.inputDesc).GetLengths()[2]; - size_t OH = deref(params.outputDesc).GetLengths()[2]; + uint64_t N = deref(params.inputDesc).GetLengths()[0]; + uint64_t C = deref(params.inputDesc).GetLengths()[1]; + uint64_t H = deref(params.inputDesc).GetLengths()[2]; + uint64_t OH = deref(params.outputDesc).GetLengths()[2]; kernel(params.input, params.output, N, C, H, OH, input_tv, output_tv); }; diff --git 
a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp index 623485634a..ff62625dcd 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -90,7 +90,7 @@ ConvSolution AdaptiveAvgPoolForward2d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); auto dtype = problem.GetOutputDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -115,12 +115,12 @@ ConvSolution AdaptiveAvgPoolForward2d::GetSolution( auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); - size_t N = deref(params.inputDesc).GetLengths()[0]; - size_t C = deref(params.inputDesc).GetLengths()[1]; - size_t H = deref(params.inputDesc).GetLengths()[2]; - size_t W = deref(params.inputDesc).GetLengths()[3]; - size_t OH = deref(params.outputDesc).GetLengths()[2]; - size_t OW = deref(params.outputDesc).GetLengths()[3]; + uint64_t N = deref(params.inputDesc).GetLengths()[0]; + uint64_t C = deref(params.inputDesc).GetLengths()[1]; + uint64_t H = deref(params.inputDesc).GetLengths()[2]; + uint64_t W = deref(params.inputDesc).GetLengths()[3]; + uint64_t OH = deref(params.outputDesc).GetLengths()[2]; + uint64_t OW = deref(params.outputDesc).GetLengths()[3]; kernel(params.input, params.output, N, C, H, W, OH, OW, input_tv, 
output_tv); }; diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp index b4081849eb..2c31e96f24 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -90,7 +90,7 @@ ConvSolution AdaptiveAvgPoolForward3d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); auto dtype = problem.GetOutputDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -115,14 +115,14 @@ ConvSolution AdaptiveAvgPoolForward3d::GetSolution( auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc)); auto output_tv = get_inner_expanded_tv<5>(deref(params.outputDesc)); - auto N = deref(params.inputDesc).GetLengths()[0]; - auto C = deref(params.inputDesc).GetLengths()[1]; - auto D = deref(params.inputDesc).GetLengths()[2]; - auto H = deref(params.inputDesc).GetLengths()[3]; - auto W = deref(params.inputDesc).GetLengths()[4]; - auto OD = deref(params.outputDesc).GetLengths()[2]; - auto OH = deref(params.outputDesc).GetLengths()[3]; - auto OW = deref(params.outputDesc).GetLengths()[4]; + uint64_t N = deref(params.inputDesc).GetLengths()[0]; + uint64_t C = deref(params.inputDesc).GetLengths()[1]; + uint64_t D = deref(params.inputDesc).GetLengths()[2]; + uint64_t H = deref(params.inputDesc).GetLengths()[3]; + uint64_t W = deref(params.inputDesc).GetLengths()[4]; + uint64_t 
OD = deref(params.outputDesc).GetLengths()[2]; + uint64_t OH = deref(params.outputDesc).GetLengths()[3]; + uint64_t OW = deref(params.outputDesc).GetLengths()[4]; kernel(params.input, params.output, N, C, D, H, W, OD, OH, OW, input_tv, output_tv); }; diff --git a/test/cpu_adaptiveavgpool.hpp b/test/cpu_adaptiveavgpool.hpp index 4b6dd99dda..955cdbb3b7 100644 --- a/test/cpu_adaptiveavgpool.hpp +++ b/test/cpu_adaptiveavgpool.hpp @@ -23,11 +23,11 @@ * SOFTWARE. * *******************************************************************************/ -#ifndef GUARD_CPU_AVGPOOL_HPP -#define GUARD_CPU_AVGPOOL_HPP +#pragma once #include "tensor_holder.hpp" #include +#include "ford.hpp" template void cpu_adaptiveavgpool_forward_1d( @@ -39,14 +39,10 @@ void cpu_adaptiveavgpool_forward_1d( auto input_tv = miopen::get_inner_expanded_tv<3>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<3>(output.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nc = gid / OH, oh = gid % OH; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; @@ -57,7 +53,7 @@ void cpu_adaptiveavgpool_forward_1d( } output[output_tv.get_tensor_view_idx({n, c, oh})] = static_cast(sum / kh); - } + }); } template @@ -76,15 +72,11 @@ void cpu_adaptiveavgpool_forward_2d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncoh = gid / OW, ow = gid % OW; size_t nc = ncoh / OH, oh = ncoh % OH; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; @@ -102,7 +94,7 @@ void cpu_adaptiveavgpool_forward_2d(tensor input, } 
output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(sum / divider); - } + }); } template @@ -123,16 +115,12 @@ void cpu_adaptiveavgpool_forward_3d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncodoh = gid / OW, ow = gid % OW; size_t ncod = ncodoh / OH, oh = ncodoh % OH; size_t nc = ncod / OD, od = ncod % OD; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t d = static_cast(std::floor(static_cast(od * D) / OD)); size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; @@ -157,7 +145,7 @@ void cpu_adaptiveavgpool_forward_3d(tensor input, output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = static_cast(sum / (kd * kh * kw)); - } + }); } template @@ -170,14 +158,10 @@ void cpu_adaptiveavgpool_backward_1d( auto output_grad_tv = miopen::get_inner_expanded_tv<3>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<3>(input_grad.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nc = gid / H, h = gid % H; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -192,7 +176,7 @@ void cpu_adaptiveavgpool_backward_1d( } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = static_cast(grad); - } + }); } template @@ -211,15 +195,11 @@ void cpu_adaptiveavgpool_backward_2d(tensor output_grad, auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nch = gid / W, w = gid % W; size_t nc = nch / H, h = nch % H; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t oh 
= static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -242,7 +222,7 @@ void cpu_adaptiveavgpool_backward_2d(tensor output_grad, } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); - } + }); } template @@ -263,16 +243,12 @@ void cpu_adaptiveavgpool_backward_3d(tensor output_grad, auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncdh = gid / W, w = gid % W; size_t ncd = ncdh / H, h = ncdh % H; size_t nc = ncd / D, d = ncd % D; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t od = static_cast(std::floor(static_cast(d * OD) / D)); size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; @@ -305,7 +281,5 @@ void cpu_adaptiveavgpool_backward_3d(tensor output_grad, } input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); - } + }); } - -#endif diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index 7f01813331..d7d493ed27 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -232,7 +232,6 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam(output_dev, output.data.size()); @@ -245,7 +244,8 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam::epsilon(); auto error = miopen::rms_range(ref_input_grad, input_grad); ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); - EXPECT_LT(error, threshold * 10); + EXPECT_LT(error, threshold * 10) + << "Error backward Input Gradient beyond 10xthreshold : " << error + << " Tolerance: " << threshold * 10; } AdaptiveAvgPoolTestCase adaptiveavgpool_config; From 796adb177aa9b2862a4193c2d9090427c73fd3d7 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 14:36:25 +0700 
Subject: [PATCH 23/38] rm large number of cast in kernel --- src/adaptiveavgpool/problem_description.cpp | 12 ++-- src/kernels/MIOpenAdaptiveAvgPool.cpp | 67 +++++++++------------ 2 files changed, 34 insertions(+), 45 deletions(-) diff --git a/src/adaptiveavgpool/problem_description.cpp b/src/adaptiveavgpool/problem_description.cpp index ec3b9cf636..148a67e299 100644 --- a/src/adaptiveavgpool/problem_description.cpp +++ b/src/adaptiveavgpool/problem_description.cpp @@ -47,10 +47,8 @@ inline std::ostream& operator<<(std::ostream& os, const std::vector& v) NetworkConfig FwdProblemDescription::MakeNetworkConfig() const { - auto input_size = inputDesc.GetLengths(); - auto output_size = outputDesc.GetLengths(); - auto input_stride = inputDesc.GetStrides(); - auto output_stride = outputDesc.GetStrides(); + auto input_size = inputDesc.GetLengths(); + auto output_size = outputDesc.GetLengths(); auto input_dtype = inputDesc.GetType(); @@ -60,8 +58,7 @@ NetworkConfig FwdProblemDescription::MakeNetworkConfig() const ss << "-input_dtype" << input_dtype; ss << "-Is" << input_size; ss << "-Os" << output_size; - ss << "-Si" << input_stride; - ss << "-So" << output_stride; + ss << "-Con" << IsAllContiguous(); return NetworkConfig{ss.str()}; } @@ -81,8 +78,7 @@ NetworkConfig BwdProblemDescription::MakeNetworkConfig() const ss << "-input_dtype" << input_dtype; ss << "-dIs" << input_grad_size; ss << "-dOs" << output_grad_size; - ss << "-dSi" << input_grad_stride; - ss << "-dSo" << output_grad_stride; + ss << "-Con" << IsAllContiguous(); return NetworkConfig{ss.str()}; } diff --git a/src/kernels/MIOpenAdaptiveAvgPool.cpp b/src/kernels/MIOpenAdaptiveAvgPool.cpp index 8d26ea0301..273ec99087 100644 --- a/src/kernels/MIOpenAdaptiveAvgPool.cpp +++ b/src/kernels/MIOpenAdaptiveAvgPool.cpp @@ -47,8 +47,8 @@ __device__ void adaptiveAvgPoolForward1d(const TI* __restrict__ input, if(n >= N) return; - uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); - uint64_t kh = 
static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = oh * H / OH; + uint64_t kh = (((oh + 1) * H + OH - 1) / OH) - h; FLOAT_ACCUM sum = 0; for(uint64_t ih = h; ih < (h + kh); ++ih) @@ -86,14 +86,13 @@ __device__ void adaptiveAvgPoolBackward1d(const TI* __restrict__ output_grad, if(n >= N) return; - uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); - uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = (h * OH) / H; + uint64_t koh = (((h + 1) * OH + H - 1) / H) - oh; FLOAT_ACCUM grad = 0; for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - uint64_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); + uint64_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / kh; } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = CVT_ACCUM2FLOAT(grad); @@ -132,11 +131,11 @@ __device__ void adaptiveAvgPoolForward2d(const TI* __restrict__ input, if(n >= N) return; - uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); - uint64_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = (oh * H) / OH; + uint64_t kh = (((oh + 1) * H + OH - 1) / OH) - h; - uint64_t w = static_cast(floor(static_cast(ow * W) / OW)); - uint64_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; + uint64_t w = (ow * W) / OW; + uint64_t kw = (((ow + 1) * W + OW - 1) / OW) - w; FLOAT_ACCUM divider = static_cast(kh * kw); FLOAT_ACCUM sum = 0; @@ -185,21 +184,19 @@ __device__ void adaptiveAvgPoolBackward2d(const TI* __restrict__ output_grad, if(n >= N) return; - uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); - uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = (h * OH) / H; + uint64_t koh = ((h + 1) * OH + H - 1) / H - oh; - uint64_t ow = static_cast(floor(static_cast(w * OW) / W)); - uint64_t kow = static_cast(ceil(static_cast((w 
+ 1) * OW) / W)) - ow; + uint64_t ow = (w * OW) / W; + uint64_t kow = ((w + 1) * OW + W - 1) / W - ow; FLOAT_ACCUM grad = 0; for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - uint64_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); + uint64_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(uint64_t iw = ow; iw < (ow + kow); ++iw) { - uint64_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(floor(static_cast(iw * W) / OW)); + uint64_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / (kh * kw); @@ -246,14 +243,14 @@ __device__ void adaptiveAvgPoolForward3d(const TI* __restrict__ input, if(n >= N) return; - uint64_t d = static_cast(floor(static_cast(od * D) / OD)); - uint64_t kd = static_cast(ceil(static_cast((od + 1) * D) / OD)) - d; + uint64_t d = (od * D) / OD; + uint64_t kd = ((od + 1) * D + OD - 1) / OD - d; - uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); - uint64_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = (oh * H) / OH; + uint64_t kh = ((oh + 1) * H + OH - 1) / OH - h; - uint64_t w = static_cast(floor(static_cast(ow * W) / OW)); - uint64_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; + uint64_t w = (ow * W) / OW; + uint64_t kw = ((ow + 1) * W + OW - 1) / OW - w; FLOAT_ACCUM sum = 0; for(uint64_t id = d; id < (d + kd); ++id) @@ -311,29 +308,25 @@ __device__ void adaptiveAvgPoolBackward3d(const TI* __restrict__ output_grad, if(n >= N) return; - uint64_t od = static_cast(floor(static_cast(d * OD) / D)); - uint64_t kod = static_cast(ceil(static_cast((d + 1) * OD) / D)) - od; + uint64_t od = (d * OD) / D; + uint64_t kod = ((d + 1) * OD + D - 1) / D - od; - uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); - uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = (h * OH) / H; + uint64_t koh = 
((h + 1) * OH + H - 1) / H - oh; - uint64_t ow = static_cast(floor(static_cast(w * OW) / W)); - uint64_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; + uint64_t ow = (w * OW) / W; + uint64_t kow = ((w + 1) * OW + W - 1) / W - ow; FLOAT_ACCUM grad = 0; for(uint64_t id = od; id < (od + kod); ++id) { - uint64_t kd = static_cast(ceil(static_cast((id + 1) * D) / OD)) - - static_cast(floor(static_cast(id * D) / OD)); + uint64_t kd = ((id + 1) * D + OD - 1) / OD - (id * D) / OD; for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - uint64_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); + uint64_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(uint64_t iw = ow; iw < (ow + kow); ++iw) { - uint64_t kw = - static_cast(ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(floor(static_cast(iw * W) / OW)); + uint64_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += CVT_FLOAT2ACCUM( output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / (kd * kh * kw); From 93985d4003d40389f105253824b56fba60b2664f Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 17:33:35 +0700 Subject: [PATCH 24/38] resolved comments --- driver/adaptiveavgpool_driver.hpp | 3 +++ test/gtest/adaptiveavgpool.hpp | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index 65badc5fa2..f179ff5b5b 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -261,7 +261,10 @@ int AdaptiveAvgPoolDriver::AllocateBuffersAndCopy() status |= output_grad_dev->ToGPU(q, output_grad.data()); if(status != 0) + { std::cout << "Error copying data to GPU\n" << std::endl; + return miopenStatusAllocFailed; + } return miopenStatusSuccess; } diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index d7d493ed27..d4e5f1829e 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ 
b/test/gtest/adaptiveavgpool.hpp @@ -228,7 +228,7 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam(input, ref_output, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_forward_3d(input, ref_output, N, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolForward( handle, input.desc, input_dev.get(), output.desc, output_dev.get()); @@ -346,8 +346,7 @@ struct AdaptiveAvgPoolTestBwd : public ::testing::TestWithParam( - output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_backward_3d(output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolBackward( handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get()); From 7795a19d84fc3f2136d12fb10031fb1c60026543 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 17:41:03 +0700 Subject: [PATCH 25/38] add T --- test/gtest/adaptiveavgpool.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index d4e5f1829e..58b82cdf4e 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -220,15 +220,15 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam(input, ref_output, N, C, H, OH); } else if(dims == 4) { - cpu_adaptiveavgpool_forward_2d(input, ref_output, N, C, H, W, OH, OW); + cpu_adaptiveavgpool_forward_2d(input, ref_output, N, C, H, W, OH, OW); } else if(dims == 5) { - cpu_adaptiveavgpool_forward_3d(input, ref_output, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_forward_3d(input, ref_output, N, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolForward( handle, input.desc, input_dev.get(), output.desc, output_dev.get()); @@ -338,15 +338,16 @@ struct AdaptiveAvgPoolTestBwd : public ::testing::TestWithParam(output_grad, ref_input_grad, N, C, H, OH); } else if(dims == 4) { - cpu_adaptiveavgpool_backward_2d(output_grad, 
ref_input_grad, N, C, H, W, OH, OW); + cpu_adaptiveavgpool_backward_2d(output_grad, ref_input_grad, N, C, H, W, OH, OW); } else if(dims == 5) { - cpu_adaptiveavgpool_backward_3d(output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_backward_3d( + output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolBackward( handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get()); From 58bcf1e1a161714156a46a786d728579c9a7092f Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 18:28:35 +0700 Subject: [PATCH 26/38] reorder --- include/miopen/miopen.h | 82 ++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index a1dcc49bd4..cae252b9a2 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7826,47 +7826,6 @@ MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, // CLOSEOUT RELU DOXYGEN GROUP #endif // MIOPEN_BETA_API -#ifdef MIOPEN_BETA_API -// adaptiveavgpool APIs -/** @addtogroup adaptiveavgpool - * - * @{ - */ - -/*! @brief Execute an adaptiveavgpool forward layer - * - * @param handle MIOpen handle (input) - * @param inputDesc Tensor descriptor for input tensor (input) - * @param input Data tensor input (input) - * @param outputDesc Tensor descriptor for output tensor (input) - * @param output Data tensor output (output) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t miopenAdaptiveAvgPoolForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output); - -/*! 
@brief Execute an adaptiveavgpool backward layer - * - * @param handle MIOpen handle (input) - * @param outputGradDesc Tensor descriptor for output grad tensor (input) - * @param output_grad Data tensor output grad (input) - * @param inputGradDesc Tensor descriptor for input grad tensor (input) - * @param input_grad Data tensor input grad (output) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t -miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t outputGradDesc, - const void* output_grad, - const miopenTensorDescriptor_t inputGradDesc, - void* input_grad); -/** @} */ -// CLOSEOUT adaptiveavgpool DOXYGEN GROUP -#endif // MIOPEN_BETA_API - #ifdef MIOPEN_BETA_API /*! @ingroup LossFunction @@ -7963,6 +7922,47 @@ MIOPEN_EXPORT miopenStatus_t miopenSoftMarginLossBackward(miopenHandle_t handle, // CLOSEOUT LossFunction DOXYGEN GROUP #endif +#ifdef MIOPEN_BETA_API +// adaptiveavgpool APIs +/** @addtogroup adaptiveavgpool + * + * @{ + */ + +/*! @brief Execute an adaptiveavgpool forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAdaptiveAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output); + +/*! 
@brief Execute an adaptiveavgpool backward layer + * + * @param handle MIOpen handle (input) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param output_grad Data tensor output grad (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param input_grad Data tensor input grad (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad); +/** @} */ +// CLOSEOUT adaptiveavgpool DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif From 335f1d596a89598b0ec3cadf32054656a55327cb Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 8 Oct 2024 10:38:23 +0700 Subject: [PATCH 27/38] small fix --- src/include/miopen/solver_id.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index a2b5a4214f..8559acaf49 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -63,7 +63,8 @@ enum class Primitive ReLU, Kthvalue, SoftMarginLoss, - MultiMarginLoss AdaptiveAvgPool, + MultiMarginLoss, + AdaptiveAvgPool, }; struct MIOPEN_INTERNALS_EXPORT Id From 702443bdf356932bfbf2eb1f138367951c93a1e6 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 16 Oct 2024 13:58:51 +0700 Subject: [PATCH 28/38] rerun CI --- test/gtest/adaptiveavgpool.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/gtest/adaptiveavgpool.cpp b/test/gtest/adaptiveavgpool.cpp index e12e327500..2a49a2d7eb 100644 --- a/test/gtest/adaptiveavgpool.cpp +++ b/test/gtest/adaptiveavgpool.cpp @@ -50,13 +50,13 @@ TEST_P(GPU_AdaptiveAvgpool_fwd_BFP16, AdaptiveAvgPoolTestFwd) Verify(); }; -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_fwd_FP32, 
testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp32())); -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_fwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp16())); -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_fwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdBfp16())); @@ -83,12 +83,12 @@ TEST_P(GPU_AdaptiveAvgpool_bwd_BFP16, AdaptiveAvgPoolTestBwd) Verify(); }; -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_bwd_FP32, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp32())); -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_bwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp16())); -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_bwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdBfp16())); From bd72a3ef4b851769ffc168937480bb2be2ec6978 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 17 Oct 2024 15:45:37 +0700 Subject: [PATCH 29/38] rerun CI --- test/gtest/adaptiveavgpool.cpp | 36 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/test/gtest/adaptiveavgpool.cpp b/test/gtest/adaptiveavgpool.cpp index 2a49a2d7eb..b36051f40a 100644 --- a/test/gtest/adaptiveavgpool.cpp +++ b/test/gtest/adaptiveavgpool.cpp @@ -28,67 +28,67 @@ using float16 = half_float::half; // FORWARD TEST -using GPU_AdaptiveAvgpool_fwd_FP32 = AdaptiveAvgPoolTestFwd; -using GPU_AdaptiveAvgpool_fwd_FP16 = AdaptiveAvgPoolTestFwd; -using GPU_AdaptiveAvgpool_fwd_BFP16 = AdaptiveAvgPoolTestFwd; +using GPU_AdaptiveAvgPool_fwd_FP32 = AdaptiveAvgPoolTestFwd; +using GPU_AdaptiveAvgPool_fwd_FP16 = AdaptiveAvgPoolTestFwd; +using GPU_AdaptiveAvgPool_fwd_BFP16 = AdaptiveAvgPoolTestFwd; -TEST_P(GPU_AdaptiveAvgpool_fwd_FP32, AdaptiveAvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgPool_fwd_FP32, AdaptiveAvgPoolTestFwd) { RunTest(); Verify(); }; 
-TEST_P(GPU_AdaptiveAvgpool_fwd_FP16, AdaptiveAvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgPool_fwd_FP16, AdaptiveAvgPoolTestFwd) { RunTest(); Verify(); }; -TEST_P(GPU_AdaptiveAvgpool_fwd_BFP16, AdaptiveAvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgPool_fwd_BFP16, AdaptiveAvgPoolTestFwd) { RunTest(); Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_fwd_FP32, + GPU_AdaptiveAvgPool_fwd_FP32, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp32())); INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_fwd_FP16, + GPU_AdaptiveAvgPool_fwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp16())); INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_fwd_BFP16, + GPU_AdaptiveAvgPool_fwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdBfp16())); // BACKWARD TEST -using GPU_AdaptiveAvgpool_bwd_FP32 = AdaptiveAvgPoolTestBwd; -using GPU_AdaptiveAvgpool_bwd_FP16 = AdaptiveAvgPoolTestBwd; -using GPU_AdaptiveAvgpool_bwd_BFP16 = AdaptiveAvgPoolTestBwd; +using GPU_AdaptiveAvgPool_bwd_FP32 = AdaptiveAvgPoolTestBwd; +using GPU_AdaptiveAvgPool_bwd_FP16 = AdaptiveAvgPoolTestBwd; +using GPU_AdaptiveAvgPool_bwd_BFP16 = AdaptiveAvgPoolTestBwd; -TEST_P(GPU_AdaptiveAvgpool_bwd_FP32, AdaptiveAvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgPool_bwd_FP32, AdaptiveAvgPoolTestBwd) { RunTest(); Verify(); }; -TEST_P(GPU_AdaptiveAvgpool_bwd_FP16, AdaptiveAvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgPool_bwd_FP16, AdaptiveAvgPoolTestBwd) { RunTest(); Verify(); }; -TEST_P(GPU_AdaptiveAvgpool_bwd_BFP16, AdaptiveAvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgPool_bwd_BFP16, AdaptiveAvgPoolTestBwd) { RunTest(); Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_bwd_FP32, + GPU_AdaptiveAvgPool_bwd_FP32, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp32())); INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_bwd_FP16, + GPU_AdaptiveAvgPool_bwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp16())); INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_bwd_BFP16, + 
GPU_AdaptiveAvgPool_bwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdBfp16())); From f58fb6a15f4dc1b099f25f558917dff7c1a36036 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 5 Nov 2024 15:50:27 +0700 Subject: [PATCH 30/38] rerun CI --- driver/adaptiveavgpool_driver.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index f179ff5b5b..cecb5be58b 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -464,7 +464,6 @@ int AdaptiveAvgPoolDriver::VerifyForward() std::cout << "Forward AdaptiveAvgPool Verifies on CPU and GPU (err=" << error << ")" << std::endl; } - return miopenStatusSuccess; } From 16c5eb279cc14ed1d65d28c54dcff44db2b5b3e4 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 6 Nov 2024 11:20:41 +0700 Subject: [PATCH 31/38] rerun CI --- src/adaptiveavgpool/problem_description.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/adaptiveavgpool/problem_description.cpp b/src/adaptiveavgpool/problem_description.cpp index 148a67e299..61a07272ce 100644 --- a/src/adaptiveavgpool/problem_description.cpp +++ b/src/adaptiveavgpool/problem_description.cpp @@ -49,8 +49,7 @@ NetworkConfig FwdProblemDescription::MakeNetworkConfig() const { auto input_size = inputDesc.GetLengths(); auto output_size = outputDesc.GetLengths(); - - auto input_dtype = inputDesc.GetType(); + auto input_dtype = inputDesc.GetType(); std::ostringstream ss; @@ -69,8 +68,7 @@ NetworkConfig BwdProblemDescription::MakeNetworkConfig() const auto output_grad_size = outputGradDesc.GetLengths(); auto input_grad_stride = inputGradDesc.GetStrides(); auto output_grad_stride = outputGradDesc.GetStrides(); - - auto input_dtype = inputGradDesc.GetType(); + auto input_dtype = inputGradDesc.GetType(); std::ostringstream ss; From 297e759f626b9544792912d0eb0aa745bcb3f8f1 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 22 Nov 2024 10:40:59 +0700 Subject: [PATCH 32/38] 
fix CICD --- driver/adaptiveavgpool_driver.hpp | 10 ++-- driver/mloAdaptiveAvgPoolHost.hpp | 66 ++++++++++----------- src/adaptiveavgpool/problem_description.cpp | 4 +- test/cpu_adaptiveavgpool.hpp | 17 ++---- test/gtest/adaptiveavgpool.hpp | 13 ++-- 5 files changed, 46 insertions(+), 64 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index cecb5be58b..062c56fdce 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -323,21 +323,21 @@ int AdaptiveAvgPoolDriver::RunForwardCPU() if(in_dim.size() == 3) { status = mloAdaptiveAvgPoolForward1dRunHost( - inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, OH); + inputDesc, outputDesc, input.data(), output_host.data(), C, H, OH); MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAdaptiveAvgPoolForward1dRunHost"); } else if(in_dim.size() == 4) { status = mloAdaptiveAvgPoolForward2dRunHost( - inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, W, OH, OW); + inputDesc, outputDesc, input.data(), output_host.data(), C, H, W, OH, OW); MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAdaptiveAvgPoolForward2dRunHost"); } else if(in_dim.size() == 5) { status = mloAdaptiveAvgPoolForward3dRunHost( - inputDesc, outputDesc, input.data(), output_host.data(), N, C, D, H, W, OD, OH, OW); + inputDesc, outputDesc, input.data(), output_host.data(), C, D, H, W, OD, OH, OW); MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAdaptiveAvgPoolForward3dRunHost"); } @@ -401,7 +401,7 @@ int AdaptiveAvgPoolDriver::RunBackwardCPU() if(in_dim.size() == 3) { status = mloAdaptiveAvgPoolBackward1dRunHost( - outputGradDesc, inputGradDesc, output_grad.data(), input_grad_host.data(), N, C, H, OH); + outputGradDesc, inputGradDesc, output_grad.data(), input_grad_host.data(), C, H, OH); MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAdaptiveAvgPoolBackward1dRunHost"); } @@ -411,7 +411,6 @@ int 
AdaptiveAvgPoolDriver::RunBackwardCPU() inputGradDesc, output_grad.data(), input_grad_host.data(), - N, C, H, W, @@ -426,7 +425,6 @@ int AdaptiveAvgPoolDriver::RunBackwardCPU() inputGradDesc, output_grad.data(), input_grad_host.data(), - N, C, D, H, diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 38088cf09e..7274408148 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -35,10 +35,9 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD const miopenTensorDescriptor_t outputDesc, const Tgpu* input, Tcheck* output, - size_t N, - size_t C, - size_t H, - size_t OH) + const size_t C, + const size_t H, + const size_t OH) { auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); @@ -69,12 +68,11 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD const miopenTensorDescriptor_t outputDesc, const Tgpu* input, Tcheck* output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW) + const size_t C, + const size_t H, + const size_t W, + const size_t OH, + const size_t OW) { auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); @@ -113,14 +111,13 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD const miopenTensorDescriptor_t outputDesc, const Tgpu* input, Tcheck* output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW) + const size_t C, + const size_t D, + const size_t H, + const size_t W, + const size_t OD, + const size_t OH, + const size_t OW) { auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); @@ -167,10 +164,9 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu const miopenTensorDescriptor_t inputGradDesc, const Tgpu* output_grad, Tcheck* 
input_grad, - size_t N, - size_t C, - size_t H, - size_t OH) + const size_t C, + const size_t H, + const size_t OH) { auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); @@ -204,12 +200,11 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu const miopenTensorDescriptor_t inputGradDesc, const Tgpu* output_grad, Tcheck* input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW) + const size_t C, + const size_t H, + const size_t W, + const size_t OH, + const size_t OW) { auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); @@ -253,14 +248,13 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu const miopenTensorDescriptor_t inputGradDesc, const Tgpu* output_grad, Tcheck* input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW) + const size_t C, + const size_t D, + const size_t H, + const size_t W, + const size_t OD, + const size_t OH, + const size_t OW) { auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); diff --git a/src/adaptiveavgpool/problem_description.cpp b/src/adaptiveavgpool/problem_description.cpp index 61a07272ce..f4ba38231d 100644 --- a/src/adaptiveavgpool/problem_description.cpp +++ b/src/adaptiveavgpool/problem_description.cpp @@ -49,7 +49,7 @@ NetworkConfig FwdProblemDescription::MakeNetworkConfig() const { auto input_size = inputDesc.GetLengths(); auto output_size = outputDesc.GetLengths(); - auto input_dtype = inputDesc.GetType(); + auto input_dtype = inputDesc.GetType(); std::ostringstream ss; @@ -68,7 +68,7 @@ NetworkConfig BwdProblemDescription::MakeNetworkConfig() const auto output_grad_size = outputGradDesc.GetLengths(); auto input_grad_stride = inputGradDesc.GetStrides(); auto output_grad_stride = 
outputGradDesc.GetStrides(); - auto input_dtype = inputGradDesc.GetType(); + auto input_dtype = inputGradDesc.GetType(); std::ostringstream ss; diff --git a/test/cpu_adaptiveavgpool.hpp b/test/cpu_adaptiveavgpool.hpp index 955cdbb3b7..ec3e457bba 100644 --- a/test/cpu_adaptiveavgpool.hpp +++ b/test/cpu_adaptiveavgpool.hpp @@ -31,7 +31,7 @@ template void cpu_adaptiveavgpool_forward_1d( - tensor input, tensor& output, size_t N, size_t C, size_t H, size_t OH) + tensor input, tensor& output, size_t C, size_t H, size_t OH) { auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); @@ -57,14 +57,8 @@ void cpu_adaptiveavgpool_forward_1d( } template -void cpu_adaptiveavgpool_forward_2d(tensor input, - tensor& output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW) +void cpu_adaptiveavgpool_forward_2d( + tensor input, tensor& output, size_t C, size_t H, size_t W, size_t OH, size_t OW) { auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); @@ -100,7 +94,6 @@ void cpu_adaptiveavgpool_forward_2d(tensor input, template void cpu_adaptiveavgpool_forward_3d(tensor input, tensor& output, - size_t N, size_t C, size_t D, size_t H, @@ -150,7 +143,7 @@ void cpu_adaptiveavgpool_forward_3d(tensor input, template void cpu_adaptiveavgpool_backward_1d( - tensor output_grad, tensor& input_grad, size_t N, size_t C, size_t H, size_t OH) + tensor output_grad, tensor& input_grad, size_t C, size_t H, size_t OH) { auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); @@ -182,7 +175,6 @@ void cpu_adaptiveavgpool_backward_1d( template void cpu_adaptiveavgpool_backward_2d(tensor output_grad, tensor& input_grad, - size_t N, size_t C, size_t H, size_t W, @@ -228,7 +220,6 @@ void cpu_adaptiveavgpool_backward_2d(tensor output_grad, template void cpu_adaptiveavgpool_backward_3d(tensor output_grad, tensor& input_grad, - size_t N, size_t C, size_t D, size_t H, diff --git 
a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index 58b82cdf4e..ad2ef3e2d1 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -220,15 +220,15 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam(input, ref_output, N, C, H, OH); + cpu_adaptiveavgpool_forward_1d(input, ref_output, C, H, OH); } else if(dims == 4) { - cpu_adaptiveavgpool_forward_2d(input, ref_output, N, C, H, W, OH, OW); + cpu_adaptiveavgpool_forward_2d(input, ref_output, C, H, W, OH, OW); } else if(dims == 5) { - cpu_adaptiveavgpool_forward_3d(input, ref_output, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_forward_3d(input, ref_output, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolForward( handle, input.desc, input_dev.get(), output.desc, output_dev.get()); @@ -338,16 +338,15 @@ struct AdaptiveAvgPoolTestBwd : public ::testing::TestWithParam(output_grad, ref_input_grad, N, C, H, OH); + cpu_adaptiveavgpool_backward_1d(output_grad, ref_input_grad, C, H, OH); } else if(dims == 4) { - cpu_adaptiveavgpool_backward_2d(output_grad, ref_input_grad, N, C, H, W, OH, OW); + cpu_adaptiveavgpool_backward_2d(output_grad, ref_input_grad, C, H, W, OH, OW); } else if(dims == 5) { - cpu_adaptiveavgpool_backward_3d( - output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_backward_3d(output_grad, ref_input_grad, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolBackward( handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get()); From 322047c3f160680887fc90e064778f89c6ac2e0c Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 25 Nov 2024 10:44:14 +0700 Subject: [PATCH 33/38] rerun CICD --- test/gtest/adaptiveavgpool.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/gtest/adaptiveavgpool.cpp b/test/gtest/adaptiveavgpool.cpp index b36051f40a..b09c286b15 100644 --- a/test/gtest/adaptiveavgpool.cpp +++ 
b/test/gtest/adaptiveavgpool.cpp @@ -50,13 +50,13 @@ TEST_P(GPU_AdaptiveAvgPool_fwd_BFP16, AdaptiveAvgPoolTestFwd) Verify(); }; -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_fwd_FP32, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp32())); -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_fwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp16())); -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_fwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdBfp16())); @@ -83,12 +83,12 @@ TEST_P(GPU_AdaptiveAvgPool_bwd_BFP16, AdaptiveAvgPoolTestBwd) Verify(); }; -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_bwd_FP32, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp32())); -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_bwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp16())); -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_bwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdBfp16())); From be5c06eab1d80f0e7b0c8a5db176ab7bc79447a6 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 16 Jan 2025 11:36:42 +0700 Subject: [PATCH 34/38] small fix CICD --- .gitignore | 5 +++++ src/adaptiveavgpool/problem_description.cpp | 2 +- src/adaptiveavgpool_api.cpp | 2 +- test/gtest/adaptiveavgpool.hpp | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 380c163c3f..a341211e0b 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,8 @@ build*/ # Python cache __pycache__/ + +install_dir/ +.cache/ +.devcontainer/ +.gitignore diff --git a/src/adaptiveavgpool/problem_description.cpp b/src/adaptiveavgpool/problem_description.cpp index f4ba38231d..21800d4cd0 100644 --- a/src/adaptiveavgpool/problem_description.cpp +++ b/src/adaptiveavgpool/problem_description.cpp @@ -35,7 +35,7 @@ namespace adaptiveavgpool { 
inline std::ostream& operator<<(std::ostream& os, const std::vector& v) { os << '{'; - for(int i = 0; i < v.size(); ++i) + for(size_t i = 0; i < v.size(); ++i) { if(i != 0) os << ','; diff --git a/src/adaptiveavgpool_api.cpp b/src/adaptiveavgpool_api.cpp index c183386a6a..0f27507f46 100644 --- a/src/adaptiveavgpool_api.cpp +++ b/src/adaptiveavgpool_api.cpp @@ -33,7 +33,7 @@ inline std::ostream& operator<<(std::ostream& os, const std::vector& v) { os << '{'; - for(int i = 0; i < v.size(); ++i) + for(size_t i = 0; i < v.size(); ++i) { if(i != 0) os << ','; diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index ad2ef3e2d1..cf2e1fa5dd 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -37,7 +37,7 @@ template inline std::ostream& operator<<(std::ostream& os, const std::vector& v) { os << '{'; - for(int i = 0; i < v.size(); ++i) + for(size_t i = 0; i < v.size(); ++i) { if(i != 0) os << ','; From fe4eefc529d9d91b01ef68b1a32d6523e82dc5b5 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 17 Feb 2025 11:15:53 +0700 Subject: [PATCH 35/38] rm floor,ceil --- driver/mloAdaptiveAvgPoolHost.hpp | 67 ++++++++++++++----------------- test/cpu_adaptiveavgpool.hpp | 67 ++++++++++++++----------------- 2 files changed, 60 insertions(+), 74 deletions(-) diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 7274408148..5441b9a0a7 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -49,8 +49,8 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD size_t nc = gid / OH, oh = gid % OH; size_t n = nc / C, c = nc % C; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = oh * H / OH; + size_t kh = (((oh + 1) * H + OH - 1) / OH) - h; float sum = 0; for(size_t ih = h; ih < (h + kh); ++ih) @@ -85,11 +85,11 @@ int32_t 
mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD size_t nc = ncoh / OH, oh = ncoh % OH; size_t n = nc / C, c = nc % C; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = (oh * H) / OH; + size_t kh = (((oh + 1) * H + OH - 1) / OH) - h; - size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); - size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + size_t w = (ow * W) / OW; + size_t kw = (((ow + 1) * W + OW - 1) / OW) - w; float divider = static_cast(kh * kw); float sum = 0; @@ -131,14 +131,14 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD size_t nc = ncod / OD, od = ncod % OD; size_t n = nc / C, c = nc % C; - size_t d = static_cast(std::floor(static_cast(od * D) / OD)); - size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; + size_t d = (od * D) / OD; + size_t kd = ((od + 1) * D + OD - 1) / OD - d; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = (oh * H) / OH; + size_t kh = ((oh + 1) * H + OH - 1) / OH - h; - size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); - size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + size_t w = (ow * W) / OW; + size_t kw = ((ow + 1) * W + OW - 1) / OW - w; float sum = 0; for(size_t id = d; id < (d + kd); ++id) @@ -178,14 +178,13 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu size_t nc = gid / H, h = gid % H; size_t n = nc / C, c = nc % C; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh = (h * OH) / H; + size_t koh = (((h + 1) * OH + H - 1) / H) - oh; float grad = 0; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 
1) * H) / OH)) - - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / kh; @@ -217,21 +216,19 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu size_t nc = nch / H, h = nch % H; size_t n = nc / C, c = nc % C; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh = (h * OH) / H; + size_t koh = ((h + 1) * OH + H - 1) / H - oh; - size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); - size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + size_t ow = (w * OW) / W; + size_t kow = ((w + 1) * OW + W - 1) / W - ow; float grad = 0; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(size_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(std::floor(static_cast(iw * W) / OW)); + size_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += static_cast( output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / (kh * kw); @@ -268,29 +265,25 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu size_t nc = ncd / D, d = ncd % D; size_t n = nc / C, c = nc % C; - size_t od = static_cast(std::floor(static_cast(d * OD) / D)); - size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; + size_t od = (d * OD) / D; + size_t kod = ((d + 1) * OD + D - 1) / D - od; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh = (h * OH) / H; + size_t koh = ((h + 1) * OH + H - 1) / H - oh; - size_t ow = 
static_cast(std::floor(static_cast(w * OW) / W)); - size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + size_t ow = (w * OW) / W; + size_t kow = ((w + 1) * OW + W - 1) / W - ow; float grad = 0; for(size_t id = od; id < (od + kod); ++id) { - size_t kd = static_cast(std::ceil(static_cast((id + 1) * D) / OD)) - - static_cast(std::floor(static_cast(id * D) / OD)); + size_t kd = ((id + 1) * D + OD - 1) / OD - (id * D) / OD; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(size_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = - static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(std::floor(static_cast(iw * W) / OW)); + size_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += static_cast( output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / diff --git a/test/cpu_adaptiveavgpool.hpp b/test/cpu_adaptiveavgpool.hpp index ec3e457bba..a9d3ed0376 100644 --- a/test/cpu_adaptiveavgpool.hpp +++ b/test/cpu_adaptiveavgpool.hpp @@ -43,8 +43,8 @@ void cpu_adaptiveavgpool_forward_1d( size_t nc = gid / OH, oh = gid % OH; size_t n = nc / C, c = nc % C; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = oh * H / OH; + size_t kh = (((oh + 1) * H + OH - 1) / OH) - h; float sum = 0; for(size_t ih = h; ih < (h + kh); ++ih) @@ -71,11 +71,11 @@ void cpu_adaptiveavgpool_forward_2d( size_t nc = ncoh / OH, oh = ncoh % OH; size_t n = nc / C, c = nc % C; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = (oh * H) / OH; + size_t kh = (((oh + 1) * H + OH - 1) / OH) - h; - size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); - size_t kw = 
static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + size_t w = (ow * W) / OW; + size_t kw = (((ow + 1) * W + OW - 1) / OW) - w; float divider = static_cast(kh * kw); float sum = 0; @@ -114,14 +114,14 @@ void cpu_adaptiveavgpool_forward_3d(tensor input, size_t nc = ncod / OD, od = ncod % OD; size_t n = nc / C, c = nc % C; - size_t d = static_cast(std::floor(static_cast(od * D) / OD)); - size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; + size_t d = (od * D) / OD; + size_t kd = ((od + 1) * D + OD - 1) / OD - d; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = (oh * H) / OH; + size_t kh = ((oh + 1) * H + OH - 1) / OH - h; - size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); - size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + size_t w = (ow * W) / OW; + size_t kw = ((ow + 1) * W + OW - 1) / OW - w; float sum = 0; for(size_t id = d; id < (d + kd); ++id) @@ -155,14 +155,13 @@ void cpu_adaptiveavgpool_backward_1d( size_t nc = gid / H, h = gid % H; size_t n = nc / C, c = nc % C; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh = (h * OH) / H; + size_t koh = (((h + 1) * OH + H - 1) / H) - oh; float grad = 0; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / kh; @@ -192,21 +191,19 @@ void cpu_adaptiveavgpool_backward_2d(tensor output_grad, size_t nc = nch / H, h = nch % H; size_t n = nc / C, c = nc % C; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh 
= (h * OH) / H; + size_t koh = ((h + 1) * OH + H - 1) / H - oh; - size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); - size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + size_t ow = (w * OW) / W; + size_t kow = ((w + 1) * OW + W - 1) / W - ow; float grad = 0; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(size_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(std::floor(static_cast(iw * W) / OW)); + size_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += static_cast( output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / (kh * kw); @@ -240,29 +237,25 @@ void cpu_adaptiveavgpool_backward_3d(tensor output_grad, size_t nc = ncd / D, d = ncd % D; size_t n = nc / C, c = nc % C; - size_t od = static_cast(std::floor(static_cast(d * OD) / D)); - size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; + size_t od = (d * OD) / D; + size_t kod = ((d + 1) * OD + D - 1) / D - od; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh = (h * OH) / H; + size_t koh = ((h + 1) * OH + H - 1) / H - oh; - size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); - size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + size_t ow = (w * OW) / W; + size_t kow = ((w + 1) * OW + W - 1) / W - ow; float grad = 0; for(size_t id = od; id < (od + kod); ++id) { - size_t kd = static_cast(std::ceil(static_cast((id + 1) * D) / OD)) - - static_cast(std::floor(static_cast(id * D) / OD)); + size_t kd = ((id + 1) * D + OD - 1) / OD - (id * D) / OD; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) 
- - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(size_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = - static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(std::floor(static_cast(iw * W) / OW)); + size_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += static_cast( output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / From 949988b6b57e9cfa1088b12215fb2afc5a61e1a9 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 17 Feb 2025 17:14:40 +0700 Subject: [PATCH 36/38] rm unused --- driver/mloAdaptiveAvgPoolHost.hpp | 6 ------ src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp | 4 ++++ src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp | 4 ++++ src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp | 4 ++++ src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp | 4 ++++ src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp | 4 ++++ src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp | 4 ++++ test/cpu_adaptiveavgpool.hpp | 6 ------ 8 files changed, 24 insertions(+), 12 deletions(-) diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 5441b9a0a7..73848ca38f 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -39,7 +39,6 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD const size_t H, const size_t OH) { - auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(inputDesc)); @@ -74,7 +73,6 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD const size_t OH, const size_t OW) { - auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); @@ -119,7 +117,6 @@ 
int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD const size_t OH, const size_t OW) { - auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); @@ -168,7 +165,6 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu const size_t H, const size_t OH) { - auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(outputGradDesc)); @@ -205,7 +201,6 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu const size_t OH, const size_t OW) { - auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); @@ -253,7 +248,6 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu const size_t OH, const size_t OW) { - auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp index 700029db10..1552ac8385 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { +namespace { + bool IsOverRocmBwd1d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { if(!problem.IsAllContiguous()) @@ -61,6 +63,8 @@ bool IsOverRocmBwd1d(const miopen::adaptiveavgpool::BwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolBackward1d::IsApplicable( 
const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp index 8d3e78eb27..46dcef3e88 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { +namespace { + bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { if(problem.IsAllContiguous()) @@ -77,6 +79,8 @@ bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolBackward2d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp index 4918f2c970..c16603a530 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { +namespace { + bool IsOverRocmBwd3d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { if(!problem.IsAllContiguous()) @@ -58,6 +60,8 @@ bool IsOverRocmBwd3d(const miopen::adaptiveavgpool::BwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolBackward3d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp index f50bd5a56f..f31d80c8be 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { 
+namespace { + bool IsOverRocmFwd1d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { auto in_nelems = problem.GetInputDesc().GetLengths()[-1]; @@ -56,6 +58,8 @@ bool IsOverRocmFwd1d(const miopen::adaptiveavgpool::FwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolForward1d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp index ff62625dcd..344071a3a4 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { +namespace { + bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { auto in_nelems = problem.GetInputDesc().GetElementSize(); @@ -62,6 +64,8 @@ bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolForward2d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp index 2c31e96f24..3c4fcf552f 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { +namespace { + bool IsOverRocmFwd3d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { auto in_nelems = problem.GetInputDesc().GetElementSize(); @@ -62,6 +64,8 @@ bool IsOverRocmFwd3d(const miopen::adaptiveavgpool::FwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolForward3d::IsApplicable( const ExecutionContext&, const 
miopen::adaptiveavgpool::FwdProblemDescription& problem) const { diff --git a/test/cpu_adaptiveavgpool.hpp b/test/cpu_adaptiveavgpool.hpp index a9d3ed0376..462cbda67c 100644 --- a/test/cpu_adaptiveavgpool.hpp +++ b/test/cpu_adaptiveavgpool.hpp @@ -33,7 +33,6 @@ template void cpu_adaptiveavgpool_forward_1d( tensor input, tensor& output, size_t C, size_t H, size_t OH) { - auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<3>(input.desc); @@ -60,7 +59,6 @@ template void cpu_adaptiveavgpool_forward_2d( tensor input, tensor& output, size_t C, size_t H, size_t W, size_t OH, size_t OW) { - auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); @@ -102,7 +100,6 @@ void cpu_adaptiveavgpool_forward_3d(tensor input, size_t OH, size_t OW) { - auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); @@ -145,7 +142,6 @@ template void cpu_adaptiveavgpool_backward_1d( tensor output_grad, tensor& input_grad, size_t C, size_t H, size_t OH) { - auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<3>(output_grad.desc); @@ -180,7 +176,6 @@ void cpu_adaptiveavgpool_backward_2d(tensor output_grad, size_t OH, size_t OW) { - auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); @@ -225,7 +220,6 @@ void cpu_adaptiveavgpool_backward_3d(tensor output_grad, size_t OH, size_t OW) { - auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); From ade653f67d3f5cb1ae7e83360e841fbea8f26272 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 12 Mar 2025 16:38:02 
+0700 Subject: [PATCH 37/38] small fix --- docs/reference/index.rst | 1 - include/miopen/miopen.h | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 835acf89bd..c2b74eabee 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -39,4 +39,3 @@ The MIOpen API library is structured as follows: * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental) * :doc:`Kthvalue <../doxygen/html/group__kthvalue>` (experimental) * :doc:`GLU <../doxygen/html/group__glu>` (experimental) - * :doc:`AdaptiveAvgPool <../doxygen/html/group__adaptiveavgpool>` (experimental) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 54ec3677bc..f67e61d87c 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -72,7 +72,6 @@ * @defgroup ReduceCalculation * @defgroup RotaryPositionalEmbeddings * @defgroup ReLU - * @defgroup adaptiveavgpool * */ @@ -8229,7 +8228,7 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle, #ifdef MIOPEN_BETA_API // adaptiveavgpool APIs -/** @addtogroup adaptiveavgpool +/** @addtogroup pooling * * @{ */ @@ -8265,7 +8264,7 @@ miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, const miopenTensorDescriptor_t inputGradDesc, void* input_grad); /** @} */ -// CLOSEOUT adaptiveavgpool DOXYGEN GROUP +// CLOSEOUT pooling DOXYGEN GROUP #endif // MIOPEN_BETA_API #ifdef __cplusplus From d3bdd4c4c00e555d802b0e25a76fca8ae120d3b8 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 12 Mar 2025 16:38:52 +0700 Subject: [PATCH 38/38] fix gitignore --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index a341211e0b..380c163c3f 100644 --- a/.gitignore +++ b/.gitignore @@ -65,8 +65,3 @@ build*/ # Python cache __pycache__/ - -install_dir/ -.cache/ -.devcontainer/ -.gitignore