From d197108888a5a37ed7d866f2cf1a25eb9611be8e Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 5 Aug 2024 18:11:48 +0700 Subject: [PATCH 01/38] init --- docs/reference/index.rst | 1 + include/miopen/miopen.h | 77 ++++++++++++++++++++++++++ src/avgpool.cpp | 0 src/avgpool_api.cpp | 113 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 191 insertions(+) create mode 100644 src/avgpool.cpp create mode 100644 src/avgpool_api.cpp diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 90e29ffaa9..9594e00ef0 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -35,3 +35,4 @@ The MIOpen API library is structured as follows: * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) + * :doc:`AvgPool <../doxygen/html/group__avgpool>` (experimental) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 3b9bbeccc1..fda8817e3a 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -70,6 +70,7 @@ * @defgroup SGD * @defgroup getitem * @defgroup ReduceCalculation + * @defgroup avgpool * */ @@ -7621,6 +7622,82 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, // CLOSEOUT GETITEM DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +// avgpool APIs +/** @addtogroup avgpool + * + * @{ + */ + +/*! 
@brief Execute an avgpool forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @param strideDesc Tensor descriptor for stride tensor (input) + * @param stride Data tensor stride (input) + * @param paddingDesc Tensor descriptor for padding tensor (input) + * @param padding Data tensor padding (input) + * @param kinforDesc Tensor descriptor for kinfor tensor (input) + * @param kinfor Data tensor kinfor (input) + * @param count_include_pad When True, will include the zero-padding in the averaging + * calculation (input) + * @param divisor_override If non-zero, will use this value as the divisor, otherwise will + * use the number of elements in the pooling window (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override); + +/*! 
@brief Execute an avgpool backward layer + * + * @param handle MIOpen handle (input) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param output_grad Data tensor output grad (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param input_grad Data tensor input grad (output) + * @param strideDesc Tensor descriptor for stride tensor (input) + * @param stride Data tensor stride (input) + * @param paddingDesc Tensor descriptor for padding tensor (input) + * @param padding Data tensor padding (input) + * @param kinforDesc Tensor descriptor for kinfor tensor (input) + * @param kinfor Data tensor kinfor (input) + * @param count_include_pad When True, will include the zero-padding in the averaging + * calculation (input) + * @param divisor_override If non-zero, will use this value as the divisor, otherwise will + * use the number of elements in the pooling window (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override); +/** @} */ +// CLOSEOUT avgpool DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/avgpool.cpp b/src/avgpool.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp new file mode 100644 index 0000000000..643d494cee --- /dev/null +++ b/src/avgpool_api.cpp @@ -0,0 +1,113 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& strideDesc, + ConstData_t stride, + bool log_target) +{ + const auto problem = avgpool::UnreducedProblemDescription{ + inputDesc, targetDesc, outputGradDesc, log_target, false}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::BwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.targetDesc = &targetDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + tmp.targetGradDesc = &targetGradDesc; + + tmp.input = input; + tmp.target = target; + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + tmp.target_grad = target_grad; + + tmp.log_target = log_target; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& windowInforDesc, + ConstData_t window_infor, + bool log_target) +{ + const auto problem = avgpool::ReducedProblemDescription{ + inputDesc, targetDesc, outputGradDesc, divisor, log_target, false}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::BwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.targetDesc = &targetDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + tmp.targetGradDesc = &targetGradDesc; + + tmp.input = input; + tmp.target = target; + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + 
tmp.target_grad = target_grad; + + tmp.divisor = divisor; + tmp.log_target = log_target; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen From 3c90908ea196e4051d85cd9fe916788d3cce71ac Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 5 Aug 2024 23:30:39 +0700 Subject: [PATCH 02/38] skeleton code --- src/CMakeLists.txt | 8 + src/avgpool.cpp | 134 +++++++++ src/avgpool/problem_description.cpp | 85 ++++++ src/avgpool_api.cpp | 217 +++++++++----- src/include/miopen/avgpool.hpp | 65 ++++ src/include/miopen/avgpool/invoke_params.hpp | 85 ++++++ .../miopen/avgpool/problem_description.hpp | 215 ++++++++++++++ src/include/miopen/avgpool/solvers.hpp | 281 ++++++++++++++++++ src/include/miopen/solver_id.hpp | 3 +- src/kernels/MIOpenAvgPool.cpp | 0 src/solver/avgpool/backward_avgpool_2d.cpp | 0 src/solver/avgpool/backward_avgpool_3d.cpp | 0 src/solver/avgpool/forward_avgpool_2d.cpp | 0 src/solver/avgpool/forward_avgpool_3d.cpp | 0 14 files changed, 1013 insertions(+), 80 deletions(-) create mode 100644 src/avgpool/problem_description.cpp create mode 100644 src/include/miopen/avgpool.hpp create mode 100644 src/include/miopen/avgpool/invoke_params.hpp create mode 100644 src/include/miopen/avgpool/problem_description.hpp create mode 100644 src/include/miopen/avgpool/solvers.hpp create mode 100644 src/kernels/MIOpenAvgPool.cpp create mode 100644 src/solver/avgpool/backward_avgpool_2d.cpp create mode 100644 src/solver/avgpool/backward_avgpool_3d.cpp create mode 100644 src/solver/avgpool/forward_avgpool_2d.cpp create mode 100644 src/solver/avgpool/forward_avgpool_3d.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 77acf3f7d3..ee36c92967 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,6 +89,8 @@ set( MIOpen_Source adam_api.cpp addlayernorm_api.cpp 
api/find2_0_commons.cpp + avgpool_api.cpp + avgpool/problem_description.cpp batch_norm.cpp batch_norm_api.cpp batchnorm/problem_description.cpp @@ -191,6 +193,10 @@ set( MIOpen_Source solver/activ/fwd_1.cpp solver/adam/adam.cpp solver/adam/transformers_adam_w.cpp + solver/avgpool/backward_avgpool_2d.cpp + solver/avgpool/backward_avgpool_3d.cpp + solver/avgpool/forward_avgpool_2d.cpp + solver/avgpool/forward_avgpool_3d.cpp solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp @@ -482,6 +488,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ${GPU_BATCHED_TRANSPOSE_KERNEL_HIP} ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_SOURCE} kernels/MIOpenAdam.cpp + kernels/MIOpenAvgPool.cpp kernels/MIOpenCat.cpp kernels/MIOpenCheckNumerics.cpp kernels/MIOpenBatchNormActivBwdPerAct.cl @@ -626,6 +633,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN activ.cpp adam.cpp addlayernorm.cpp + avgpool.cpp cat.cpp groupnorm.cpp getitem.cpp diff --git a/src/avgpool.cpp b/src/avgpool.cpp index e69de29bb2..15bea1f9d8 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -0,0 +1,134 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + const auto problem = avgpool::FwdProblemDescription{inputDesc, + outputDesc, + strideDesc, + paddingDesc, + kinforDesc, + count_include_pad, + divisor_override}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.strideDesc = &strideDesc; + tmp.paddingDesc = &paddingDesc; + tmp.kinforDesc = &kinforDesc; + + tmp.input = input; + tmp.output = output; + tmp.stride = stride; + tmp.padding = padding; + tmp.kinfor = kinfor; + tmp.count_include_pad = count_include_pad; + tmp.divisor_override = divisor_override; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& 
outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + const auto problem = avgpool::BwdProblemDescription{outputGradDesc, + inputGradDesc, + strideDesc, + paddingDesc, + kinforDesc, + count_include_pad, + divisor_override}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::BwdInvokeParams{}; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + tmp.strideDesc = &strideDesc; + tmp.paddingDesc = &paddingDesc; + tmp.kinforDesc = &kinforDesc; + + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + tmp.stride = stride; + tmp.padding = padding; + tmp.kinfor = kinfor; + tmp.count_include_pad = count_include_pad; + tmp.divisor_override = divisor_override; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/avgpool/problem_description.cpp b/src/avgpool/problem_description.cpp new file mode 100644 index 0000000000..dd2144f429 --- /dev/null +++ b/src/avgpool/problem_description.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include + +#include + +namespace miopen { + +namespace avgpool { + +NetworkConfig FwdProblemDescription::MakeNetworkConfig() const +{ + size_t numel = GetNtotal(); + size_t num_batches = inputDesc.GetLengths()[0]; + size_t num_classes = GetC(); + size_t num_dims = inputDesc.GetNumDims(); + + auto input_dtype = inputDesc.GetType(); + + std::ostringstream ss; + + ss << "avgpool_unreduce"; + ss << "is_fwd" << is_fwd; + ss << "contiguous" << contiguous; + ss << "input_dtype" << input_dtype; + ss << "numel" << numel; + ss << "num_dims" << num_dims; + ss << "num_batches" << num_batches; + ss << "num_classes" << num_classes; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig BwdProblemDescription::MakeNetworkConfig() const +{ + size_t numel = GetNtotal(); + size_t num_batches = inputDesc.GetLengths()[0]; + size_t num_classes = GetC(); + size_t num_dims = inputDesc.GetNumDims(); + + auto input_dtype = inputDesc.GetType(); + + std::ostringstream ss; + + ss << "avgpool_reduce"; + ss << "is_fwd" << is_fwd; + ss << "input_dtype" << input_dtype; + ss << "divisor" << divisor; + ss << "numel" << numel; + ss << "num_dims" << num_dims; + ss << "num_batches" << num_batches; + ss << "num_classes" << num_classes; + + return NetworkConfig{ss.str()}; +} + +} // namespace avgpool + +} // namespace miopen diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index 643d494cee..4e62bd5e7b 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -23,91 +23,150 @@ * SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -namespace miopen { +#include +#include +#include +#include +#include -miopenStatus_t AvgPoolForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const TensorDescriptor& strideDesc, - ConstData_t stride, - bool log_target) +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) { - const auto problem = avgpool::UnreducedProblemDescription{ - inputDesc, targetDesc, outputGradDesc, log_target, false}; - - const auto invoke_params = [&]() { - auto tmp = avgpool::BwdInvokeParams{}; - tmp.inputDesc = &inputDesc; - tmp.targetDesc = &targetDesc; - tmp.outputGradDesc = &outputGradDesc; - tmp.inputGradDesc = &inputGradDesc; - tmp.targetGradDesc = &targetGradDesc; - - tmp.input = input; - tmp.target = target; - tmp.output_grad = output_grad; - tmp.input_grad = input_grad; - tmp.target_grad = target_grad; - - tmp.log_target = log_target; - - return tmp; - }(); - const auto algo = AlgorithmName{"AvgPoolForward"}; - const auto solvers = solver::SolverContainer{}; - - solvers.ExecutePrimitive(handle, problem, algo, invoke_params); - - return miopenStatusSuccess; + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; } -miopenStatus_t AvgPoolBackward(Handle& handle, - const TensorDescriptor& outputGradDesc, - ConstData_t output_grad, - const TensorDescriptor& inputGradDesc, - Data_t input_grad, - const TensorDescriptor& windowInforDesc, - ConstData_t window_infor, - bool log_target) +static void LogCmdAvgPool(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t oDesc, + const bool count_include_pad, + const int32_t divisor_override, + const bool is_fwd) { - const auto problem = avgpool::ReducedProblemDescription{ - inputDesc, targetDesc, 
outputGradDesc, divisor, log_target, false}; - - const auto invoke_params = [&]() { - auto tmp = avgpool::BwdInvokeParams{}; - tmp.inputDesc = &inputDesc; - tmp.targetDesc = &targetDesc; - tmp.outputGradDesc = &outputGradDesc; - tmp.inputGradDesc = &inputGradDesc; - tmp.targetGradDesc = &targetGradDesc; - - tmp.input = input; - tmp.target = target; - tmp.output_grad = output_grad; - tmp.input_grad = input_grad; - tmp.target_grad = target_grad; - - tmp.divisor = divisor; - tmp.log_target = log_target; - - return tmp; - }(); - const auto algo = AlgorithmName{"AvgPoolBackward"}; - const auto solvers = solver::SolverContainer{}; - - solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "avgpoolfp16"; + } + else if(dtype == miopenFloat) + { + ss << "avgpoolfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "avgpoolbfp16"; + } + + MIOPEN_LOG_FUNCTION(xDesc, oDesc, count_include_pad, divisor_override); + ss << " -Is " << miopen::deref(xDesc).GetLengths(); + ss << " -Os " << miopen::deref(oDesc).GetLengths(); + ss << " -Si " << miopen::deref(xDesc).GetStrides(); + ss << " -So " << miopen::deref(oDesc).GetStrides(); + ss << " -Cp " << count_include_pad; + ss << " -Do " << divisor_override; + ss << " -F " << ((is_fwd) ? 
"1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} - return miopenStatusSuccess; +extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + MIOPEN_LOG_FUNCTION(handle, + inputDesc, + input, + outputDesc, + output, + strideDesc, + stride, + paddingDesc, + padding, + kinforDesc, + kinfor, + count_include_pad, + divisor_override); + + LogCmdAvgPool(inputDesc, outputDesc, count_include_pad, divisor_override, true); + return miopen::try_([&] { + miopen::AvgPoolForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + miopen::deref(strideDesc), + DataCast(stride), + miopen::deref(paddingDesc), + DataCast(padding), + miopen::deref(kinforDesc), + DataCast(kinfor), + count_include_pad, + divisor_override); + }); } -} // namespace miopen +extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + MIOPEN_LOG_FUNCTION(handle, + outputGradDesc, + output_grad, + inputGradDesc, + input_grad, + strideDesc, + stride, + paddingDesc, + padding, + kinforDesc, + kinfor, + count_include_pad, + divisor_override); + + LogCmdAvgPool(inputGradDesc, outputGradDesc, count_include_pad, 
divisor_override, false); + return miopen::try_([&] { + miopen::AvgPoolBackward(miopen::deref(handle), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(inputGradDesc), + DataCast(input_grad), + miopen::deref(strideDesc), + DataCast(stride), + miopen::deref(paddingDesc), + DataCast(padding), + miopen::deref(kinforDesc), + DataCast(kinfor), + count_include_pad, + divisor_override); + }); +} diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp new file mode 100644 index 0000000000..1a46b974b2 --- /dev/null +++ b/src/include/miopen/avgpool.hpp @@ -0,0 +1,65 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_AVGPOOL_HPP_ +#define MIOPEN_AVGPOOL_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + bool count_include_pad, + int32_t divisor_override); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + Data_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + bool count_include_pad, + int32_t divisor_override); +} // namespace miopen +#endif // _MIOPEN_AVGPOOL_HPP_ diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp new file mode 100644 index 0000000000..de2e87ea1b --- /dev/null +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include "miopen/common.hpp" +#include +#include + +namespace miopen { +namespace avgpool { + +struct FwdInvokeParams : public miopen::InvokeParams +{ + + FwdInvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + const TensorDescriptor* strideDesc = nullptr; + const TensorDescriptor* paddingDesc = nullptr; + const TensorDescriptor* kinfor = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + ConstData_t stride = nullptr; + ConstData_t padding = nullptr; + ConstData_t kinfo = nullptr; + + const bool count_include_pad = false; + const int32_t divisor_override = 0; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +struct BwdInvokeParams : public miopen::InvokeParams +{ + + BwdInvokeParams() = default; + + const TensorDescriptor* outputGradDesc = nullptr; + const TensorDescriptor* inputGradDesc = nullptr; + const TensorDescriptor* strideDesc = nullptr; + const TensorDescriptor* paddingDesc = nullptr; + const TensorDescriptor* kinfor = nullptr; + + ConstData_t output_grad = nullptr; + Data_t input_grad = nullptr; + ConstData_t stride = nullptr; + ConstData_t padding = nullptr; + ConstData_t kinfo = nullptr; + + const bool count_include_pad = false; + const int32_t divisor_override = 0; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +} // namespace avgpool +} // namespace miopen diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp new file mode 100644 index 0000000000..2b3ec555db --- /dev/null +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -0,0 +1,215 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace avgpool { + +struct ProblemDescription : ProblemDescriptionBase +{ + ProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& targetDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& outputDesc_, + int32_t ignore_index_, + bool is_fwd_) + : inputDesc(inputDesc_), + targetDesc(targetDesc_), + weightDesc(weightDesc_), + outputDesc(outputDesc_), + ignore_index(ignore_index_), + is_fwd(is_fwd_) + { + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetTargetDesc() const { return targetDesc; } + const TensorDescriptor& GetWeightDesc() const { return weightDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + int32_t GetIgnoreIndex() const { return ignore_index; } + + bool IsValidLength() const + { + if(targetDesc.GetLengths()[0] != inputDesc.GetLengths()[0]) + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); + + for(int32_t i = 1; i < targetDesc.GetNumDims(); ++i) + { + if(targetDesc.GetLengths()[i] != inputDesc.GetLengths()[i + 1]) + { + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); + } + } + if(weightDesc.GetLengths()[0] != inputDesc.GetLengths()[1]) + { + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); + } + if(inputDesc.GetLengths().size() > 5) + { + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Do not support Input Tensor dims > 5."); + } + return true; + } + + bool IsValidStride() const + { + auto isRightStride = [](TensorDescriptor td) { + auto strides = td.GetStrides(); + auto lengths = td.GetLengths(); + std::vector> p; + p.reserve(td.GetNumDims()); + std::transform(strides.begin(), + strides.end(), + lengths.begin(), + std::back_inserter(p), + [](size_t 
a, size_t b) { return std::make_pair(a, b); }); + std::sort(p.begin(), p.end()); + for(int i = 1; i < p.size(); ++i) + { + if(p[i].first != p[i - 1].first * p[i - 1].second) + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor strides do not valid."); + } + return true; + }; + return isRightStride(inputDesc) && isRightStride(targetDesc) && isRightStride(outputDesc) && + isRightStride(weightDesc); + } + + bool IsSameType() const + { + if(inputDesc.GetType() != weightDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "NLLLoss: Input and Weight tensors types do not match."); + } + return true; + } + + bool IsAllContiguous() const + { + auto isContiguous = [](TensorDescriptor td) { + size_t s = 1; + for(int i = td.GetNumDims() - 1; i >= 0; --i) + { + if(s != td.GetStrides()[i]) + { + return false; + } + s *= td.GetLengths()[i]; + } + return true; + }; + return isContiguous(inputDesc) && isContiguous(targetDesc) && isContiguous(weightDesc) && + isContiguous(outputDesc); + } + +protected: + TensorDescriptor inputDesc; + TensorDescriptor targetDesc; + TensorDescriptor weightDesc; + TensorDescriptor outputDesc; + + int32_t ignore_index; + bool is_fwd; + + NetworkConfig MakeForwardNetworkConfig() const; +}; + +struct UnreduceProblemDescription : ProblemDescription +{ + UnreduceProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& targetDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& outputDesc_, + int32_t ignore_index_, + bool is_fwd_) + : ProblemDescription( + inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + { + IsSameType(); + IsValidLength(); + IsValidStride(); + } + + size_t GetNtotal() const { return outputDesc.GetElementSize(); } + size_t GetC() const { return weightDesc.GetElementSize(); } + + NetworkConfig MakeNetworkConfig() const override; + +private: + NetworkConfig MakeForwardNetworkConfig() const; +}; + +struct ReduceProblemDescription : ProblemDescription +{ + 
ReduceProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& targetDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& outputDesc_, + int32_t ignore_index_, + float divisor_, + bool is_fwd_) + : ProblemDescription( + inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + { + divisor = divisor_; + IsSameType(); + IsValidLength(); + IsValidStride(); + } + + size_t GetNtotal() const { return targetDesc.GetElementSize(); } + size_t GetC() const { return weightDesc.GetElementSize(); } + + bool IsValidLength() const + { + if(outputDesc.GetNumDims() != 1 || outputDesc.GetLengths()[0] != 1) + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Output Tensor size must be (1)."); + if(!ProblemDescription::IsValidLength()) + return false; + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + float divisor; + NetworkConfig MakeForwardNetworkConfig() const; +}; + +} // namespace avgpool + +} // namespace miopen diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp new file mode 100644 index 0000000000..34adc12b4c --- /dev/null +++ b/src/include/miopen/avgpool/solvers.hpp @@ -0,0 +1,281 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include +#include +#include "miopen/kernel_build_params.hpp" +#include "miopen/kernel_info.hpp" + +#include + +namespace miopen { + +namespace solver { + +const auto make_hip_kernel = [](std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) { + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +}; + +namespace avgpool { + +using NLLLossUnreduce = + NonTunableSolverBase; + +using NLLLossReduce = + NonTunableSolverBase; + +struct NLLLossUnreduceSolver : NLLLossUnreduce +{ + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossReduceSolver : NLLLossReduce +{ + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; +}; + +// FORWARD UNREDUCE +struct NLLLossUnreduceForwardContiguous4d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } 
+ + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceForwardContiguous2d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceForward4d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceForward2d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceForward5d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const 
override; +}; + +// FORWARD REDUCE +struct NLLLossReduceForward5d final : NLLLossReduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; + bool MayNeedWorkspace() const override { return true; } +}; + +// BACKWARD UNREDUCE +struct NLLLossUnreduceBackwardContiguous2d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceBackwardContiguous4d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceBackward4d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct 
NLLLossUnreduceBackward2d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +struct NLLLossUnreduceBackward5d final : NLLLossUnreduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::UnreduceProblemDescription& problem) const override; +}; + +// BACKWARD REDUCE +struct NLLLossReduceBackward2d final : NLLLossReduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; +}; + +struct NLLLossReduceBackward5d final : NLLLossReduceSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::avgpool::ReduceProblemDescription& problem) const override; +}; + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 81c15f6bea..194afd79ac 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -59,7 +59,8 
@@ enum class Primitive Mha, Softmax, Adam, - Item + Item, + AvgPool }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp new file mode 100644 index 0000000000..e69de29bb2 From 86a50733653b8cce2fcfccb4d79869385e149181 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 6 Aug 2024 19:12:43 +0700 Subject: [PATCH 03/38] add solver, kernel --- src/avgpool/problem_description.cpp | 66 +++--- src/include/miopen/avgpool.hpp | 2 +- src/include/miopen/avgpool/invoke_params.hpp | 16 +- .../miopen/avgpool/problem_description.hpp | 218 +++++++---------- src/include/miopen/avgpool/solvers.hpp | 220 +++--------------- src/include/miopen/tensor_view_utils.hpp | 1 + src/kernels/tensor_view.hpp | 40 ++++ src/solver/avgpool/backward_avgpool_2d.cpp | 116 +++++++++ src/solver/avgpool/backward_avgpool_3d.cpp | 120 ++++++++++ src/solver/avgpool/forward_avgpool_2d.cpp | 116 +++++++++ src/solver/avgpool/forward_avgpool_3d.cpp | 120 ++++++++++ 11 files changed, 668 insertions(+), 367 deletions(-) diff --git a/src/avgpool/problem_description.cpp b/src/avgpool/problem_description.cpp index dd2144f429..96ecb4bb72 100644 --- a/src/avgpool/problem_description.cpp +++ b/src/avgpool/problem_description.cpp @@ -24,58 +24,68 @@ * *******************************************************************************/ -#include #include #include -#include - 
namespace miopen { namespace avgpool { +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + NetworkConfig FwdProblemDescription::MakeNetworkConfig() const { - size_t numel = GetNtotal(); - size_t num_batches = inputDesc.GetLengths()[0]; - size_t num_classes = GetC(); - size_t num_dims = inputDesc.GetNumDims(); + auto input_size = inputDesc.GetLengths(); + auto output_size = outputDesc.GetLengths(); + auto input_stride = inputDesc.GetStrides(); + auto output_stride = outputDesc.GetStrides(); auto input_dtype = inputDesc.GetType(); std::ostringstream ss; - ss << "avgpool_unreduce"; - ss << "is_fwd" << is_fwd; - ss << "contiguous" << contiguous; - ss << "input_dtype" << input_dtype; - ss << "numel" << numel; - ss << "num_dims" << num_dims; - ss << "num_batches" << num_batches; - ss << "num_classes" << num_classes; + ss << "avgpool_fwd"; + ss << "-input_dtype" << input_dtype; + ss << "-Is" << input_size; + ss << "-Os" << output_size; + ss << "-Si" << input_stride; + ss << "-So" << output_stride; + ss << "-Cp " << count_include_pad; + ss << "-Do " << divisor_override; return NetworkConfig{ss.str()}; } NetworkConfig BwdProblemDescription::MakeNetworkConfig() const { - size_t numel = GetNtotal(); - size_t num_batches = inputDesc.GetLengths()[0]; - size_t num_classes = GetC(); - size_t num_dims = inputDesc.GetNumDims(); + auto input_grad_size = inputGradDesc.GetLengths(); + auto output_grad_size = outputGradDesc.GetLengths(); + auto input_grad_stride = inputGradDesc.GetStrides(); + auto output_grad_stride = outputGradDesc.GetStrides(); - auto input_dtype = inputDesc.GetType(); + auto input_dtype = inputGradDesc.GetType(); std::ostringstream ss; - ss << "avgpool_reduce"; - ss << "is_fwd" << is_fwd; - ss << "input_dtype" << input_dtype; - ss << "divisor" << divisor; - ss << "numel" << numel; - ss << "num_dims" << num_dims; - 
ss << "num_batches" << num_batches; - ss << "num_classes" << num_classes; + ss << "avgpool_bwd"; + ss << "-input_dtype" << input_dtype; + ss << "-dIs" << input_grad_size; + ss << "-dOs" << output_grad_size; + ss << "-dSi" << input_grad_stride; + ss << "-dSo" << output_grad_stride; + ss << "-Cp " << count_include_pad; + ss << "-Do " << divisor_override; return NetworkConfig{ss.str()}; } diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp index 1a46b974b2..617ed56782 100644 --- a/src/include/miopen/avgpool.hpp +++ b/src/include/miopen/avgpool.hpp @@ -50,7 +50,7 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, const TensorDescriptor& outputGradDesc, - Data_t output_grad, + ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad, const TensorDescriptor& strideDesc, diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index de2e87ea1b..b57f8e0edc 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -42,16 +42,16 @@ struct FwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* outputDesc = nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinfor = nullptr; + const TensorDescriptor* kinforDesc = nullptr; ConstData_t input = nullptr; Data_t output = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfo = nullptr; + ConstData_t kinfor = nullptr; - const bool count_include_pad = false; - const int32_t divisor_override = 0; + bool count_include_pad = false; + int32_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } @@ -66,16 +66,16 @@ struct BwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* inputGradDesc = 
nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinfor = nullptr; + const TensorDescriptor* kinforDesc = nullptr; ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfo = nullptr; + ConstData_t kinfor = nullptr; - const bool count_include_pad = false; - const int32_t divisor_override = 0; + bool count_include_pad = false; + int32_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp index 2b3ec555db..9400bd67a0 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -29,8 +29,6 @@ #include #include #include -#include -#include namespace miopen { @@ -40,174 +38,122 @@ namespace avgpool { struct ProblemDescription : ProblemDescriptionBase { - ProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& targetDesc_, - const TensorDescriptor& weightDesc_, - const TensorDescriptor& outputDesc_, - int32_t ignore_index_, - bool is_fwd_) - : inputDesc(inputDesc_), - targetDesc(targetDesc_), - weightDesc(weightDesc_), - outputDesc(outputDesc_), - ignore_index(ignore_index_), - is_fwd(is_fwd_) + ProblemDescription(const TensorDescriptor& strideDesc_, + const TensorDescriptor& paddingDesc_, + const TensorDescriptor& kinforDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) + : strideDesc(strideDesc_), + paddingDesc(paddingDesc_), + kinforDesc(kinforDesc_), + count_include_pad(count_include_pad_), + divisor_override(divisor_override_) { - } - - const TensorDescriptor& GetInputDesc() const { return inputDesc; } - const TensorDescriptor& GetTargetDesc() const { return targetDesc; } - const TensorDescriptor& 
GetWeightDesc() const { return weightDesc; } - const TensorDescriptor& GetOutputDesc() const { return outputDesc; } - int32_t GetIgnoreIndex() const { return ignore_index; } - - bool IsValidLength() const - { - if(targetDesc.GetLengths()[0] != inputDesc.GetLengths()[0]) - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); - - for(int32_t i = 1; i < targetDesc.GetNumDims(); ++i) + if(divisor_override < 0) { - if(targetDesc.GetLengths()[i] != inputDesc.GetLengths()[i + 1]) - { - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); - } + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: divisor_override must be non-negative."); } - if(weightDesc.GetLengths()[0] != inputDesc.GetLengths()[1]) - { - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); - } - if(inputDesc.GetLengths().size() > 5) - { - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Do not support Input Tensor dims > 5."); - } - return true; - } - - bool IsValidStride() const - { - auto isRightStride = [](TensorDescriptor td) { - auto strides = td.GetStrides(); - auto lengths = td.GetLengths(); - std::vector> p; - p.reserve(td.GetNumDims()); - std::transform(strides.begin(), - strides.end(), - lengths.begin(), - std::back_inserter(p), - [](size_t a, size_t b) { return std::make_pair(a, b); }); - std::sort(p.begin(), p.end()); - for(int i = 1; i < p.size(); ++i) - { - if(p[i].first != p[i - 1].first * p[i - 1].second) - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor strides do not valid."); - } - return true; - }; - return isRightStride(inputDesc) && isRightStride(targetDesc) && isRightStride(outputDesc) && - isRightStride(weightDesc); - } - - bool IsSameType() const - { - if(inputDesc.GetType() != weightDesc.GetType()) - { - MIOPEN_THROW(miopenStatusBadParm, - "NLLLoss: Input and Weight tensors types do not match."); - } - return true; - } - - bool IsAllContiguous() const - { - auto isContiguous = [](TensorDescriptor td) { - size_t s = 1; - for(int 
i = td.GetNumDims() - 1; i >= 0; --i) - { - if(s != td.GetStrides()[i]) - { - return false; - } - s *= td.GetLengths()[i]; - } - return true; - }; - return isContiguous(inputDesc) && isContiguous(targetDesc) && isContiguous(weightDesc) && - isContiguous(outputDesc); } protected: - TensorDescriptor inputDesc; - TensorDescriptor targetDesc; - TensorDescriptor weightDesc; - TensorDescriptor outputDesc; - - int32_t ignore_index; - bool is_fwd; + TensorDescriptor strideDesc; + TensorDescriptor paddingDesc; + TensorDescriptor kinforDesc; - NetworkConfig MakeForwardNetworkConfig() const; + bool count_include_pad; + int32_t divisor_override; }; -struct UnreduceProblemDescription : ProblemDescription +struct FwdProblemDescription : ProblemDescription { - UnreduceProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& targetDesc_, - const TensorDescriptor& weightDesc_, - const TensorDescriptor& outputDesc_, - int32_t ignore_index_, - bool is_fwd_) + FwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const TensorDescriptor& strideDesc_, + const TensorDescriptor& paddingDesc_, + const TensorDescriptor& kinforDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) : ProblemDescription( - inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + inputDesc(inputDesc_), + outputDesc(outputDesc_) { - IsSameType(); IsValidLength(); - IsValidStride(); } - size_t GetNtotal() const { return outputDesc.GetElementSize(); } - size_t GetC() const { return weightDesc.GetElementSize(); } + auto GetInputDesc() const { return inputDesc; } + auto GetOutputDesc() const { return outputDesc; } + auto GetNtotal() const { return outputDesc.GetElementSize(); } + + bool IsValidLength() const + { + auto input_dims = inputDesc.GetLengths().size(); + if(outputDesc.GetLengths()[0] != inputDesc.GetLengths()[0] || + 
outputDesc.GetLengths()[1] != inputDesc.GetLengths()[1] || + outputDesc.GetLengths().size() != input_dims) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + if(input_dims != strideDesc.GetElementSize() || + input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + + return true; + } NetworkConfig MakeNetworkConfig() const override; -private: - NetworkConfig MakeForwardNetworkConfig() const; +protected: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; }; -struct ReduceProblemDescription : ProblemDescription +struct BwdProblemDescription : ProblemDescription { - ReduceProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& targetDesc_, - const TensorDescriptor& weightDesc_, - const TensorDescriptor& outputDesc_, - int32_t ignore_index_, - float divisor_, - bool is_fwd_) + BwdProblemDescription(const TensorDescriptor& outputGradDesc_, + const TensorDescriptor& inputGradDesc_, + const TensorDescriptor& strideDesc_, + const TensorDescriptor& paddingDesc_, + const TensorDescriptor& kinforDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) : ProblemDescription( - inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + outputGradDesc(outputGradDesc_), + inputGradDesc(inputGradDesc_) { - divisor = divisor_; - IsSameType(); IsValidLength(); - IsValidStride(); } - size_t GetNtotal() const { return targetDesc.GetElementSize(); } - size_t GetC() const { return weightDesc.GetElementSize(); } + auto GetOutputGradDesc() const { return outputGradDesc; } + auto GetInputGradDesc() const { return inputGradDesc; } + auto GetNtotal() const { return inputGradDesc.GetElementSize(); } bool IsValidLength() const { - if(outputDesc.GetNumDims() != 1 || outputDesc.GetLengths()[0] != 
1) - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Output Tensor size must be (1)."); - if(!ProblemDescription::IsValidLength()) - return false; + auto input_dims = inputGradDesc.GetLengths().size(); + if(outputGradDesc.GetLengths()[0] != inputGradDesc.GetLengths()[0] || + outputGradDesc.GetLengths()[1] != inputGradDesc.GetLengths()[1] || + outputGradDesc.GetLengths().size() != input_dims) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + if(input_dims != strideDesc.GetElementSize() || + input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + return true; } NetworkConfig MakeNetworkConfig() const override; -private: - float divisor; - NetworkConfig MakeForwardNetworkConfig() const; +protected: + TensorDescriptor outputGradDesc; + TensorDescriptor inputGradDesc; }; } // namespace avgpool diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp index 34adc12b4c..5577b9fad6 100644 --- a/src/include/miopen/avgpool/solvers.hpp +++ b/src/include/miopen/avgpool/solvers.hpp @@ -33,8 +33,6 @@ #include "miopen/kernel_build_params.hpp" #include "miopen/kernel_info.hpp" -#include - namespace miopen { namespace solver { @@ -56,222 +54,56 @@ const auto make_hip_kernel = [](std::vector localsize, namespace avgpool { -using NLLLossUnreduce = - NonTunableSolverBase; - -using NLLLossReduce = - NonTunableSolverBase; - -struct NLLLossUnreduceSolver : NLLLossUnreduce -{ - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossReduceSolver : NLLLossReduce -{ - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; -}; - -// FORWARD UNREDUCE -struct NLLLossUnreduceForwardContiguous4d final : NLLLossUnreduceSolver -{ - const std::string& 
SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForwardContiguous2d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForward4d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForward2d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForward5d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const 
miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -// FORWARD REDUCE -struct NLLLossReduceForward5d final : NLLLossReduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - std::size_t - GetWorkspaceSize(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - bool MayNeedWorkspace() const override { return true; } -}; - -// BACKWARD UNREDUCE -struct NLLLossUnreduceBackwardContiguous2d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; +using AvgPoolForward = + NonTunableSolverBase; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; +using AvgPoolBackward = + NonTunableSolverBase; -struct NLLLossUnreduceBackwardContiguous4d final : NLLLossUnreduceSolver +// FORWARD +struct AvgPoolForward2d final : AvgPoolForward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + const miopen::avgpool::FwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const 
miopen::avgpool::FwdProblemDescription& problem) const override; }; -struct NLLLossUnreduceBackward4d final : NLLLossUnreduceSolver +struct AvgPoolForward3d final : AvgPoolForward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + const miopen::avgpool::FwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const override; }; -struct NLLLossUnreduceBackward2d final : NLLLossUnreduceSolver +// BACKWARD +struct AvgPoolBackward2d final : AvgPoolBackward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + const miopen::avgpool::BwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; }; -struct NLLLossUnreduceBackward5d final : NLLLossUnreduceSolver +struct AvgPoolBackward3d final : AvgPoolBackward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - 
ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; + const miopen::avgpool::BwdProblemDescription& problem) const override; -// BACKWARD REDUCE -struct NLLLossReduceBackward2d final : NLLLossReduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; -}; - -struct NLLLossReduceBackward5d final : NLLLossReduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; }; } // namespace avgpool diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index 9f7430ba8a..050d431844 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -29,6 +29,7 @@ #include #include "../../kernels/tensor_view.hpp" +#include "miopen/tensor.hpp" namespace miopen { diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d35bfd93fc..d64dbf21f9 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -72,6 +72,46 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) + { + static_assert(N == 5); + layout[0] = n; + layout[1] = c; + layout[2] = d; + layout[3] = h; + layout[4] = w; + 
} + + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w) + { + static_assert(N == 4); + layout[0] = n; + layout[1] = c; + layout[2] = h; + layout[3] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) + { + static_assert(N == 3); + layout[0] = n; + layout[1] = h; + layout[2] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t w) + { + static_assert(N == 2); + layout[0] = n; + layout[1] = w; + } + + constexpr tensor_layout_t(uint64_t n) + { + static_assert(N == 1); + layout[0] = n; + } + uint64_t layout[N]; }; diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index e69de29bb2..10c9479b0c 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -0,0 +1,116 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_2D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolBackward2d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_BWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward2d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto H = deref(params.inputGradDesc).GetLengths()[2]; + auto W = deref(params.inputGradDesc).GetLengths()[3]; + auto OH = deref(params.outputGradDesc).GetLengths()[2]; + auto OW = deref(params.outputGradDesc).GetLengths()[3]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + H, + W, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index e69de29bb2..b960554348 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -0,0 +1,120 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_3D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolBackward3d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_BWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward3d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto D = deref(params.inputGradDesc).GetLengths()[2]; + auto H = deref(params.inputGradDesc).GetLengths()[3]; + auto W = deref(params.inputGradDesc).GetLengths()[4]; + auto OD = deref(params.outputGradDesc).GetLengths()[2]; + auto OH = deref(params.outputGradDesc).GetLengths()[3]; + auto OW = deref(params.outputGradDesc).GetLengths()[4]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index e69de29bb2..d0e37b5464 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -0,0 +1,116 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_FWD_2D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolForward2d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_FWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward2d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + + auto N = deref(params.inputDesc).GetLengths()[0]; + auto C = deref(params.inputDesc).GetLengths()[1]; + auto H = deref(params.inputDesc).GetLengths()[2]; + auto W = deref(params.inputDesc).GetLengths()[3]; + auto OH = deref(params.outputDesc).GetLengths()[2]; + auto OW = deref(params.outputDesc).GetLengths()[3]; + + kernel(params.input, + params.output, + N, + C, + H, + W, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index e69de29bb2..9dd8c03cba 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -0,0 +1,120 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_FWD_3D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolForward3d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_FWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward3d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<5>(deref(params.outputDesc)); + + auto N = deref(params.inputDesc).GetLengths()[0]; + auto C = deref(params.inputDesc).GetLengths()[1]; + auto D = deref(params.inputDesc).GetLengths()[2]; + auto H = deref(params.inputDesc).GetLengths()[3]; + auto W = deref(params.inputDesc).GetLengths()[4]; + auto OD = deref(params.outputDesc).GetLengths()[2]; + auto OH = deref(params.outputDesc).GetLengths()[3]; + auto OW = deref(params.outputDesc).GetLengths()[4]; + + kernel(params.input, + params.output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen From ca4ad974e8392c209814afa2478af48a3bb2bf1c Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 7 Aug 2024 18:17:48 +0700 Subject: [PATCH 04/38] add gtest --- .../miopen/avgpool/problem_description.hpp | 24 +- src/kernels/MIOpenAvgPool.cpp | 550 ++++++++++++++++++ src/solver/avgpool/forward_avgpool_2d.cpp | 13 +- test/cpu_avgpool.hpp | 426 ++++++++++++++ test/gtest/avgpool.cpp | 163 ++++++ test/gtest/avgpool.hpp | 426 ++++++++++++++ 6 files changed, 1588 insertions(+), 14 deletions(-) create mode 100644 test/cpu_avgpool.hpp create mode 100644 test/gtest/avgpool.cpp create mode 100644 test/gtest/avgpool.hpp diff --git a/src/include/miopen/avgpool/problem_description.hpp 
b/src/include/miopen/avgpool/problem_description.hpp index 9400bd67a0..9166762235 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -92,12 +92,16 @@ struct FwdProblemDescription : ProblemDescription outputDesc.GetLengths()[1] != inputDesc.GetLengths()[1] || outputDesc.GetLengths().size() != input_dims) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input and output tensor sizes do not match."); } - if(input_dims != strideDesc.GetElementSize() || - input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + if(input_dims - 2 != strideDesc.GetElementSize() || + input_dims - 2 != paddingDesc.GetElementSize() || + input_dims - 2 != kinforDesc.GetElementSize()) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input tensor sizes and Kernel size or stride " + "or padding do not match."); } return true; @@ -138,12 +142,16 @@ struct BwdProblemDescription : ProblemDescription outputGradDesc.GetLengths()[1] != inputGradDesc.GetLengths()[1] || outputGradDesc.GetLengths().size() != input_dims) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input grad and output grad tensor sizes do not match."); } - if(input_dims != strideDesc.GetElementSize() || - input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + if(input_dims - 2 != strideDesc.GetElementSize() || + input_dims - 2 != paddingDesc.GetElementSize() || + input_dims - 2 != kinforDesc.GetElementSize()) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input grad tensor sizes and Kernel size or stride or padding do " + "not match."); } return true; diff --git 
a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index e69de29bb2..bcbf4f6c60 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -0,0 +1,550 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +#ifndef INPUT_TYPE +#define INPUT_TYPE float +#endif + +#ifndef OUTPUT_TYPE +#define OUTPUT_TYPE float +#endif + +template +__device__ void avgPoolForward2d(const TI* __restrict__ input, + TO* __restrict__ output, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + FLOAT_ACCUM m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += CVT_FLOAT2ACCUM( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - 
hstart) * (wend - wstart); + } + } + FLOAT_ACCUM val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + avgPoolForward2d(input, + output, + N, + C, + H, + W, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + divisor_override, + input_tv, + output_tv); +} + +template +__device__ void avgPoolForward3d(const TI* __restrict__ input, + TO* __restrict__ output, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + FLOAT_ACCUM sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H 
+ h) * W + w; + sum += CVT_FLOAT2ACCUM( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + FLOAT_ACCUM val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + avgPoolForward3d(input, + output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + divisor_override, + input_tv, + output_tv); +} + +template +__device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * 
blockDim.x; + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + FLOAT_ACCUM grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + avgPoolBackward2d(output_grad, + input_grad, + N, + C, + H, + W, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + 
divisor_override, + output_grad_tv, + input_grad_tv); +} + +template +__device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + + FLOAT_ACCUM grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = 
divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + avgPoolBackward3d(output_grad, + input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + divisor_override, + output_grad_tv, + input_grad_tv); +} diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index d0e37b5464..8b444370a0 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -28,6 +28,7 @@ #include "miopen/execution_context.hpp" #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" +#include #include #include @@ -81,12 +82,12 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); - auto N = deref(params.inputDesc).GetLengths()[0]; - auto C = deref(params.inputDesc).GetLengths()[1]; - auto H = deref(params.inputDesc).GetLengths()[2]; - auto W = deref(params.inputDesc).GetLengths()[3]; - auto OH = deref(params.outputDesc).GetLengths()[2]; - auto OW = deref(params.outputDesc).GetLengths()[3]; + 
size_t N = deref(params.inputDesc).GetLengths()[0]; + size_t C = deref(params.inputDesc).GetLengths()[1]; + size_t H = deref(params.inputDesc).GetLengths()[2]; + size_t W = deref(params.inputDesc).GetLengths()[3]; + size_t OH = deref(params.outputDesc).GetLengths()[2]; + size_t OW = deref(params.outputDesc).GetLengths()[3]; kernel(params.input, params.output, diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp new file mode 100644 index 0000000000..40a67a8d7d --- /dev/null +++ b/test/cpu_avgpool.hpp @@ -0,0 +1,426 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_AVGPOOL_HPP +#define GUARD_CPU_AVGPOOL_HPP + +#include "tensor_holder.hpp" +#include + +template +void cpu_avgpool_forward_2d(tensor input, + tensor& output, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + float m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, H); + wend = std::min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend 
- wstart); + } + } + float val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = + static_cast(val); + } +} + +template +void cpu_avgpool_forward_3d(tensor input, + tensor& output, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + float sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = std::min(dstart + KD, D + pd); + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + 
dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, D); + hend = std::min(hend, H); + wend = std::min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + static_cast(val); + } +} + +template +void cpu_avgpool_backward_2d(tensor output_grad, + tensor& input_grad, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + float grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - 
wstart); + + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, H); + wend = std::min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + static_cast(grad); + } +} + +template +void cpu_avgpool_backward_3d(tensor output_grad, + tensor& input_grad, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + + float grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) 
+ continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = std::min(dstart + KD, D + pd); + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, D); + hend = std::min(hend, H); + wend = std::min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + static_cast(grad); + } +} + +#endif diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp new file mode 100644 index 0000000000..1dd5502339 --- /dev/null +++ b/test/gtest/avgpool.cpp @@ -0,0 +1,163 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "avgpool.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace avgpool { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct GPU_Avgpool_fwd_FP32 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_fwd_FP16 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_fwd_BFP16 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_bwd_FP32 : AvgPoolTestBwd +{ +}; + +struct GPU_Avgpool_bwd_FP16 : AvgPoolTestBwd +{ +}; + +struct GPU_Avgpool_bwd_BFP16 : AvgPoolTestBwd +{ +}; + +} // namespace avgpool +using namespace avgpool; + +// FORWARD TEST +TEST_P(GPU_Avgpool_fwd_FP32, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_fwd_FP16, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); + +// // BACKWARD TEST +// TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) +// { +// if(!MIOPEN_TEST_ALL || +// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) +// { +// RunTest(); +// Verify(); 
+// } +// else +// { +// GTEST_SKIP(); +// } +// }; + +// TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) +// { +// if(!MIOPEN_TEST_ALL || +// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) +// { +// RunTest(); +// Verify(); +// } +// else +// { +// GTEST_SKIP(); +// } +// }; + +// TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) +// { +// if(!MIOPEN_TEST_ALL || +// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) +// { +// RunTest(); +// Verify(); +// } +// else +// { +// GTEST_SKIP(); +// } +// }; + +// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); +// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); +// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp new file mode 100644 index 0000000000..23ec4c1726 --- /dev/null +++ b/test/gtest/avgpool.hpp @@ -0,0 +1,426 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "../driver/tensor_driver.hpp" +#include "cpu_avgpool.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include +#include +#include + +template +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct AvgPoolTestCase +{ + std::vector input_dims; + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + + friend std::ostream& operator<<(std::ostream& os, const AvgPoolTestCase& tc) + { + return os << " input_dims:" << tc.input_dims << " kernel_size:" << tc.kernel_size + << " stride:" << tc.stride << " padding:" << tc.padding + << " ceil_mode:" << tc.ceil_mode << " count_include_pad:" << tc.count_include_pad + << " divisor_override:" << tc.divisor_override; + } + + std::vector GetInput() const { return input_dims; } +}; + +inline std::vector AvgPoolTestConfigs() +{ + return { + {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, true, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 1}, 
+ // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 1}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 1}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, true, 1}, + {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 1}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 1}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 1}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 1}, + }; +} + +// FORWARD TEST +template +struct AvgPoolTestFwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + avgpool_config = GetParam(); + auto in_dim = avgpool_config.GetInput(); + N = in_dim[0]; + C = in_dim[1]; + D = in_dim.size() == 5 ? in_dim[2] : 1; + H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; + W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; + ksize = tensor{in_dim.size() - 2}; + ksize.data = avgpool_config.kernel_size; + stride = tensor{in_dim.size() - 2}; + stride.data = avgpool_config.stride; + padding = tensor{in_dim.size() - 2}; + padding.data = avgpool_config.padding; + ceil_mode = avgpool_config.ceil_mode; + count_include_pad = avgpool_config.count_include_pad; + divisor_override = avgpool_config.divisor_override; + + auto gen_input_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + input = tensor{in_dim}.generate(gen_input_value); + + std::vector out_dim; + if(in_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_dim = {N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_dim = {N, C, OH, OW}; + } + + output = tensor{out_dim}; + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + ref_output = tensor{out_dim}; + std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); + + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); + ksize_dev = handle.Write(ksize.data); + stride_dev = handle.Write(stride.data); + padding_dev = handle.Write(padding.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + auto dims = input.desc.GetNumDims(); + if(dims == 4) + { + cpu_avgpool_forward_2d(input, + ref_output, + N, + C, + H, + W, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + else if(dims == 5) + { + cpu_avgpool_forward_3d(input, + ref_output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize, + stride, + 
padding, + count_include_pad, + divisor_override); + } + status = miopen::AvgPoolForward(handle, + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + stride.desc, + stride_dev.get(), + padding.desc, + padding_dev.get(), + ksize.desc, + ksize_dev.get(), + count_include_pad, + divisor_override); + fflush(stdout); + + ASSERT_EQ(status, miopenStatusSuccess); + + output.data = handle.Read(output_dev, output.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + + auto error = miopen::rms_range(ref_output, output); + + ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); + for(int i = 0; i < 10; ++i) + { + std::cout << "output cpu: " << ref_output[i] << " output gpu: " << output[i] + << std::endl; + } + EXPECT_LT(error, threshold * 10); + } + AvgPoolTestCase avgpool_config; + + tensor input; + tensor output; + tensor ref_output; + tensor ksize; + tensor stride; + tensor padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; + miopen::Allocator::ManageDataPtr ksize_dev; + miopen::Allocator::ManageDataPtr stride_dev; + miopen::Allocator::ManageDataPtr padding_dev; +}; + +// BACKWARD TEST +template +struct AvgPoolTestBwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + avgpool_config = GetParam(); + auto in_grad_dim = avgpool_config.GetInput(); + N = in_grad_dim[0]; + C = in_grad_dim[1]; + D = in_grad_dim.size() == 5 ? in_grad_dim[2] : 1; + H = in_grad_dim.size() == 5 ? in_grad_dim[3] : in_grad_dim[2]; + W = in_grad_dim.size() == 5 ? 
in_grad_dim[4] : in_grad_dim[3]; + ksize = tensor{in_grad_dim.size() - 2}; + ksize.data = avgpool_config.kernel_size; + stride = tensor{in_grad_dim.size() - 2}; + stride.data = avgpool_config.stride; + padding = tensor{in_grad_dim.size() - 2}; + padding.data = avgpool_config.padding; + ceil_mode = avgpool_config.ceil_mode; + count_include_pad = avgpool_config.count_include_pad; + divisor_override = avgpool_config.divisor_override; + + std::vector out_grad_dim; + if(in_grad_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_grad_dim = {N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_grad_dim = {N, C, OH, OW}; + } + auto gen_output_grad_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + output_grad = tensor{out_grad_dim}.generate(gen_output_grad_value); + + input_grad = tensor{in_grad_dim}; + std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits::quiet_NaN()); + + ref_input_grad = tensor{in_grad_dim}; + std::fill( + ref_input_grad.begin(), ref_input_grad.end(), std::numeric_limits::quiet_NaN()); + + output_grad_dev = handle.Write(output_grad.data); + input_grad_dev = handle.Write(input_grad.data); + ksize_dev = handle.Write(ksize.data); + stride_dev = handle.Write(stride.data); + padding_dev = handle.Write(padding.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + auto dims = input_grad.desc.GetNumDims(); + if(dims == 4) + { + cpu_avgpool_backward_2d(output_grad, + ref_input_grad, + N, + C, + H, + W, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + else if(dims == 5) + { + cpu_avgpool_backward_3d(output_grad, + ref_input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + status = miopen::AvgPoolBackward(handle, + output_grad.desc, + output_grad_dev.get(), + input_grad.desc, + input_grad_dev.get(), + stride.desc, + stride_dev.get(), + padding.desc, + padding_dev.get(), + ksize.desc, + ksize_dev.get(), + count_include_pad, + divisor_override); + + ASSERT_EQ(status, miopenStatusSuccess); + + input_grad.data = handle.Read(input_grad_dev, input_grad.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + auto error = miopen::rms_range(ref_input_grad, input_grad); + ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); + EXPECT_LT(error, threshold * 10); + } + AvgPoolTestCase avgpool_config; + + tensor output_grad; + tensor input_grad; + tensor ref_input_grad; + tensor ksize; + tensor stride; + tensor padding; + + bool ceil_mode; + 
bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr output_grad_dev; + miopen::Allocator::ManageDataPtr input_grad_dev; + miopen::Allocator::ManageDataPtr ksize_dev; + miopen::Allocator::ManageDataPtr stride_dev; + miopen::Allocator::ManageDataPtr padding_dev; +}; From 0492fc71c714c320b7d0d53f67030ba8e3fe2a90 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 12 Aug 2024 10:21:50 +0700 Subject: [PATCH 05/38] add driver test --- driver/CMakeLists.txt | 1 + driver/avgpool_driver.hpp | 596 +++++++++++++++++++++ driver/dm_avgpool.cpp | 40 ++ driver/driver.hpp | 5 +- driver/mloAvgPoolHost.hpp | 438 +++++++++++++++ src/kernels/MIOpenAvgPool.cpp | 118 ++-- src/solver/avgpool/backward_avgpool_2d.cpp | 5 + src/solver/avgpool/backward_avgpool_3d.cpp | 5 + src/solver/avgpool/forward_avgpool_2d.cpp | 4 + src/solver/avgpool/forward_avgpool_3d.cpp | 4 + test/cpu_avgpool.hpp | 116 ++-- test/gtest/avgpool.cpp | 92 ++-- test/gtest/avgpool.hpp | 6 - 13 files changed, 1259 insertions(+), 171 deletions(-) create mode 100644 driver/avgpool_driver.hpp create mode 100644 driver/dm_avgpool.cpp create mode 100644 driver/mloAvgPoolHost.hpp diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index cd663eb8b4..385580e2e1 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -32,6 +32,7 @@ add_executable(MIOpenDriver dm_activ.cpp dm_adam.cpp dm_addlayernorm.cpp + dm_avgpool.cpp dm_bnorm.cpp dm_cat.cpp dm_conv.cpp diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp new file mode 100644 index 0000000000..38beba92f1 --- /dev/null +++ b/driver/avgpool_driver.hpp @@ -0,0 +1,596 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_AVGPOOL_DRIVER_HPP +#define GUARD_MIOPEN_AVGPOOL_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "mloAvgPoolHost.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#include +#include +#include +#include +#include + +template +class AvgPoolDriver : public Driver +{ +public: + AvgPoolDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&inputGradDesc); + miopenCreateTensorDescriptor(&outputGradDesc); + miopenCreateTensorDescriptor(&ksizeDesc); + miopenCreateTensorDescriptor(&strideDesc); + miopenCreateTensorDescriptor(&paddingDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + std::vector GetInputTensorDimsFromCmd(const char* param); + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~AvgPoolDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(inputGradDesc); + miopenDestroyTensorDescriptor(outputGradDesc); + miopenDestroyTensorDescriptor(ksizeDesc); + miopenDestroyTensorDescriptor(strideDesc); + miopenDestroyTensorDescriptor(paddingDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t inputGradDesc; + miopenTensorDescriptor_t outputGradDesc; + miopenTensorDescriptor_t 
ksizeDesc; + miopenTensorDescriptor_t strideDesc; + miopenTensorDescriptor_t paddingDesc; + + std::unique_ptr input_dev; + std::unique_ptr output_dev; + std::unique_ptr input_grad_dev; + std::unique_ptr output_grad_dev; + std::unique_ptr ksize_dev; + std::unique_ptr stride_dev; + std::unique_ptr padding_dev; + + std::vector input; + std::vector output; + std::vector output_host; + std::vector input_grad; + std::vector input_grad_host; + std::vector output_grad; + std::vector ksize; + std::vector stride; + std::vector padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + std::vector in_dim; +}; + +template +int AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) +{ + std::string lengthsStr = inflags.GetValueStr(param); + + std::vector lengths; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = lengthsStr.find(',', pos); + while(new_pos != std::string::npos) + { + std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); + + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + return (lengths); +} + +template +int AvgPoolDriver::GetandSetData() +{ + in_dim = GetInputTensorDimsFromCmd("input_dims"); + std::vector ksp_dim = {in_dim.size() - 2}; + ksize = GetInputTensorDimsFromCmd("kernel_size"); + stride = GetInputTensorDimsFromCmd("stride"); + padding = GetInputTensorDimsFromCmd("padding"); + + if(ksize.size() != ksp_dim[0]) + { + int ref = ksp_dim[0] - ksize.size(); + while(ref--) + ksize.push_back(1); + } + if(stride.size() != ksp_dim[0]) + { + int ref = 
ksp_dim[0] - ksize.size(); + while(ref--) + stride.push_back(1); + } + if(padding.size() != ksp_dim[0]) + { + int ref = ksp_dim[0] - ksize.size(); + while(ref--) + padding.push_back(0); + } + + ceil_mode = static_cast(inflags.GetValueInt("ceil_mode")); + count_include_pad = static_cast(inflags.GetValueInt("count_include_pad")); + divisor_override = inflags.GetValueInt("divisor_override"); + + N = in_dim[0]; + C = in_dim[1]; + D = in_dim.size() == 5 ? in_dim[2] : 1; + H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; + W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; + + std::vector out_dim; + if(in_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_dim = std::vector{N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_dim = std::vector{N, C, OH, OW}; + } + SetTensorNd(inputDesc, in_dim, data_type); + SetTensorNd(outputDesc, out_dim, data_type); + SetTensorNd(outputGradDesc, out_dim, data_type); + SetTensorNd(inputGradDesc, in_dim, data_type); + SetTensorNd(ksizeDesc, ksp_dim, miopen_type{}); + SetTensorNd(strideDesc, ksp_dim, miopen_type{}); + SetTensorNd(paddingDesc, ksp_dim, miopen_type{}); + + return miopenStatusSuccess; +} + +template +int 
AvgPoolDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AvgPool (Default=1)", "int"); + inflags.AddInputFlag( + "input_dims", + 'D', + "2,3,7,9", + "The dimensional lengths of the input tensor: N,C,D1,D2,... Example: 2,3,7,9.", + "string"); + inflags.AddInputFlag( + "kernel_size", 'k', "1,1", "The size of the window D1,D2,... Example: 1,1.", "string"); + inflags.AddInputFlag( + "stride", + 's', + "1,1", + "The stride of the window. Default value is kernel_size D1,D2,... Example: 1,1.", + "string"); + inflags.AddInputFlag("padding", + 'p', + "0,0", + "Implicit zero padding to be added on both sides D1,D2,... Example: 0,0.", + "string"); + inflags.AddInputFlag("ceil_mode", + 'c', + "1", + "When 1, will use ceil instead of floor to compute the output shape.", + "int"); + inflags.AddInputFlag("count_include_pad", + 'P', + "0", + "When 1, will include the zero-padding in the averaging calculation.", + "int"); + inflags.AddInputFlag("divisor_override", + 'd', + "0", + "If specified, it will be used as divisor, otherwise size of the pooling " + "region will be used.", + "int"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); + size_t ksize_sz = GetTensorSize(ksizeDesc); + size_t stride_sz = GetTensorSize(strideDesc); + size_t padding_sz = GetTensorSize(paddingDesc); + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + input_grad_dev = 
std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + ksize_dev = std::unique_ptr(new GPUMem(ctx, ksize_sz, sizeof(int32_t))); + stride_dev = std::unique_ptr(new GPUMem(ctx, stride_sz, sizeof(int32_t))); + padding_dev = std::unique_ptr(new GPUMem(ctx, padding_sz, sizeof(int32_t))); + + input = std::vector(input_sz, static_cast(0)); + output = std::vector(output_sz, static_cast(0)); + output_host = std::vector(output_sz, static_cast(0)); + + input_grad = std::vector(input_sz, static_cast(0)); + input_grad_host = std::vector(input_sz, static_cast(0)); + output_grad = std::vector(output_sz, static_cast(0)); + + int status; + + for(int i = 0; i < input_sz; i++) + { + input[i] = prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + } + status = input_dev->ToGPU(q, input.data()); + + status |= output_dev->ToGPU(q, output.data()); + + status |= input_grad_dev->ToGPU(q, input_grad.data()); + + for(int i = 0; i < output_sz; i++) + { + output_grad[i] = prng::gen_A_to_B(static_cast(-1.0), static_cast(1.0)); + } + status |= output_grad_dev->ToGPU(q, output_grad.data()); + + status |= ksize_dev->ToGPU(q, ksize.data()); + + status |= stride_dev->ToGPU(q, stride.data()); + + status |= padding_dev->ToGPU(q, padding.data()); + + if(status != 0) + std::cout << "Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAvgPoolForward(GetHandle(), + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + strideDesc, + stride_dev->GetMem(), + paddingDesc, + padding_dev->GetMem(), + ksizeDesc, + ksize_dev->GetMem(), + count_include_pad, + divisor_override); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + 
if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Forward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward AvgPool Elapsed: %f ms\n", kernel_average_time); + } + + output_dev->FromGPU(GetStream(), output.data()); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunForwardCPU() +{ + if(in_dim.size() == 4) + { + mloAvgPoolForward2dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + else if(in_dim.size() == 5) + { + mloAvgPoolForward3dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunBackwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAvgPoolBackward(GetHandle(), + outputGradDesc, + output_grad_dev->GetMem(), + inputGradDesc, + input_grad_dev->GetMem(), + strideDesc, + stride_dev->GetMem(), + paddingDesc, + padding_dev->GetMem(), + ksizeDesc, + ksize_dev->GetMem(), + count_include_pad, + divisor_override); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Backward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time 
= + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Backward AvgPool Elapsed: %f ms\n", kernel_average_time); + } + + input_grad_dev->FromGPU(GetStream(), input_grad.data()); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunBackwardCPU() +{ + if(in_dim.size() == 4) + { + mloAvgPoolBackward2dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + else if(in_dim.size() == 5) + { + mloAvgPoolBackward3dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + return miopenStatusSuccess; +} + +template +Tref AvgPoolDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int AvgPoolDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(output_host, output); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward AvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Forward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(input_grad_host, input_grad); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Backward AvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Backward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_AVGPOOL_DRIVER_HPP diff --git a/driver/dm_avgpool.cpp b/driver/dm_avgpool.cpp new file mode 100644 index 0000000000..ec0e457056 --- /dev/null +++ b/driver/dm_avgpool.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "avgpool_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "avgpool") + return new AvgPoolDriver(); + if(base_arg == "avgpoolfp16") + return new AvgPoolDriver(); + if(base_arg == "avgpoolbfp16") + return new AvgPoolDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index b23df690d1..bd42f6ee13 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], avgpool[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -206,7 +206,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && - arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "--version") + arg != 
"reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "avgpool" && + arg != "avgpoolfp16" && arg != "avgpoolbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp new file mode 100644 index 0000000000..ad55c53c66 --- /dev/null +++ b/driver/mloAvgPoolHost.hpp @@ -0,0 +1,438 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MLO_AVGPOOLHOST_H_ +#define MLO_AVGPOOLHOST_H_ + +#include +#include + +template +int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return 0; + + float m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { 
+ if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + float val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = + static_cast(val); + } + return 0; +} + +template +int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return 0; + float sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + 
int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + static_cast(val); + } + return 0; +} + +template +int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return 0; + + float grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + 
if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + static_cast(grad); + } + return 0; +} + +template +int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = 
kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return 0; + + float grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + static_cast(grad); + } + return 0; +} + +#endif // MLO_AVGPOOLHOST_H_ diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index bcbf4f6c60..f4a9e95ce1 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -43,15 +43,15 @@ template __device__ void 
avgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, - int32_t* kinfor, - int32_t* stride, - int32_t* padding, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* __restrict__ kinfor, + const int32_t* __restrict__ stride, + const int32_t* __restrict__ padding, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> input_tv, @@ -124,12 +124,12 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -158,14 +158,14 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input template __device__ void avgPoolForward3d(const TI* __restrict__ input, TO* __restrict__ output, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -252,14 +252,14 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -290,12 +290,12 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input template __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, TO* __restrict__ 
input_grad, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -376,12 +376,12 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -410,14 +410,14 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp template __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, TO* __restrict__ input_grad, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -514,14 +514,14 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index 10c9479b0c..b677192b36 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -46,6 +46,11 @@ namespace avgpool { bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { + 
if(problem.GetInputGradDesc().GetNumDims() != 4 || + problem.GetOutputGradDesc().GetNumDims() != 4) + { + return false; + } return true; } diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index b960554348..829511d8cb 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -46,6 +46,11 @@ namespace avgpool { bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { + if(problem.GetInputGradDesc().GetNumDims() != 5 || + problem.GetOutputGradDesc().GetNumDims() != 5) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 8b444370a0..6ddef062da 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -47,6 +47,10 @@ namespace avgpool { bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { + if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index 9dd8c03cba..c1ee497b27 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -46,6 +46,10 @@ namespace avgpool { bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { + if(problem.GetInputDesc().GetNumDims() != 5 || problem.GetOutputDesc().GetNumDims() != 5) + { + return false; + } return true; } diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp index 40a67a8d7d..ef26e17d74 100644 --- a/test/cpu_avgpool.hpp +++ b/test/cpu_avgpool.hpp @@ -32,12 +32,12 @@ template void cpu_avgpool_forward_2d(tensor input, tensor& output, - int32_t N, 
- int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -85,15 +85,15 @@ void cpu_avgpool_forward_2d(tensor input, int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (hend - hstart) * (wend - wstart); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, H); - wend = std::min(wend, W); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) @@ -121,14 +121,14 @@ void cpu_avgpool_forward_2d(tensor input, template void cpu_avgpool_forward_3d(tensor input, tensor& output, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -185,17 +185,17 @@ void cpu_avgpool_forward_3d(tensor input, int32_t dstart = od * sd - pd; int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t dend = std::min(dstart + KD, D + pd); - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = std::max(dstart, 0); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - dend = std::min(dend, D); - hend = std::min(hend, H); - wend = std::min(wend, W); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = 
min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) @@ -222,12 +222,12 @@ void cpu_avgpool_forward_3d(tensor input, template void cpu_avgpool_backward_2d(tensor output_grad, tensor& input_grad, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -275,15 +275,15 @@ void cpu_avgpool_backward_2d(tensor output_grad, int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (hend - hstart) * (wend - wstart); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, H); - wend = std::min(wend, W); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) @@ -315,14 +315,14 @@ void cpu_avgpool_backward_2d(tensor output_grad, template void cpu_avgpool_backward_3d(tensor output_grad, tensor& input_grad, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -385,17 +385,17 @@ void cpu_avgpool_backward_3d(tensor output_grad, int32_t dstart = od * sd - pd; int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t dend = std::min(dstart + KD, D + pd); - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = 
std::max(dstart, 0); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - dend = std::min(dend, D); - hend = std::min(hend, H); - wend = std::min(wend, W); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) { diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp index 1dd5502339..fa002e5610 100644 --- a/test/gtest/avgpool.cpp +++ b/test/gtest/avgpool.cpp @@ -115,49 +115,49 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP32, testing::ValuesIn(AvgPoolT INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); -// // BACKWARD TEST -// TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) -// { -// if(!MIOPEN_TEST_ALL || -// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) -// { -// RunTest(); -// Verify(); -// } -// else -// { -// GTEST_SKIP(); -// } -// }; - -// TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) -// { -// if(!MIOPEN_TEST_ALL || -// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) -// { -// RunTest(); -// Verify(); -// } -// else -// { -// GTEST_SKIP(); -// } -// }; - -// TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) -// { -// if(!MIOPEN_TEST_ALL || -// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) -// { -// RunTest(); -// Verify(); -// } -// else -// { -// GTEST_SKIP(); -// } -// }; - -// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); -// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); -// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); +// BACKWARD TEST +TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) +{ + 
if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index 23ec4c1726..26548e0a12 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -221,7 +221,6 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam count_include_pad, divisor_override); fflush(stdout); - ASSERT_EQ(status, miopenStatusSuccess); output.data = handle.Read(output_dev, output.data.size()); @@ -234,11 +233,6 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam auto error = miopen::rms_range(ref_output, output); ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); - for(int i = 0; i < 10; ++i) - { - std::cout << "output cpu: " << ref_output[i] << " output gpu: " << output[i] - << std::endl; - } EXPECT_LT(error, threshold * 10); } AvgPoolTestCase avgpool_config; From 881e79671935b7cbc6a05ba2cf61ad8749927305 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 16 Aug 2024 11:49:15 +0700 Subject: [PATCH 06/38] change kinfor to ksize --- driver/mloAvgPoolHost.hpp | 28 ++++----- include/miopen/miopen.h | 16 ++--- src/avgpool.cpp | 20 
+++--- src/avgpool_api.cpp | 24 ++++---- src/include/miopen/avgpool.hpp | 8 +-- src/include/miopen/avgpool/invoke_params.hpp | 8 +-- src/kernels/MIOpenAvgPool.cpp | 65 +++++++++++++------- src/solver/avgpool/backward_avgpool_2d.cpp | 38 +++++++++++- src/solver/avgpool/backward_avgpool_3d.cpp | 4 +- src/solver/avgpool/forward_avgpool_2d.cpp | 40 +++++++++++- src/solver/avgpool/forward_avgpool_3d.cpp | 4 +- test/cpu_avgpool.hpp | 28 ++++----- 12 files changed, 187 insertions(+), 96 deletions(-) diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp index ad55c53c66..6980ce968e 100644 --- a/driver/mloAvgPoolHost.hpp +++ b/driver/mloAvgPoolHost.hpp @@ -40,7 +40,7 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, size_t W, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -57,8 +57,8 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -134,7 +134,7 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, size_t OD, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -152,9 +152,9 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -236,7 +236,7 @@ 
int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes size_t W, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -253,8 +253,8 @@ int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -334,7 +334,7 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes size_t OD, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -352,9 +352,9 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index fda8817e3a..18b0bcafdf 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7640,8 +7640,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, * @param stride Data tensor stride (output) * @param paddingDesc Tensor descriptor for padding tensor (input) * @param padding Data tensor padding (output) - * @param kinforDesc Tensor descriptor for kinfor tensor (input) - * @param kinfor Data tensor kinfor (output) + * @param ksizeDesc Tensor descriptor for ksize tensor (input) + * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the 
averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7657,8 +7657,8 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* ksize, const bool count_include_pad, const int32_t divisor_override); @@ -7673,8 +7673,8 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, * @param stride Data tensor stride (output) * @param paddingDesc Tensor descriptor for padding tensor (input) * @param padding Data tensor padding (output) - * @param kinforDesc Tensor descriptor for kinfor tensor (input) - * @param kinfor Data tensor kinfor (output) + * @param ksizeDesc Tensor descriptor for ksize tensor (input) + * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7690,8 +7690,8 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* ksize, const bool count_include_pad, const int32_t divisor_override); /** @} */ diff --git a/src/avgpool.cpp b/src/avgpool.cpp index 15bea1f9d8..87ff481c6a 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -42,8 +42,8 @@ miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, const bool count_include_pad, const int32_t divisor_override) 
{ @@ -51,7 +51,7 @@ miopenStatus_t AvgPoolForward(Handle& handle, outputDesc, strideDesc, paddingDesc, - kinforDesc, + ksizeDesc, count_include_pad, divisor_override}; @@ -61,13 +61,13 @@ miopenStatus_t AvgPoolForward(Handle& handle, tmp.outputDesc = &outputDesc; tmp.strideDesc = &strideDesc; tmp.paddingDesc = &paddingDesc; - tmp.kinforDesc = &kinforDesc; + tmp.ksizeDesc = &ksizeDesc; tmp.input = input; tmp.output = output; tmp.stride = stride; tmp.padding = padding; - tmp.kinfor = kinfor; + tmp.ksize = ksize; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; @@ -91,8 +91,8 @@ miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, const bool count_include_pad, const int32_t divisor_override) { @@ -100,7 +100,7 @@ miopenStatus_t AvgPoolBackward(Handle& handle, inputGradDesc, strideDesc, paddingDesc, - kinforDesc, + ksizeDesc, count_include_pad, divisor_override}; @@ -110,13 +110,13 @@ miopenStatus_t AvgPoolBackward(Handle& handle, tmp.inputGradDesc = &inputGradDesc; tmp.strideDesc = &strideDesc; tmp.paddingDesc = &paddingDesc; - tmp.kinforDesc = &kinforDesc; + tmp.ksizeDesc = &ksizeDesc; tmp.output_grad = output_grad; tmp.input_grad = input_grad; tmp.stride = stride; tmp.padding = padding; - tmp.kinfor = kinfor; + tmp.ksize = ksize; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index 4e62bd5e7b..fa2e8a957c 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -88,8 +88,8 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* 
ksize, const bool count_include_pad, const int32_t divisor_override) { @@ -102,8 +102,8 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, stride, paddingDesc, padding, - kinforDesc, - kinfor, + ksizeDesc, + ksize, count_include_pad, divisor_override); @@ -118,8 +118,8 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, DataCast(stride), miopen::deref(paddingDesc), DataCast(padding), - miopen::deref(kinforDesc), - DataCast(kinfor), + miopen::deref(ksizeDesc), + DataCast(ksize), count_include_pad, divisor_override); }); @@ -134,8 +134,8 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* ksize, const bool count_include_pad, const int32_t divisor_override) { @@ -148,8 +148,8 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, stride, paddingDesc, padding, - kinforDesc, - kinfor, + ksizeDesc, + ksize, count_include_pad, divisor_override); @@ -164,8 +164,8 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, DataCast(stride), miopen::deref(paddingDesc), DataCast(padding), - miopen::deref(kinforDesc), - DataCast(kinfor), + miopen::deref(ksizeDesc), + DataCast(ksize), count_include_pad, divisor_override); }); diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp index 617ed56782..9210e45e3a 100644 --- a/src/include/miopen/avgpool.hpp +++ b/src/include/miopen/avgpool.hpp @@ -43,8 +43,8 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, bool count_include_pad, int32_t divisor_override); @@ -57,8 +57,8 @@ 
MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, bool count_include_pad, int32_t divisor_override); } // namespace miopen diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index b57f8e0edc..91a70725ee 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -42,13 +42,13 @@ struct FwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* outputDesc = nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinforDesc = nullptr; + const TensorDescriptor* ksizeDesc = nullptr; ConstData_t input = nullptr; Data_t output = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfor = nullptr; + ConstData_t ksize = nullptr; bool count_include_pad = false; int32_t divisor_override = 0; @@ -66,13 +66,13 @@ struct BwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* inputGradDesc = nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinforDesc = nullptr; + const TensorDescriptor* ksizeDesc = nullptr; ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfor = nullptr; + ConstData_t ksize = nullptr; bool count_include_pad = false; int32_t divisor_override = 0; diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index f4a9e95ce1..6d94bffac1 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -40,6 +40,27 @@ #define OUTPUT_TYPE float #endif +// template +// struct blockNd +// { +// T val[Nd]; +// }; + +// template 
+// __device__ void avgPoolForwardNdNew(const TI* __restrict__ input, +// TO* __restrict__ output, +// size_t N, +// size_t C, +// const blockNd sizeIn, +// const blockNd sizeOut, +// const blockNd ksize, +// const blockNd stride, +// const blockNd padding, +// bool count_include_pad, +// int32_t divisor_override, +// tensor_view_t input_tv, +// tensor_view_t output_tv); + template __device__ void avgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, @@ -49,7 +70,7 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, size_t W, size_t OH, size_t OW, - const int32_t* __restrict__ kinfor, + const int32_t* __restrict__ ksize, const int32_t* __restrict__ stride, const int32_t* __restrict__ padding, bool count_include_pad, @@ -61,8 +82,8 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -130,7 +151,7 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input size_t W, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -146,7 +167,7 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input W, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, @@ -166,7 +187,7 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -179,9 +200,9 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = 
kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -260,7 +281,7 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -278,7 +299,7 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input OD, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, @@ -296,7 +317,7 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, size_t W, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -308,8 +329,8 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -382,7 +403,7 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp size_t W, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -398,7 +419,7 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp W, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, @@ -418,7 +439,7 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -431,9 +452,9 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - 
int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -522,7 +543,7 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -540,7 +561,7 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp OD, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index b677192b36..4fe9d5bc76 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_BWD_2D 1024 +#define LOCAL_SIZE_BWD_2D 256 namespace miopen { @@ -43,6 +43,36 @@ namespace solver { namespace avgpool { +bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + return false; + } + else if(dtype == miopenHalf) + { + if(in_over_out < 2 && in_nelems >= 11075584) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + { + return true; + } + } + return false; +} + bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { @@ -51,6 +81,10 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, { return false; } + if(!IsOverRocm(problem)) + { + return 
false; + } return true; } @@ -101,7 +135,7 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 829511d8cb..6897097955 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_BWD_3D 1024 +#define LOCAL_SIZE_BWD_3D 256 namespace miopen { @@ -105,7 +105,7 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 6ddef062da..3e70264097 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -36,7 +36,7 @@ #include #include -#define LOCAL_SIZE_FWD_2D 1024 +#define LOCAL_SIZE_FWD_2D 256 namespace miopen { @@ -44,6 +44,38 @@ namespace solver { namespace avgpool { +bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +{ + auto dtype = problem.GetOutputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) + { + return true; + } + } + else if(dtype == miopenHalf) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 6000000) + { + return true; + } + } + return false; +} + bool 
AvgPoolForward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { @@ -51,6 +83,10 @@ bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, { return false; } + if(!IsOverRocm(problem)) + { + return false; + } return true; } @@ -101,7 +137,7 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index c1ee497b27..088aac6dca 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_FWD_3D 1024 +#define LOCAL_SIZE_FWD_3D 256 namespace miopen { @@ -104,7 +104,7 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp index ef26e17d74..5b91033633 100644 --- a/test/cpu_avgpool.hpp +++ b/test/cpu_avgpool.hpp @@ -38,7 +38,7 @@ void cpu_avgpool_forward_2d(tensor input, size_t W, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -55,8 +55,8 @@ void cpu_avgpool_forward_2d(tensor input, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -129,7 +129,7 @@ void cpu_avgpool_forward_3d(tensor input, size_t OD, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -147,9 +147,9 @@ void cpu_avgpool_forward_3d(tensor input, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / 
OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -228,7 +228,7 @@ void cpu_avgpool_backward_2d(tensor output_grad, size_t W, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -245,8 +245,8 @@ void cpu_avgpool_backward_2d(tensor output_grad, int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -323,7 +323,7 @@ void cpu_avgpool_backward_3d(tensor output_grad, size_t OD, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -341,9 +341,9 @@ void cpu_avgpool_backward_3d(tensor output_grad, int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; From 36128975121554bdd9336656f7781ddee410605f Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 19 Aug 2024 16:57:51 +0700 Subject: [PATCH 07/38] change params --- driver/avgpool_driver.hpp | 95 ++++----- include/miopen/miopen.h | 42 ++-- src/avgpool.cpp | 84 ++++---- src/avgpool_api.cpp | 90 +++++---- src/include/miopen/avgpool.hpp | 30 +-- src/include/miopen/avgpool/invoke_params.hpp | 42 ++-- .../miopen/avgpool/problem_description.hpp | 44 +---- src/kernels/MIOpenAvgPool.cpp | 183 +++++++++--------- src/solver/avgpool/backward_avgpool_2d.cpp | 17 +- src/solver/avgpool/backward_avgpool_3d.cpp | 52 
++++- src/solver/avgpool/forward_avgpool_2d.cpp | 17 +- src/solver/avgpool/forward_avgpool_3d.cpp | 48 ++++- 12 files changed, 409 insertions(+), 335 deletions(-) diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index 38beba92f1..ff7d04edd5 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -52,9 +52,6 @@ class AvgPoolDriver : public Driver miopenCreateTensorDescriptor(&outputDesc); miopenCreateTensorDescriptor(&inputGradDesc); miopenCreateTensorDescriptor(&outputGradDesc); - miopenCreateTensorDescriptor(&ksizeDesc); - miopenCreateTensorDescriptor(&strideDesc); - miopenCreateTensorDescriptor(&paddingDesc); data_type = miopen_type{}; } @@ -83,9 +80,6 @@ class AvgPoolDriver : public Driver miopenDestroyTensorDescriptor(outputDesc); miopenDestroyTensorDescriptor(inputGradDesc); miopenDestroyTensorDescriptor(outputGradDesc); - miopenDestroyTensorDescriptor(ksizeDesc); - miopenDestroyTensorDescriptor(strideDesc); - miopenDestroyTensorDescriptor(paddingDesc); } private: @@ -97,17 +91,11 @@ class AvgPoolDriver : public Driver miopenTensorDescriptor_t outputDesc; miopenTensorDescriptor_t inputGradDesc; miopenTensorDescriptor_t outputGradDesc; - miopenTensorDescriptor_t ksizeDesc; - miopenTensorDescriptor_t strideDesc; - miopenTensorDescriptor_t paddingDesc; std::unique_ptr input_dev; std::unique_ptr output_dev; std::unique_ptr input_grad_dev; std::unique_ptr output_grad_dev; - std::unique_ptr ksize_dev; - std::unique_ptr stride_dev; - std::unique_ptr padding_dev; std::vector input; std::vector output; @@ -172,29 +160,29 @@ std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char template int AvgPoolDriver::GetandSetData() { - in_dim = GetInputTensorDimsFromCmd("input_dims"); - std::vector ksp_dim = {in_dim.size() - 2}; - ksize = GetInputTensorDimsFromCmd("kernel_size"); - stride = GetInputTensorDimsFromCmd("stride"); - padding = GetInputTensorDimsFromCmd("padding"); + in_dim = GetInputTensorDimsFromCmd("input_dims"); + int 
ksp_dim = in_dim.size() - 2; + ksize = GetInputTensorDimsFromCmd("kernel_size"); + stride = GetInputTensorDimsFromCmd("stride"); + padding = GetInputTensorDimsFromCmd("padding"); - if(ksize.size() != ksp_dim[0]) + if(ksize.size() != ksp_dim) { - int ref = ksp_dim[0] - ksize.size(); - while(ref--) - ksize.push_back(1); + int ref = ksp_dim - ksize.size(); + while((ref--) != 0) + ksize.push_back(ksize[0]); } - if(stride.size() != ksp_dim[0]) + if(stride.size() != ksp_dim) { - int ref = ksp_dim[0] - ksize.size(); - while(ref--) - stride.push_back(1); + int ref = ksp_dim - stride.size(); + while((ref--) != 0) + stride.push_back(stride[0]); } - if(padding.size() != ksp_dim[0]) + if(padding.size() != ksp_dim) { - int ref = ksp_dim[0] - ksize.size(); - while(ref--) - padding.push_back(0); + int ref = ksp_dim - padding.size(); + while((ref--) != 0) + padding.push_back(padding[0]); } ceil_mode = static_cast(inflags.GetValueInt("ceil_mode")); @@ -242,9 +230,6 @@ int AvgPoolDriver::GetandSetData() SetTensorNd(outputDesc, out_dim, data_type); SetTensorNd(outputGradDesc, out_dim, data_type); SetTensorNd(inputGradDesc, in_dim, data_type); - SetTensorNd(ksizeDesc, ksp_dim, miopen_type{}); - SetTensorNd(strideDesc, ksp_dim, miopen_type{}); - SetTensorNd(paddingDesc, ksp_dim, miopen_type{}); return miopenStatusSuccess; } @@ -301,11 +286,8 @@ int AvgPoolDriver::AddCmdLineArgs() template int AvgPoolDriver::AllocateBuffersAndCopy() { - size_t input_sz = GetTensorSize(inputDesc); - size_t output_sz = GetTensorSize(outputDesc); - size_t ksize_sz = GetTensorSize(ksizeDesc); - size_t stride_sz = GetTensorSize(strideDesc); - size_t padding_sz = GetTensorSize(paddingDesc); + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); uint32_t ctx = 0; @@ -313,9 +295,6 @@ int AvgPoolDriver::AllocateBuffersAndCopy() output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); input_grad_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); 
output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); - ksize_dev = std::unique_ptr(new GPUMem(ctx, ksize_sz, sizeof(int32_t))); - stride_dev = std::unique_ptr(new GPUMem(ctx, stride_sz, sizeof(int32_t))); - padding_dev = std::unique_ptr(new GPUMem(ctx, padding_sz, sizeof(int32_t))); input = std::vector(input_sz, static_cast(0)); output = std::vector(output_sz, static_cast(0)); @@ -343,12 +322,6 @@ int AvgPoolDriver::AllocateBuffersAndCopy() } status |= output_grad_dev->ToGPU(q, output_grad.data()); - status |= ksize_dev->ToGPU(q, ksize.data()); - - status |= stride_dev->ToGPU(q, stride.data()); - - status |= padding_dev->ToGPU(q, padding.data()); - if(status != 0) std::cout << "Error copying data to GPU\n" << std::endl; @@ -371,12 +344,15 @@ int AvgPoolDriver::RunForwardGPU() input_dev->GetMem(), outputDesc, output_dev->GetMem(), - strideDesc, - stride_dev->GetMem(), - paddingDesc, - padding_dev->GetMem(), - ksizeDesc, - ksize_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? padding[1] : padding[0], + padding.size() == 3 ? padding[2] : padding[1], count_include_pad, divisor_override); @@ -464,12 +440,15 @@ int AvgPoolDriver::RunBackwardGPU() output_grad_dev->GetMem(), inputGradDesc, input_grad_dev->GetMem(), - strideDesc, - stride_dev->GetMem(), - paddingDesc, - padding_dev->GetMem(), - ksizeDesc, - ksize_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? 
padding[1] : padding[0], + padding.size() == 3 ? padding[2] : padding[1], count_include_pad, divisor_override); diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 18b0bcafdf..ea44de92d5 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7636,12 +7636,6 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, * @param input Data tensor input (input) * @param outputDesc Tensor descriptor for output tensor (input) * @param output Data tensor output (output) - * @param strideDesc Tensor descriptor for stride tensor (input) - * @param stride Data tensor stride (output) - * @param paddingDesc Tensor descriptor for padding tensor (input) - * @param padding Data tensor padding (output) - * @param ksizeDesc Tensor descriptor for ksize tensor (input) - * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7653,12 +7647,15 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* input, const miopenTensorDescriptor_t outputDesc, void* output, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override); @@ -7669,12 +7666,6 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, * @param output_grad Data tensor output grad (input) * @param inputGradDesc Tensor descriptor for input grad tensor (input) * @param input_grad Data tensor input grad (output) - * @param strideDesc Tensor descriptor for stride tensor 
(input) - * @param stride Data tensor stride (output) - * @param paddingDesc Tensor descriptor for padding tensor (input) - * @param padding Data tensor padding (output) - * @param ksizeDesc Tensor descriptor for ksize tensor (input) - * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7686,12 +7677,15 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* output_grad, const miopenTensorDescriptor_t inputGradDesc, void* input_grad, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override); /** @} */ diff --git a/src/avgpool.cpp b/src/avgpool.cpp index 87ff481c6a..323f01c90e 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -38,36 +38,37 @@ miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t input, const TensorDescriptor& outputDesc, Data_t output, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { - const auto problem = avgpool::FwdProblemDescription{inputDesc, - outputDesc, - strideDesc, - paddingDesc, - ksizeDesc, - count_include_pad, - divisor_override}; + const auto problem = + 
avgpool::FwdProblemDescription{inputDesc, outputDesc, count_include_pad, divisor_override}; const auto invoke_params = [&]() { - auto tmp = avgpool::FwdInvokeParams{}; - tmp.inputDesc = &inputDesc; - tmp.outputDesc = &outputDesc; - tmp.strideDesc = &strideDesc; - tmp.paddingDesc = &paddingDesc; - tmp.ksizeDesc = &ksizeDesc; + auto tmp = avgpool::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; tmp.input = input; tmp.output = output; - tmp.stride = stride; - tmp.padding = padding; - tmp.ksize = ksize; + tmp.KD = KD; + tmp.KH = KH; + tmp.KW = KW; + tmp.SD = SD; + tmp.SH = SH; + tmp.SW = SW; + tmp.PD = PD; + tmp.PH = PH; + tmp.PW = PW; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; @@ -87,36 +88,37 @@ miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { - const auto problem = avgpool::BwdProblemDescription{outputGradDesc, - inputGradDesc, - strideDesc, - paddingDesc, - ksizeDesc, - count_include_pad, - divisor_override}; + const auto problem = avgpool::BwdProblemDescription{ + outputGradDesc, inputGradDesc, count_include_pad, divisor_override}; const auto invoke_params = [&]() { auto tmp = avgpool::BwdInvokeParams{}; tmp.outputGradDesc = &outputGradDesc; tmp.inputGradDesc = &inputGradDesc; - tmp.strideDesc = &strideDesc; - tmp.paddingDesc = &paddingDesc; - tmp.ksizeDesc = &ksizeDesc; tmp.output_grad = output_grad; tmp.input_grad = input_grad; - tmp.stride = stride; - tmp.padding = padding; - tmp.ksize = 
ksize; + tmp.KD = KD; + tmp.KH = KH; + tmp.KW = KW; + tmp.SD = SD; + tmp.SH = SH; + tmp.SW = SW; + tmp.PD = PD; + tmp.PH = PH; + tmp.PW = PW; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index fa2e8a957c..32e1f12f92 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -84,12 +84,15 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* input, const miopenTensorDescriptor_t outputDesc, void* output, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { @@ -98,12 +101,15 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, input, outputDesc, output, - strideDesc, - stride, - paddingDesc, - padding, - ksizeDesc, - ksize, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); @@ -114,12 +120,15 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, DataCast(input), miopen::deref(outputDesc), DataCast(output), - miopen::deref(strideDesc), - DataCast(stride), - miopen::deref(paddingDesc), - DataCast(padding), - miopen::deref(ksizeDesc), - DataCast(ksize), + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); }); @@ -130,12 +139,15 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* output_grad, const miopenTensorDescriptor_t inputGradDesc, void* input_grad, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const 
miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { @@ -144,12 +156,15 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, output_grad, inputGradDesc, input_grad, - strideDesc, - stride, - paddingDesc, - padding, - ksizeDesc, - ksize, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); @@ -160,12 +175,15 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, DataCast(output_grad), miopen::deref(inputGradDesc), DataCast(input_grad), - miopen::deref(strideDesc), - DataCast(stride), - miopen::deref(paddingDesc), - DataCast(padding), - miopen::deref(ksizeDesc), - DataCast(ksize), + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); }); diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp index 9210e45e3a..00a2717ff6 100644 --- a/src/include/miopen/avgpool.hpp +++ b/src/include/miopen/avgpool.hpp @@ -39,12 +39,15 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t input, const TensorDescriptor& outputDesc, Data_t output, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + int32_t KD, + int32_t KH, + int32_t KW, + int32_t SD, + int32_t SH, + int32_t SW, + int32_t PD, + int32_t PH, + int32_t PW, bool count_include_pad, int32_t divisor_override); @@ -53,12 +56,15 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t 
padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + int32_t KD, + int32_t KH, + int32_t KW, + int32_t SD, + int32_t SH, + int32_t SW, + int32_t PD, + int32_t PH, + int32_t PW, bool count_include_pad, int32_t divisor_override); } // namespace miopen diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index 91a70725ee..e8bd9256ac 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -38,18 +38,22 @@ struct FwdInvokeParams : public miopen::InvokeParams FwdInvokeParams() = default; - const TensorDescriptor* inputDesc = nullptr; - const TensorDescriptor* outputDesc = nullptr; - const TensorDescriptor* strideDesc = nullptr; - const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* ksizeDesc = nullptr; - - ConstData_t input = nullptr; - Data_t output = nullptr; - ConstData_t stride = nullptr; - ConstData_t padding = nullptr; - ConstData_t ksize = nullptr; - + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + ConstData_t ksize = nullptr; + + int32_t KD = 0; + int32_t KH = 0; + int32_t KW = 0; + int32_t SD = 0; + int32_t SH = 0; + int32_t SW = 0; + int32_t PD = 0; + int32_t PH = 0; + int32_t PW = 0; bool count_include_pad = false; int32_t divisor_override = 0; @@ -64,16 +68,20 @@ struct BwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* outputGradDesc = nullptr; const TensorDescriptor* inputGradDesc = nullptr; - const TensorDescriptor* strideDesc = nullptr; - const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* ksizeDesc = nullptr; ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; - ConstData_t stride = nullptr; - ConstData_t padding = nullptr; ConstData_t ksize = nullptr; + int32_t KD = 0; + int32_t KH = 0; + int32_t KW = 0; + int32_t SD = 0; + int32_t SH = 0; + int32_t SW = 
0; + int32_t PD = 0; + int32_t PH = 0; + int32_t PW = 0; bool count_include_pad = false; int32_t divisor_override = 0; diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp index 9166762235..2dee6a30ea 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -38,16 +38,8 @@ namespace avgpool { struct ProblemDescription : ProblemDescriptionBase { - ProblemDescription(const TensorDescriptor& strideDesc_, - const TensorDescriptor& paddingDesc_, - const TensorDescriptor& kinforDesc_, - const bool count_include_pad_, - const int32_t divisor_override_) - : strideDesc(strideDesc_), - paddingDesc(paddingDesc_), - kinforDesc(kinforDesc_), - count_include_pad(count_include_pad_), - divisor_override(divisor_override_) + ProblemDescription(const bool count_include_pad_, const int32_t divisor_override_) + : count_include_pad(count_include_pad_), divisor_override(divisor_override_) { if(divisor_override < 0) { @@ -56,10 +48,6 @@ struct ProblemDescription : ProblemDescriptionBase } protected: - TensorDescriptor strideDesc; - TensorDescriptor paddingDesc; - TensorDescriptor kinforDesc; - bool count_include_pad; int32_t divisor_override; }; @@ -68,13 +56,9 @@ struct FwdProblemDescription : ProblemDescription { FwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_, - const TensorDescriptor& strideDesc_, - const TensorDescriptor& paddingDesc_, - const TensorDescriptor& kinforDesc_, const bool count_include_pad_, const int32_t divisor_override_) - : ProblemDescription( - strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + : ProblemDescription(count_include_pad_, divisor_override_), inputDesc(inputDesc_), outputDesc(outputDesc_) { @@ -95,14 +79,6 @@ struct FwdProblemDescription : ProblemDescription MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Input and output tensor sizes do not match."); } 
- if(input_dims - 2 != strideDesc.GetElementSize() || - input_dims - 2 != paddingDesc.GetElementSize() || - input_dims - 2 != kinforDesc.GetElementSize()) - { - MIOPEN_THROW(miopenStatusBadParm, - "AvgPool: Input tensor sizes and Kernel size or stride " - "or padding do not match."); - } return true; } @@ -118,13 +94,9 @@ struct BwdProblemDescription : ProblemDescription { BwdProblemDescription(const TensorDescriptor& outputGradDesc_, const TensorDescriptor& inputGradDesc_, - const TensorDescriptor& strideDesc_, - const TensorDescriptor& paddingDesc_, - const TensorDescriptor& kinforDesc_, const bool count_include_pad_, const int32_t divisor_override_) - : ProblemDescription( - strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + : ProblemDescription(count_include_pad_, divisor_override_), outputGradDesc(outputGradDesc_), inputGradDesc(inputGradDesc_) { @@ -145,14 +117,6 @@ struct BwdProblemDescription : ProblemDescription MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Input grad and output grad tensor sizes do not match."); } - if(input_dims - 2 != strideDesc.GetElementSize() || - input_dims - 2 != paddingDesc.GetElementSize() || - input_dims - 2 != kinforDesc.GetElementSize()) - { - MIOPEN_THROW(miopenStatusBadParm, - "AvgPool: Input grad tensor sizes and Kernel size or stride or padding do " - "not match."); - } return true; } diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index 6d94bffac1..32ac270b37 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -40,27 +40,6 @@ #define OUTPUT_TYPE float #endif -// template -// struct blockNd -// { -// T val[Nd]; -// }; - -// template -// __device__ void avgPoolForwardNdNew(const TI* __restrict__ input, -// TO* __restrict__ output, -// size_t N, -// size_t C, -// const blockNd sizeIn, -// const blockNd sizeOut, -// const blockNd ksize, -// const blockNd stride, -// const blockNd padding, -// bool count_include_pad, -// int32_t 
divisor_override, -// tensor_view_t input_tv, -// tensor_view_t output_tv); - template __device__ void avgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, @@ -70,9 +49,12 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, size_t W, size_t OH, size_t OW, - const int32_t* __restrict__ ksize, - const int32_t* __restrict__ stride, - const int32_t* __restrict__ padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> input_tv, @@ -82,19 +64,15 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; if(n >= N) return; FLOAT_ACCUM m = 0; +#pragma unroll for(int32_t r = 0; r < R; ++r) { +#pragma unroll for(int32_t s = 0; s < S; ++s) { // input idx : (n, c, h, w) @@ -151,9 +129,12 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input size_t W, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> input_tv, @@ -167,9 +148,12 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input W, OH, OW, - ksize, - stride, - padding, + R, + S, + sh, + sw, + ph, + pw, count_include_pad, divisor_override, input_tv, @@ -187,9 +171,15 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, 
tensor_view_t<5> input_tv, @@ -200,19 +190,11 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; if(n >= N) return; FLOAT_ACCUM sum = 0; +#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) @@ -281,9 +263,15 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<5> input_tv, @@ -299,9 +287,15 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input OD, OH, OW, - ksize, - stride, - padding, + KD, + R, + S, + sd, + sh, + sw, + pd, + ph, + pw, count_include_pad, divisor_override, input_tv, @@ -317,9 +311,12 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, size_t W, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> output_grad_tv, @@ -329,19 +326,15 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; if(n >= N) return; FLOAT_ACCUM grad = 0; +#pragma unroll for(int32_t r = 0; r < R; 
++r) { +#pragma unroll for(int32_t s = 0; s < S; ++s) { int32_t ohsh = h + ph - r; @@ -403,9 +396,12 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp size_t W, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> output_grad_tv, @@ -419,9 +415,12 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp W, OH, OW, - ksize, - stride, - padding, + R, + S, + sh, + sw, + ph, + pw, count_include_pad, divisor_override, output_grad_tv, @@ -439,9 +438,15 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<5> output_grad_tv, @@ -452,20 +457,12 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; if(n >= N) return; FLOAT_ACCUM grad = 0; +#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) @@ -543,9 +540,15 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<5> 
output_grad_tv, @@ -561,9 +564,15 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp OD, OH, OW, - ksize, - stride, - padding, + KD, + R, + S, + sd, + sh, + sw, + pd, + ph, + pw, count_include_pad, divisor_override, output_grad_tv, diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index 4fe9d5bc76..c5ed51dc27 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -81,10 +81,10 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, { return false; } - if(!IsOverRocm(problem)) - { - return false; - } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -135,9 +135,12 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KH, + params.KW, + params.SH, + params.SW, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, output_grad_tv, diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 6897097955..96adbb2e46 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -43,6 +43,42 @@ namespace solver { namespace avgpool { +bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if((in_over_out < 8 && in_over_out > 1) || (in_over_out < 2 && in_nelems <= 5971968)) + { + return true; + } + return false; + } + else if(dtype == miopenHalf) + { + if((in_over_out < 2 && mul_nc < 8192) || + (8 > 
in_over_out && in_over_out > 7 && out_nelems >= 32401152)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if((7 < in_over_out && in_over_out < 8 && in_nelems >= 944111616) || + (in_over_out < 2 && in_nelems >= 4194304)) + { + return true; + } + } + return false; +} + bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { @@ -51,6 +87,10 @@ bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, { return false; } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -105,9 +145,15 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KD, + params.KH, + params.KW, + params.SD, + params.SH, + params.SW, + params.PD, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, output_grad_tv, diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 3e70264097..ebc5c4b956 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -83,10 +83,10 @@ bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, { return false; } - if(!IsOverRocm(problem)) - { - return false; - } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -137,9 +137,12 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KH, + params.KW, + params.SH, + params.SW, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, input_tv, diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index 088aac6dca..32a24d47bb 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -43,6 +43,38 @@ namespace solver { namespace avgpool { +bool 
IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +{ + auto dtype = problem.GetOutputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if(in_over_out < 8 || in_over_out >= 262144) + { + return true; + } + } + else if(dtype == miopenHalf) + { + if(in_nelems >= 201326592 || (in_over_out < 2 && mul_nc < 8192)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if((out_nelems >= 5971968 && in_over_out < 2) || out_nelems >= 74088000) + { + return true; + } + } + return false; +} + bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { @@ -50,6 +82,10 @@ bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, { return false; } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -104,9 +140,15 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KD, + params.KH, + params.KW, + params.SD, + params.SH, + params.SW, + params.PD, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, input_tv, From 930d47e02a4573ac52713238704794d4228b7fb8 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 20 Aug 2024 18:42:22 +0700 Subject: [PATCH 08/38] fix gtest --- src/kernels/MIOpenAvgPool.cpp | 4 - src/solver/avgpool/backward_avgpool_2d.cpp | 12 +-- src/solver/avgpool/backward_avgpool_3d.cpp | 22 +++--- src/solver/avgpool/forward_avgpool_2d.cpp | 14 ++-- src/solver/avgpool/forward_avgpool_3d.cpp | 19 +++-- test/gtest/avgpool.cpp | 24 ++++-- test/gtest/avgpool.hpp | 89 +++++++++++++++------- 7 files changed, 114 insertions(+), 70 deletions(-) diff --git 
a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index 32ac270b37..d17dcc38ff 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -72,7 +72,6 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, #pragma unroll for(int32_t r = 0; r < R; ++r) { -#pragma unroll for(int32_t s = 0; s < S; ++s) { // input idx : (n, c, h, w) @@ -194,7 +193,6 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, if(n >= N) return; FLOAT_ACCUM sum = 0; -#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) @@ -334,7 +332,6 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, #pragma unroll for(int32_t r = 0; r < R; ++r) { -#pragma unroll for(int32_t s = 0; s < S; ++s) { int32_t ohsh = h + ph - r; @@ -462,7 +459,6 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, return; FLOAT_ACCUM grad = 0; -#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index c5ed51dc27..73adabb8e7 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -43,7 +43,7 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd2d(const miopen::avgpool::BwdProblemDescription& problem) { auto dtype = problem.GetInputGradDesc().GetType(); auto in_nelems = problem.GetInputGradDesc().GetElementSize(); @@ -73,7 +73,7 @@ bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) return false; } -bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, +bool AvgPoolBackward2d::IsApplicable(const ExecutionContext&, const miopen::avgpool::BwdProblemDescription& problem) const { if(problem.GetInputGradDesc().GetNumDims() != 4 || @@ -81,10 +81,10 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& 
context, { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmBwd2d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 96adbb2e46..4815803ad3 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -43,18 +43,19 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd3d(const miopen::avgpool::BwdProblemDescription& problem) { auto dtype = problem.GetInputGradDesc().GetType(); auto in_nelems = problem.GetInputGradDesc().GetElementSize(); auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); auto mul_nc = problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto N = problem.GetOutputGradDesc().GetLengths()[0]; auto in_over_out = static_cast(in_nelems) / out_nelems; if(dtype == miopenFloat) { - if((in_over_out < 8 && in_over_out > 1) || (in_over_out < 2 && in_nelems <= 5971968)) + if((in_over_out < 2 && out_nelems <= 12582912) || (in_over_out <= 8 && N >= 6)) { return true; } @@ -62,16 +63,15 @@ bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) } else if(dtype == miopenHalf) { - if((in_over_out < 2 && mul_nc < 8192) || - (8 > in_over_out && in_over_out > 7 && out_nelems >= 32401152)) + if((in_over_out < 2 && mul_nc < 8192) || (8 > in_over_out && out_nelems >= 29052108)) { return true; } } else if(dtype == miopenBFloat16) { - if((7 < in_over_out && in_over_out < 8 && in_nelems >= 944111616) || - (in_over_out < 2 && in_nelems >= 4194304)) + if((1 <= in_over_out && in_over_out < 2 && in_nelems >= 4194304) || + (in_over_out <= 8 && in_nelems >= 944111616)) { return true; } @@ -79,7 +79,7 @@ bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) return false; } -bool AvgPoolBackward3d::IsApplicable(const 
ExecutionContext& context, +bool AvgPoolBackward3d::IsApplicable(const ExecutionContext&, const miopen::avgpool::BwdProblemDescription& problem) const { if(problem.GetInputGradDesc().GetNumDims() != 5 || @@ -87,10 +87,10 @@ bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmBwd3d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index ebc5c4b956..1c51feb54b 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -44,7 +44,7 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd2d(const miopen::avgpool::FwdProblemDescription& problem) { auto dtype = problem.GetOutputDesc().GetType(); auto in_nelems = problem.GetInputDesc().GetElementSize(); @@ -68,7 +68,7 @@ bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) } else if(dtype == miopenBFloat16) { - if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 6000000) + if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) { return true; } @@ -76,17 +76,17 @@ bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) return false; } -bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, +bool AvgPoolForward2d::IsApplicable(const ExecutionContext&, const miopen::avgpool::FwdProblemDescription& problem) const { if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmFwd2d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index 32a24d47bb..6f70a07419 100644 --- 
a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -43,17 +43,22 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem) { auto dtype = problem.GetOutputDesc().GetType(); auto in_nelems = problem.GetInputDesc().GetElementSize(); auto out_nelems = problem.GetOutputDesc().GetElementSize(); auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto N = problem.GetOutputDesc().GetLengths()[0]; auto in_over_out = static_cast(in_nelems) / out_nelems; + std::cout << "in_over_out: " << in_over_out << std::endl; + std::cout << "in_nelems: " << in_nelems << std::endl; + std::cout << "out_nelems: " << out_nelems << std::endl; + if(dtype == miopenFloat) { - if(in_over_out < 8 || in_over_out >= 262144) + if(in_over_out < 2 || in_over_out >= 262144 || (out_nelems >= 10125000 && N > 4)) { return true; } @@ -75,17 +80,17 @@ bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) return false; } -bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, +bool AvgPoolForward3d::IsApplicable(const ExecutionContext&, const miopen::avgpool::FwdProblemDescription& problem) const { if(problem.GetInputDesc().GetNumDims() != 5 || problem.GetOutputDesc().GetNumDims() != 5) { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmFwd3d(problem)) + { + return false; + } return true; } diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp index fa002e5610..3ab32be510 100644 --- a/test/gtest/avgpool.cpp +++ b/test/gtest/avgpool.cpp @@ -111,9 +111,15 @@ TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) } }; -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); 
-INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_FP32, + testing::ValuesIn(AvgPoolTestConfigsFwdFp32())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_FP16, + testing::ValuesIn(AvgPoolTestConfigsFwdFp16())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_BFP16, + testing::ValuesIn(AvgPoolTestConfigsFwdBfp16())); // BACKWARD TEST TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) @@ -158,6 +164,12 @@ TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) } }; -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_FP32, + testing::ValuesIn(AvgPoolTestConfigsBwdFp32())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_FP16, + testing::ValuesIn(AvgPoolTestConfigsBwdFp16())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_BFP16, + testing::ValuesIn(AvgPoolTestConfigsBwdBfp16())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index 26548e0a12..fca812357d 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -70,25 +70,50 @@ struct AvgPoolTestCase std::vector GetInput() const { return input_dims; } }; -inline std::vector AvgPoolTestConfigs() +inline std::vector AvgPoolTestConfigsFwdFp32() { return { - {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, true, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 1}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 1}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 1}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, 
true, true, 1}, - {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 1}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 1}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 1}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 1}, + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsFwdFp16() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsFwdBfp16() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdFp32() +{ + return { + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdFp16() +{ + return { + {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdBfp16() +{ + return { + {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, }; } @@ -212,12 +237,15 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam input_dev.get(), output.desc, output_dev.get(), - stride.desc, - stride_dev.get(), - padding.desc, - padding_dev.get(), - ksize.desc, - ksize_dev.get(), + 
ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 0, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? padding[2] : padding[1], count_include_pad, divisor_override); fflush(stdout); @@ -377,12 +405,15 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam output_grad_dev.get(), input_grad.desc, input_grad_dev.get(), - stride.desc, - stride_dev.get(), - padding.desc, - padding_dev.get(), - ksize.desc, - ksize_dev.get(), + ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 0, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? 
padding[2] : padding[1], count_include_pad, divisor_override); From 5a357389c31287ebfdf57893d9a5046e08cce8a0 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 21 Aug 2024 17:48:10 +0700 Subject: [PATCH 09/38] passed gtest --- src/kernels/MIOpenAvgPool.cpp | 2 -- test/gtest/avgpool.hpp | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index d17dcc38ff..76355d5729 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -69,7 +69,6 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, return; FLOAT_ACCUM m = 0; -#pragma unroll for(int32_t r = 0; r < R; ++r) { for(int32_t s = 0; s < S; ++s) @@ -329,7 +328,6 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, return; FLOAT_ACCUM grad = 0; -#pragma unroll for(int32_t r = 0; r < R; ++r) { for(int32_t s = 0; s < S; ++s) diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index fca812357d..94898d32b6 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -105,7 +105,7 @@ inline std::vector AvgPoolTestConfigsBwdFp16() { return { {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + {{6, 288, 35, 35, 35}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0}, }; } @@ -113,7 +113,7 @@ inline std::vector AvgPoolTestConfigsBwdBfp16() { return { {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, }; } From 27470a21c4f33a9426cc635e791f2b02ba6dc7ac Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 30 Sep 2024 13:38:02 +0700 Subject: [PATCH 10/38] skeleton code --- docs/reference/index.rst | 2 +- include/miopen/miopen.h | 48 +- src/CMakeLists.txt | 16 +- src/adaptiveavgpool.cpp | 94 +++ .../problem_description.cpp | 15 +- 
src/adaptiveavgpool_api.cpp | 113 ++++ src/avgpool.cpp | 136 ----- src/avgpool_api.cpp | 190 ------ src/include/miopen/adaptiveavgpool.hpp | 50 ++ .../invoke_params.hpp | 30 +- .../problem_description.hpp | 45 +- .../miopen/adaptiveavgpool/solvers.hpp | 159 +++++ src/include/miopen/avgpool.hpp | 71 --- src/include/miopen/avgpool/solvers.hpp | 113 ---- src/include/miopen/solver_id.hpp | 2 +- src/kernels/MIOpenAdaptiveAvgPool.cpp | 404 ++++++++++++ src/kernels/MIOpenAvgPool.cpp | 574 ------------------ src/solver.cpp | 25 + .../backward_adaptiveavgpool_1d.cpp} | 39 +- .../backward_adaptiveavgpool_2d.cpp | 153 +++++ .../backward_adaptiveavgpool_3d.cpp} | 42 +- .../forward_adaptiveavgpool_1d.cpp | 145 +++++ .../forward_adaptiveavgpool_2d.cpp} | 50 +- .../forward_adaptiveavgpool_3d.cpp} | 55 +- 24 files changed, 1251 insertions(+), 1320 deletions(-) create mode 100644 src/adaptiveavgpool.cpp rename src/{avgpool => adaptiveavgpool}/problem_description.cpp (90%) create mode 100644 src/adaptiveavgpool_api.cpp delete mode 100644 src/avgpool.cpp delete mode 100644 src/avgpool_api.cpp create mode 100644 src/include/miopen/adaptiveavgpool.hpp rename src/include/miopen/{avgpool => adaptiveavgpool}/invoke_params.hpp (71%) rename src/include/miopen/{avgpool => adaptiveavgpool}/problem_description.hpp (68%) create mode 100644 src/include/miopen/adaptiveavgpool/solvers.hpp delete mode 100644 src/include/miopen/avgpool.hpp delete mode 100644 src/include/miopen/avgpool/solvers.hpp create mode 100644 src/kernels/MIOpenAdaptiveAvgPool.cpp delete mode 100644 src/kernels/MIOpenAvgPool.cpp rename src/solver/{avgpool/backward_avgpool_2d.cpp => adaptiveavgpool/backward_adaptiveavgpool_1d.cpp} (81%) create mode 100644 src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp rename src/solver/{avgpool/backward_avgpool_3d.cpp => adaptiveavgpool/backward_adaptiveavgpool_3d.cpp} (81%) create mode 100644 src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp rename 
src/solver/{avgpool/forward_avgpool_2d.cpp => adaptiveavgpool/forward_adaptiveavgpool_2d.cpp} (76%) rename src/solver/{avgpool/forward_avgpool_3d.cpp => adaptiveavgpool/forward_adaptiveavgpool_3d.cpp} (76%) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 9594e00ef0..d715ccef25 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -35,4 +35,4 @@ The MIOpen API library is structured as follows: * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) - * :doc:`AvgPool <../doxygen/html/group__avgpool>` (experimental) + * :doc:`AdaptiveAvgPool <../doxygen/html/group__adaptiveavgpool>` (experimental) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index ea44de92d5..57aeeb5d3b 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -70,7 +70,7 @@ * @defgroup SGD * @defgroup getitem * @defgroup ReduceCalculation - * @defgroup avgpool + * @defgroup adaptiveavgpool * */ @@ -7623,73 +7623,43 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, #endif // MIOPEN_BETA_API #ifdef MIOPEN_BETA_API -// avgpool APIs -/** @addtogroup avgpool +// adaptiveavgpool APIs +/** @addtogroup adaptiveavgpool * * @{ */ -/*! @brief Execute an avgpool forward layer +/*! 
@brief Execute an adaptiveavgpool forward layer * * @param handle MIOpen handle (input) * @param inputDesc Tensor descriptor for input tensor (input) * @param input Data tensor input (input) * @param outputDesc Tensor descriptor for output tensor (input) * @param output Data tensor output (output) - * @param count_include_pad When True, will include the zero-padding in the averaging - * calculation (input) - * @param divisor_override If non-zero, will use this value as the divisor, otherwise will - * use the number of elements in the pooling window (input) * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const miopenTensorDescriptor_t inputDesc, const void* input, const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override); - -/*! @brief Execute an avgpool backward layer + void* output); + +/*! 
@brief Execute an adaptiveavgpool backward layer * * @param handle MIOpen handle (input) * @param outputGradDesc Tensor descriptor for output grad tensor (input) * @param output_grad Data tensor output grad (input) * @param inputGradDesc Tensor descriptor for input grad tensor (input) * @param input_grad Data tensor input grad (output) - * @param count_include_pad When True, will include the zero-padding in the averaging - * calculation (input) - * @param divisor_override If non-zero, will use this value as the divisor, otherwise will - * use the number of elements in the pooling window (input) * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const miopenTensorDescriptor_t outputGradDesc, const void* output_grad, const miopenTensorDescriptor_t inputGradDesc, - void* input_grad, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override); + void* input_grad); /** @} */ -// CLOSEOUT avgpool DOXYGEN GROUP +// CLOSEOUT adaptiveavgpool DOXYGEN GROUP #endif // MIOPEN_BETA_API #ifdef __cplusplus diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ee36c92967..ae621b28ad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,8 +89,8 @@ set( MIOpen_Source adam_api.cpp addlayernorm_api.cpp api/find2_0_commons.cpp - avgpool_api.cpp - avgpool/problem_description.cpp + adaptiveavgpool_api.cpp + adaptiveavgpool/problem_description.cpp batch_norm.cpp batch_norm_api.cpp batchnorm/problem_description.cpp @@ -193,10 +193,12 @@ set( MIOpen_Source solver/activ/fwd_1.cpp solver/adam/adam.cpp solver/adam/transformers_adam_w.cpp - solver/avgpool/backward_avgpool_2d.cpp - solver/avgpool/backward_avgpool_3d.cpp - solver/avgpool/forward_avgpool_2d.cpp - solver/avgpool/forward_avgpool_3d.cpp + 
solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp + solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp + solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp + solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp + solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp + solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp @@ -633,7 +635,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN activ.cpp adam.cpp addlayernorm.cpp - avgpool.cpp + adaptiveavgpool.cpp cat.cpp groupnorm.cpp getitem.cpp diff --git a/src/adaptiveavgpool.cpp b/src/adaptiveavgpool.cpp new file mode 100644 index 0000000000..fee382a4d1 --- /dev/null +++ b/src/adaptiveavgpool.cpp @@ -0,0 +1,94 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t AdaptiveAvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output) +{ + const auto problem = adaptiveavgpool::FwdProblemDescription{inputDesc, outputDesc}; + + const auto invoke_params = [&]() { + auto tmp = adaptiveavgpool::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + + tmp.input = input; + tmp.output = output; + + return tmp; + }(); + const auto algo = AlgorithmName{"AdaptiveAvgPoolForward"}; + const auto solvers = + solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t AdaptiveAvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad) +{ + const auto problem = adaptiveavgpool::BwdProblemDescription{outputGradDesc, inputGradDesc}; + + const auto invoke_params = [&]() { + auto tmp = adaptiveavgpool::BwdInvokeParams{}; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + + return tmp; + }(); + const auto algo = AlgorithmName{"AdaptiveAvgPoolBackward"}; + const auto solvers = + solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/avgpool/problem_description.cpp 
b/src/adaptiveavgpool/problem_description.cpp similarity index 90% rename from src/avgpool/problem_description.cpp rename to src/adaptiveavgpool/problem_description.cpp index 96ecb4bb72..ec3b9cf636 100644 --- a/src/avgpool/problem_description.cpp +++ b/src/adaptiveavgpool/problem_description.cpp @@ -24,12 +24,13 @@ * *******************************************************************************/ -#include +#include #include +#include namespace miopen { -namespace avgpool { +namespace adaptiveavgpool { inline std::ostream& operator<<(std::ostream& os, const std::vector& v) { @@ -55,14 +56,12 @@ NetworkConfig FwdProblemDescription::MakeNetworkConfig() const std::ostringstream ss; - ss << "avgpool_fwd"; + ss << "adaptiveavgpool_fwd"; ss << "-input_dtype" << input_dtype; ss << "-Is" << input_size; ss << "-Os" << output_size; ss << "-Si" << input_stride; ss << "-So" << output_stride; - ss << "-Cp " << count_include_pad; - ss << "-Do " << divisor_override; return NetworkConfig{ss.str()}; } @@ -78,18 +77,16 @@ NetworkConfig BwdProblemDescription::MakeNetworkConfig() const std::ostringstream ss; - ss << "avgpool_bwd"; + ss << "adaptiveavgpool_bwd"; ss << "-input_dtype" << input_dtype; ss << "-dIs" << input_grad_size; ss << "-dOs" << output_grad_size; ss << "-dSi" << input_grad_stride; ss << "-dSo" << output_grad_stride; - ss << "-Cp " << count_include_pad; - ss << "-Do " << divisor_override; return NetworkConfig{ss.str()}; } -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace miopen diff --git a/src/adaptiveavgpool_api.cpp b/src/adaptiveavgpool_api.cpp new file mode 100644 index 0000000000..a9159258f9 --- /dev/null +++ b/src/adaptiveavgpool_api.cpp @@ -0,0 +1,113 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +static void LogCmdAdaptiveAvgPool(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t oDesc, + const bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "adaptiveavgpoolfp16"; + } + else if(dtype == miopenFloat) + { + ss << "adaptiveavgpoolfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "adaptiveavgpoolbfp16"; + } + + MIOPEN_LOG_FUNCTION(xDesc, oDesc, is_fwd); + ss << " -Is " << miopen::deref(xDesc).GetLengths(); + ss << " -Os " << miopen::deref(oDesc).GetLengths(); + ss << " -Si " << miopen::deref(xDesc).GetStrides(); + ss << " -So " << miopen::deref(oDesc).GetStrides(); + ss << " -F " << ((is_fwd) ? 
"1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenAdaptiveAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output) +{ + MIOPEN_LOG_FUNCTION(handle, inputDesc, input, outputDesc, output); + + LogCmdAdaptiveAvgPool(inputDesc, outputDesc, true); + return miopen::try_([&] { + miopen::AdaptiveAvgPoolForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output)); + }); +} + +extern "C" miopenStatus_t +miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad) +{ + MIOPEN_LOG_FUNCTION(handle, outputGradDesc, output_grad, inputGradDesc, input_grad); + + LogCmdAdaptiveAvgPool(inputGradDesc, outputGradDesc, false); + return miopen::try_([&] { + miopen::AdaptiveAvgPoolBackward(miopen::deref(handle), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(inputGradDesc), + DataCast(input_grad)); + }); +} diff --git a/src/avgpool.cpp b/src/avgpool.cpp deleted file mode 100644 index 323f01c90e..0000000000 --- a/src/avgpool.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include - -namespace miopen { - -miopenStatus_t AvgPoolForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override) -{ - const auto problem = - avgpool::FwdProblemDescription{inputDesc, outputDesc, count_include_pad, divisor_override}; - - const auto invoke_params = [&]() { - auto tmp = avgpool::FwdInvokeParams{}; - tmp.inputDesc = &inputDesc; - tmp.outputDesc = &outputDesc; - - tmp.input = input; - tmp.output = output; - tmp.KD = KD; - tmp.KH = KH; - tmp.KW = KW; - tmp.SD = SD; - tmp.SH = SH; - tmp.SW = SW; - tmp.PD = PD; - tmp.PH = PH; - tmp.PW = PW; - tmp.count_include_pad = count_include_pad; - tmp.divisor_override = divisor_override; - - return tmp; - }(); - const auto algo = AlgorithmName{"AvgPoolForward"}; - const auto solvers = solver::SolverContainer{}; - - solvers.ExecutePrimitive(handle, problem, algo, invoke_params); - - return miopenStatusSuccess; -} - -miopenStatus_t AvgPoolBackward(Handle& handle, - const TensorDescriptor& outputGradDesc, - ConstData_t output_grad, - const TensorDescriptor& inputGradDesc, - Data_t input_grad, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override) -{ - const auto problem = avgpool::BwdProblemDescription{ - outputGradDesc, inputGradDesc, count_include_pad, divisor_override}; - - const auto invoke_params = [&]() { - auto tmp = avgpool::BwdInvokeParams{}; - 
tmp.outputGradDesc = &outputGradDesc; - tmp.inputGradDesc = &inputGradDesc; - - tmp.output_grad = output_grad; - tmp.input_grad = input_grad; - tmp.KD = KD; - tmp.KH = KH; - tmp.KW = KW; - tmp.SD = SD; - tmp.SH = SH; - tmp.SW = SW; - tmp.PD = PD; - tmp.PH = PH; - tmp.PW = PW; - tmp.count_include_pad = count_include_pad; - tmp.divisor_override = divisor_override; - - return tmp; - }(); - const auto algo = AlgorithmName{"AvgPoolBackward"}; - const auto solvers = solver::SolverContainer{}; - - solvers.ExecutePrimitive(handle, problem, algo, invoke_params); - - return miopenStatusSuccess; -} - -} // namespace miopen diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp deleted file mode 100644 index 32e1f12f92..0000000000 --- a/src/avgpool_api.cpp +++ /dev/null @@ -1,190 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#include -#include -#include -#include -#include - -inline std::ostream& operator<<(std::ostream& os, const std::vector& v) -{ - os << '{'; - for(int i = 0; i < v.size(); ++i) - { - if(i != 0) - os << ','; - os << v[i]; - } - os << '}'; - return os; -} - -static void LogCmdAvgPool(const miopenTensorDescriptor_t xDesc, - const miopenTensorDescriptor_t oDesc, - const bool count_include_pad, - const int32_t divisor_override, - const bool is_fwd) -{ - if(miopen::IsLoggingCmd()) - { - std::stringstream ss; - auto dtype = miopen::deref(xDesc).GetType(); - if(dtype == miopenHalf) - { - ss << "avgpoolfp16"; - } - else if(dtype == miopenFloat) - { - ss << "avgpoolfp32"; - } - else if(dtype == miopenBFloat16) - { - ss << "avgpoolbfp16"; - } - - MIOPEN_LOG_FUNCTION(xDesc, oDesc, count_include_pad, divisor_override); - ss << " -Is " << miopen::deref(xDesc).GetLengths(); - ss << " -Os " << miopen::deref(oDesc).GetLengths(); - ss << " -Si " << miopen::deref(xDesc).GetStrides(); - ss << " -So " << miopen::deref(oDesc).GetStrides(); - ss << " -Cp " << count_include_pad; - ss << " -Do " << divisor_override; - ss << " -F " << ((is_fwd) ? 
"1" : "2"); - - MIOPEN_LOG_DRIVER_CMD(ss.str()); - } -} - -extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override) -{ - MIOPEN_LOG_FUNCTION(handle, - inputDesc, - input, - outputDesc, - output, - KD, - KH, - KW, - SD, - SH, - SW, - PD, - PH, - PW, - count_include_pad, - divisor_override); - - LogCmdAvgPool(inputDesc, outputDesc, count_include_pad, divisor_override, true); - return miopen::try_([&] { - miopen::AvgPoolForward(miopen::deref(handle), - miopen::deref(inputDesc), - DataCast(input), - miopen::deref(outputDesc), - DataCast(output), - KD, - KH, - KW, - SD, - SH, - SW, - PD, - PH, - PW, - count_include_pad, - divisor_override); - }); -} - -extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t outputGradDesc, - const void* output_grad, - const miopenTensorDescriptor_t inputGradDesc, - void* input_grad, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, - const bool count_include_pad, - const int32_t divisor_override) -{ - MIOPEN_LOG_FUNCTION(handle, - outputGradDesc, - output_grad, - inputGradDesc, - input_grad, - KD, - KH, - KW, - SD, - SH, - SW, - PD, - PH, - PW, - count_include_pad, - divisor_override); - - LogCmdAvgPool(inputGradDesc, outputGradDesc, count_include_pad, divisor_override, false); - return miopen::try_([&] { - miopen::AvgPoolBackward(miopen::deref(handle), - miopen::deref(outputGradDesc), - DataCast(output_grad), - miopen::deref(inputGradDesc), - DataCast(input_grad), - KD, - KH, - KW, - 
SD, - SH, - SW, - PD, - PH, - PW, - count_include_pad, - divisor_override); - }); -} diff --git a/src/include/miopen/adaptiveavgpool.hpp b/src/include/miopen/adaptiveavgpool.hpp new file mode 100644 index 0000000000..9f38a62d94 --- /dev/null +++ b/src/include/miopen/adaptiveavgpool.hpp @@ -0,0 +1,50 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_ADAPTIVEAVGPOOL_HPP_ +#define MIOPEN_ADAPTIVEAVGPOOL_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AdaptiveAvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t +AdaptiveAvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad); +} // namespace miopen +#endif // _MIOPEN_ADAPTIVEAVGPOOL_HPP_ diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/adaptiveavgpool/invoke_params.hpp similarity index 71% rename from src/include/miopen/avgpool/invoke_params.hpp rename to src/include/miopen/adaptiveavgpool/invoke_params.hpp index e8bd9256ac..e97a66a427 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/adaptiveavgpool/invoke_params.hpp @@ -31,7 +31,7 @@ #include namespace miopen { -namespace avgpool { +namespace adaptiveavgpool { struct FwdInvokeParams : public miopen::InvokeParams { @@ -43,19 +43,6 @@ struct FwdInvokeParams : public miopen::InvokeParams ConstData_t input = nullptr; Data_t output = nullptr; - ConstData_t ksize = nullptr; - - int32_t KD = 0; - int32_t KH = 0; - int32_t KW = 0; - int32_t SD = 0; - int32_t SH = 0; - int32_t SW = 0; - int32_t PD = 0; - int32_t PH = 0; - int32_t PW = 0; - bool count_include_pad = false; - int32_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } @@ -71,23 +58,10 @@ struct BwdInvokeParams : public miopen::InvokeParams ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; - ConstData_t ksize = nullptr; - - int32_t KD = 0; - int32_t KH = 0; - int32_t KW = 0; - int32_t SD = 
0; - int32_t SH = 0; - int32_t SW = 0; - int32_t PD = 0; - int32_t PH = 0; - int32_t PW = 0; - bool count_include_pad = false; - int32_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } }; -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace miopen diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/adaptiveavgpool/problem_description.hpp similarity index 68% rename from src/include/miopen/avgpool/problem_description.hpp rename to src/include/miopen/adaptiveavgpool/problem_description.hpp index 2dee6a30ea..53be89cd89 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/adaptiveavgpool/problem_description.hpp @@ -34,33 +34,12 @@ namespace miopen { struct NetworkConfig; -namespace avgpool { +namespace adaptiveavgpool { -struct ProblemDescription : ProblemDescriptionBase +struct FwdProblemDescription : ProblemDescriptionBase { - ProblemDescription(const bool count_include_pad_, const int32_t divisor_override_) - : count_include_pad(count_include_pad_), divisor_override(divisor_override_) - { - if(divisor_override < 0) - { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: divisor_override must be non-negative."); - } - } - -protected: - bool count_include_pad; - int32_t divisor_override; -}; - -struct FwdProblemDescription : ProblemDescription -{ - FwdProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& outputDesc_, - const bool count_include_pad_, - const int32_t divisor_override_) - : ProblemDescription(count_include_pad_, divisor_override_), - inputDesc(inputDesc_), - outputDesc(outputDesc_) + FwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_) + : inputDesc(inputDesc_), outputDesc(outputDesc_) { IsValidLength(); } @@ -77,7 +56,7 @@ struct FwdProblemDescription : ProblemDescription outputDesc.GetLengths().size() != input_dims) { 
MIOPEN_THROW(miopenStatusBadParm, - "AvgPool: Input and output tensor sizes do not match."); + "AdaptiveAvgPool: Input and output tensor sizes do not match."); } return true; @@ -90,15 +69,11 @@ struct FwdProblemDescription : ProblemDescription TensorDescriptor outputDesc; }; -struct BwdProblemDescription : ProblemDescription +struct BwdProblemDescription : ProblemDescriptionBase { BwdProblemDescription(const TensorDescriptor& outputGradDesc_, - const TensorDescriptor& inputGradDesc_, - const bool count_include_pad_, - const int32_t divisor_override_) - : ProblemDescription(count_include_pad_, divisor_override_), - outputGradDesc(outputGradDesc_), - inputGradDesc(inputGradDesc_) + const TensorDescriptor& inputGradDesc_) + : outputGradDesc(outputGradDesc_), inputGradDesc(inputGradDesc_) { IsValidLength(); } @@ -115,7 +90,7 @@ struct BwdProblemDescription : ProblemDescription outputGradDesc.GetLengths().size() != input_dims) { MIOPEN_THROW(miopenStatusBadParm, - "AvgPool: Input grad and output grad tensor sizes do not match."); + "AdaptiveAvgPool: Input grad and output grad tensor sizes do not match."); } return true; @@ -128,6 +103,6 @@ struct BwdProblemDescription : ProblemDescription TensorDescriptor inputGradDesc; }; -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace miopen diff --git a/src/include/miopen/adaptiveavgpool/solvers.hpp b/src/include/miopen/adaptiveavgpool/solvers.hpp new file mode 100644 index 0000000000..25f08f3345 --- /dev/null +++ b/src/include/miopen/adaptiveavgpool/solvers.hpp @@ -0,0 +1,159 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include +#include +#include "miopen/kernel_build_params.hpp" +#include "miopen/kernel_info.hpp" + +namespace miopen { + +namespace solver { + +const auto make_hip_kernel = [](std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) { + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +}; + +namespace adaptiveavgpool { + +using AdaptiveAvgPoolForward = + NonTunableSolverBase; + +using AdaptiveAvgPoolBackward = + NonTunableSolverBase; + +// FORWARD +struct AdaptiveAvgPoolForward1d final : AdaptiveAvgPoolForward +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; +}; + +struct AdaptiveAvgPoolForward2d final : AdaptiveAvgPoolForward +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; +}; + +struct AdaptiveAvgPoolForward3d final : AdaptiveAvgPoolForward +{ + const std::string& SolverDbId() const override + { + return 
GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const override; +}; + +// BACKWARD +struct AdaptiveAvgPoolBackward1d final : AdaptiveAvgPoolBackward +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; +}; + +struct AdaptiveAvgPoolBackward2d final : AdaptiveAvgPoolBackward +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; +}; + +struct AdaptiveAvgPoolBackward3d final : AdaptiveAvgPoolBackward +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const override; +}; + +} // namespace adaptiveavgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp deleted file mode 100644 index 00a2717ff6..0000000000 --- a/src/include/miopen/avgpool.hpp +++ /dev/null @@ -1,71 +0,0 @@ 
-/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#include -#ifndef MIOPEN_AVGPOOL_HPP_ -#define MIOPEN_AVGPOOL_HPP_ - -#include - -namespace miopen { - -struct Handle; -struct TensorDescriptor; - -MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - int32_t KD, - int32_t KH, - int32_t KW, - int32_t SD, - int32_t SH, - int32_t SW, - int32_t PD, - int32_t PH, - int32_t PW, - bool count_include_pad, - int32_t divisor_override); - -MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, - const TensorDescriptor& outputGradDesc, - ConstData_t output_grad, - const TensorDescriptor& inputGradDesc, - Data_t input_grad, - int32_t KD, - int32_t KH, - int32_t KW, - int32_t SD, - int32_t SH, - int32_t SW, - int32_t PD, - int32_t PH, - int32_t PW, - bool count_include_pad, - int32_t divisor_override); -} // namespace miopen -#endif // _MIOPEN_AVGPOOL_HPP_ diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp deleted file mode 100644 index 5577b9fad6..0000000000 --- a/src/include/miopen/avgpool/solvers.hpp +++ /dev/null @@ -1,113 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#pragma once - -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include -#include -#include "miopen/kernel_build_params.hpp" -#include "miopen/kernel_info.hpp" - -namespace miopen { - -namespace solver { - -const auto make_hip_kernel = [](std::vector localsize, - std::vector gridsize, - std::string kernel_file, - std::string kernel_name, - KernelBuildParameters build_params) { - while(localsize.size() < 3) - localsize.push_back(1); - while(gridsize.size() < 3) - gridsize.push_back(1); - for(int i = 0; i < localsize.size(); ++i) - gridsize[i] = AlignUp(gridsize[i], localsize[i]); - return KernelInfo{ - build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; -}; - -namespace avgpool { - -using AvgPoolForward = - NonTunableSolverBase; - -using AvgPoolBackward = - NonTunableSolverBase; - -// FORWARD -struct AvgPoolForward2d final : AvgPoolForward -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const override; - - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const override; -}; - -struct AvgPoolForward3d final : AvgPoolForward -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const override; - - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const override; -}; - -// BACKWARD -struct AvgPoolBackward2d final : AvgPoolBackward -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& 
problem) const override; - - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& problem) const override; -}; - -struct AvgPoolBackward3d final : AvgPoolBackward -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& problem) const override; - - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& problem) const override; -}; - -} // namespace avgpool - -} // namespace solver - -} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 194afd79ac..25fc7aad16 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -60,7 +60,7 @@ enum class Primitive Softmax, Adam, Item, - AvgPool + AdaptiveAvgPool }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/kernels/MIOpenAdaptiveAvgPool.cpp b/src/kernels/MIOpenAdaptiveAvgPool.cpp new file mode 100644 index 0000000000..d29a03ab1d --- /dev/null +++ b/src/kernels/MIOpenAdaptiveAvgPool.cpp @@ -0,0 +1,404 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +#ifndef INPUT_TYPE +#define INPUT_TYPE float +#endif + +#ifndef OUTPUT_TYPE +#define OUTPUT_TYPE float +#endif + +template +__device__ void avgPoolForward1d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> input_tv, + tensor_view_t<3> output_tv) +{ + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t nc = gid / OH, oh = gid % OH; + size_t n = nc / C, c = nc % C; + if(n >= N) + return; + + int32_t h = (int32_t)floor((float)(oh * H) / OH); + int32_t kh = (int32_t)ceil((float)((oh + 1) * H) / OH) - h; + + DTYPE_ACCURATE sum = 0; + for(int ih = h; ih < (h + kh); ++ih) + { + sum += GET_3D_VAL_AT(input, n, c, ih); + } + + SET_3D_VAL_AT(output, n, c, oh, sum / kh); +} +extern "C" __global__ void AvgPoolForward1d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> input_tv, + tensor_view_t<3> output_tv) +{ + avgPoolForward1d(input, output, N, C, H, OH, input_tv, output_tv); +} + +template +__device__ void avgPoolBackward1d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> output_grad_tv, + tensor_view_t<3> 
input_grad_tv) +{ + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t nc = gid / H, h = gid % H; + size_t n = nc / C, c = nc % C; + if(n >= N) + return; + + int32_t oh = (int32_t)floor((float)(h * OH) / H); + int32_t koh = (int32_t)ceil((float)((h + 1) * OH) / H) - oh; + + DTYPE_ACCURATE grad = 0; + for(int ih = oh; ih < (oh + koh); ++ih) + { + int32_t kh = + (int32_t)ceil((float)((ih + 1) * H) / OH) - (int32_t)floor((float)(ih * H) / OH); + grad += GET_3D_VAL_AT(output_grad, n, c, ih) / kh; + } + + SET_3D_VAL_AT(input_grad, n, c, h, grad); +} +extern "C" __global__ void AvgPoolBackward1d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> output_grad_tv, + tensor_view_t<3> input_grad_tv) +{ + avgPoolBackward1d( + output_grad, input_grad, N, C, H, OH, output_grad_tv, input_grad_tv); +} + +template +__device__ void avgPoolForward2d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t h = (size_t)floor((float)(oh * H) / OH); + size_t kh = (size_t)ceil((float)((oh + 1) * H) / OH) - h; + + size_t w = (size_t)floor((float)(ow * W) / OW); + size_t kw = (size_t)ceil((float)((ow + 1) * W) / OW) - w; + + FSTYPE divider = (FSTYPE)(kh * kw); + FSTYPE sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) + { + for(size_t iw = w; iw < (w + kw); ++iw) + { + sum += GET_4D_VAL_AT(input, n, c, ih, iw); + } + } + + SET_4D_VAL_AT(output, n, c, oh, ow, sum / divider); + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* 
__restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + avgPoolForward2d( + input, output, N, C, H, W, OH, OW, input_tv, output_tv); +} + +template +__device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t oh = (size_t)floor((float)(h * OH) / H); + size_t koh = (size_t)ceil((float)((h + 1) * OH) / H) - oh; + + size_t ow = (size_t)floor((float)(w * OW) / W); + size_t kow = (size_t)ceil((float)((w + 1) * OW) / W) - ow; + + FLOAT_ACCUM grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = (size_t)ceil((float)((ih + 1) * H) / OH) - (size_t)floor((float)(ih * H) / OH); + for(size_t iw = ow; iw < (ow + kow); ++iw) + { + size_t kw = + (size_t)ceil((float)((iw + 1) * W) / OW) - (size_t)floor((float)(iw * W) / OW); + grad += (FSTYPE)(GET_4D_VAL_AT(output_grad, n, c, ih, iw)) / (kh * kw); + } + } + + SET_4D_VAL_AT(input_grad, n, c, h, w, grad); + + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + avgPoolBackward2d( + output_grad, input_grad, N, C, H, W, OH, OW, output_grad_tv, input_grad_tv); +} + +// __kernel void AdaptiveAvgpool2dBackward1x1OutputNHWC(const __global DTYPE_PTR output_grad, +// __global 
DTYPE_PTR input_grad, +// const int32_t N, +// const int32_t C, +// const int32_t HW, +// const int32_t output_grad_off, +// const int32_t input_grad_off) +// { +// /* VSIZE 2 and 16 is fastest but don't know why */ +// #define VSIZE 2 +// size_t gid = get_global_id(0) * VSIZE; +// size_t c = gid % C; +// size_t n = gid / C; +// if(n >= N) +// return; + +// __global DTYPE_VEC_PTR(VSIZE) output_grad_vec = +// (__global DTYPE_VEC_PTR(VSIZE))(output_grad + n * C + c + output_grad_off); + +// DTYPE_VEC(VSIZE) output_grad_v = GET(output_grad_vec, 0) / HW; + +// __global DTYPE_VEC_PTR(VSIZE) input_grad_vec = +// (__global DTYPE_VEC_PTR(VSIZE))(input_grad + n * C * HW + c + input_grad_off); + +// for(size_t i = 0; i < HW; ++i) +// { +// SET(input_grad_vec, i * C / VSIZE, output_grad_v); +// } +// #undef VSIZE +// } + +template +__device__ void avgPoolForward3d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + int32_t d = (int32_t)floor((float)(od * D) / OD); + int32_t kd = (int32_t)ceil((float)((od + 1) * D) / OD) - d; + + int32_t h = (int32_t)floor((float)(oh * H) / OH); + int32_t kh = (int32_t)ceil((float)((oh + 1) * H) / OH) - h; + + int32_t w = (int32_t)floor((float)(ow * W) / OW); + int32_t kw = (int32_t)ceil((float)((ow + 1) * W) / OW) - w; + + DTYPE_ACCURATE sum = 0; + for(int32_t id = d; id < (d + kd); ++id) + { + for(int32_t ih = h; ih < (h + kh); ++ih) + { + for(int32_t iw = w; iw < (w + kw); ++iw) + { + sum += GET_5D_VAL_AT(input, n, c, id, ih, iw); + } + } + } + + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + 
CVT_ACCUM2FLOAT(sum / (kd * kh * kw));
+    SET_5D_VAL_AT(output, n, c, od, oh, ow, sum / (kd * kh * kw));
+}
+
+extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input,
+                                            OUTPUT_TYPE* __restrict__ output,
+                                            size_t N,
+                                            size_t C,
+                                            size_t D,
+                                            size_t H,
+                                            size_t W,
+                                            size_t OD,
+                                            size_t OH,
+                                            size_t OW,
+                                            tensor_view_t<5> input_tv,
+                                            tensor_view_t<5> output_tv)
+{
+    avgPoolForward3d(
+        input, output, N, C, D, H, W, OD, OH, OW, input_tv, output_tv);
+}
+
+template <typename TI, typename TO>
+__device__ void avgPoolBackward3d(const TI* __restrict__ output_grad,
+                                  TO* __restrict__ input_grad,
+                                  size_t N,
+                                  size_t C,
+                                  size_t D,
+                                  size_t H,
+                                  size_t W,
+                                  size_t OD,
+                                  size_t OH,
+                                  size_t OW,
+                                  tensor_view_t<5> output_grad_tv,
+                                  tensor_view_t<5> input_grad_tv)
+{
+    int32_t gid = threadIdx.x + blockIdx.x * blockDim.x;
+    int32_t ncdh = gid / W, w = gid % W;
+    int32_t ncd = ncdh / H, h = ncdh % H;
+    int32_t nc = ncd / D, d = ncd % D;
+    int32_t n = nc / C, c = nc % C;
+
+    if(n >= N)
+        return;
+
+    int32_t od = (int32_t)floor((float)(d * OD) / D);
+    int32_t kod = (int32_t)ceil((float)((d + 1) * OD) / D) - od;
+
+    int32_t oh = (int32_t)floor((float)(h * OH) / H);
+    int32_t koh = (int32_t)ceil((float)((h + 1) * OH) / H) - oh;
+
+    int32_t ow = (int32_t)floor((float)(w * OW) / W);
+    int32_t kow = (int32_t)ceil((float)((w + 1) * OW) / W) - ow;
+
+    DTYPE_ACCURATE grad = 0;
+    for(int32_t id = od; id < (od + kod); ++id)
+    {
+        int32_t kd =
+            (int32_t)ceil((float)((id + 1) * D) / OD) - (int32_t)floor((float)(id * D) / OD);
+        for(int32_t ih = oh; ih < (oh + koh); ++ih)
+        {
+            int32_t kh =
+                (int32_t)ceil((float)((ih + 1) * H) / OH) - (int32_t)floor((float)(ih * H) / OH);
+            for(int32_t iw = ow; iw < (ow + kow); ++iw)
+            {
+                int32_t kw = (int32_t)ceil((float)((iw + 1) * W) / OW) -
+                             (int32_t)floor((float)(iw * W) / OW);
+                grad += GET_5D_VAL_AT(output_grad, n, c, id, ih, iw) / (kd * kh * kw);
+            }
+        }
+    }
+
+    input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] =
+        CVT_ACCUM2FLOAT(grad); 
+} + +extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + avgPoolBackward3d( + output_grad, input_grad, N, C, D, H, W, OD, OH, OW, output_grad_tv, input_grad_tv); +} diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp deleted file mode 100644 index 76355d5729..0000000000 --- a/src/kernels/MIOpenAvgPool.cpp +++ /dev/null @@ -1,574 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#include -#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS -#include -#include -#endif - -#include "float_types.h" -#include "tensor_view.hpp" - -#ifndef INPUT_TYPE -#define INPUT_TYPE float -#endif - -#ifndef OUTPUT_TYPE -#define OUTPUT_TYPE float -#endif - -template -__device__ void avgPoolForward2d(const TI* __restrict__ input, - TO* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<4> input_tv, - tensor_view_t<4> output_tv) -{ - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncoh = gid / OW, ow = gid % OW; - int32_t nc = ncoh / OH, oh = ncoh % OH; - int32_t n = nc / C, c = nc % C; - - if(n >= N) - return; - - FLOAT_ACCUM m = 0; - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, h, w) - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - m += CVT_FLOAT2ACCUM( - input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); - } - } - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - FLOAT_ACCUM val = m / divide_factor; - - output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, 
oh, ow))] = CVT_ACCUM2FLOAT(val); -} - -extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<4> input_tv, - tensor_view_t<4> output_tv) -{ - avgPoolForward2d(input, - output, - N, - C, - H, - W, - OH, - OW, - R, - S, - sh, - sw, - ph, - pw, - count_include_pad, - divisor_override, - input_tv, - output_tv); -} - -template -__device__ void avgPoolForward3d(const TI* __restrict__ input, - TO* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<5> input_tv, - tensor_view_t<5> output_tv) -{ - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncodoh = gid / OW, ow = gid % OW; - int32_t ncod = ncodoh / OH, oh = ncodoh % OH; - int32_t nc = ncod / OD, od = ncod % OD; - int32_t n = nc / C, c = nc % C; - - if(n >= N) - return; - FLOAT_ACCUM sum = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, d, h, w) - int32_t d = od * sd - pd + kd; - if(d < 0 || d >= D) - continue; - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - sum += CVT_FLOAT2ACCUM( - input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); - } - } - } - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = 
min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - FLOAT_ACCUM val = sum / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = - CVT_ACCUM2FLOAT(val); -} - -extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<5> input_tv, - tensor_view_t<5> output_tv) -{ - avgPoolForward3d(input, - output, - N, - C, - D, - H, - W, - OD, - OH, - OW, - KD, - R, - S, - sd, - sh, - sw, - pd, - ph, - pw, - count_include_pad, - divisor_override, - input_tv, - output_tv); -} - -template -__device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, - TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<4> output_grad_tv, - tensor_view_t<4> input_grad_tv) -{ - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t nch = gid / W, w = gid % W; - int32_t nc = nch / H, h = nch % H; - int32_t n = nc / C, c = nc % C; - - if(n >= N) - return; - - FLOAT_ACCUM grad = 0; - for(int32_t r = 
0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - - grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<4>(n, c, oh, ow))]) / - divide_factor; - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = - CVT_ACCUM2FLOAT(grad); -} - -extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, - OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<4> output_grad_tv, - tensor_view_t<4> input_grad_tv) -{ - avgPoolBackward2d(output_grad, - input_grad, - N, - C, - H, - W, - OH, - OW, - R, - S, - sh, - sw, - ph, - pw, - count_include_pad, - divisor_override, - output_grad_tv, - input_grad_tv); -} - -template -__device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, - TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - 
int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<5> output_grad_tv, - tensor_view_t<5> input_grad_tv) -{ - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncdh = gid / W, w = gid % W; - int32_t ncd = ncdh / H, h = ncdh % H; - int32_t nc = ncd / D, d = ncd % D; - int32_t n = nc / C, c = nc % C; - - if(n >= N) - return; - - FLOAT_ACCUM grad = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t odsd = d + pd - kd; - if(odsd % sd != 0) - continue; - int32_t od = odsd / sd; - if(od < 0 || od >= OD) - continue; - - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<5>(n, c, od, oh, ow))]) / - divide_factor; - } - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = - CVT_ACCUM2FLOAT(grad); -} - -extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* 
__restrict__ output_grad, - OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, - bool count_include_pad, - int32_t divisor_override, - tensor_view_t<5> output_grad_tv, - tensor_view_t<5> input_grad_tv) -{ - avgPoolBackward3d(output_grad, - input_grad, - N, - C, - D, - H, - W, - OD, - OH, - OW, - KD, - R, - S, - sd, - sh, - sw, - pd, - ph, - pw, - count_include_pad, - divisor_override, - output_grad_tv, - input_grad_tv); -} diff --git a/src/solver.cpp b/src/solver.cpp index 6b451ca498..a20ebd6b6e 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include "miopen/adaptiveavgpool/solvers.hpp" #include #include @@ -673,6 +674,30 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), miopenConvolutionAlgoWinograd); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolForward1d{}.SolverDbId()); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolForward2d{}.SolverDbId()); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolForward3d{}.SolverDbId()); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolBackward1d{}.SolverDbId()); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolBackward2d{}.SolverDbId()); + Register(registry, + ++id, + Primitive::AdaptiveAvgPool, + adaptiveavgpool::AdaptiveAvgPoolBackward3d{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! 
} diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp similarity index 81% rename from src/solver/avgpool/backward_avgpool_2d.cpp rename to src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp index 73adabb8e7..1afb78de45 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp @@ -28,11 +28,11 @@ #include "miopen/execution_context.hpp" #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" -#include +#include -#include +#include #include -#include +#include #include #define LOCAL_SIZE_BWD_2D 256 @@ -41,9 +41,9 @@ namespace miopen { namespace solver { -namespace avgpool { +namespace adaptiveavgpool { -bool IsOverRocmBwd2d(const miopen::avgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { auto dtype = problem.GetInputGradDesc().GetType(); auto in_nelems = problem.GetInputGradDesc().GetElementSize(); @@ -73,8 +73,8 @@ bool IsOverRocmBwd2d(const miopen::avgpool::BwdProblemDescription& problem) return false; } -bool AvgPoolBackward2d::IsApplicable(const ExecutionContext&, - const miopen::avgpool::BwdProblemDescription& problem) const +bool AdaptiveAvgPoolBackward2d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { if(problem.GetInputGradDesc().GetNumDims() != 4 || problem.GetOutputGradDesc().GetNumDims() != 4) @@ -88,9 +88,9 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext&, return true; } -ConvSolution -AvgPoolBackward2d::GetSolution(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& problem) const +ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { std::ignore = context; @@ -108,12 +108,15 @@ 
AvgPoolBackward2d::GetSolution(const ExecutionContext& context, {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_BWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward2d", build_params)); + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_2D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolBackward2d", + build_params)); result.invoker_factory = [](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); decltype(auto) kernel = handle_.Run(kernels.front()); @@ -135,14 +138,6 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.KH, - params.KW, - params.SH, - params.SW, - params.PH, - params.PW, - params.count_include_pad, - params.divisor_override, output_grad_tv, input_grad_tv); }; @@ -151,7 +146,7 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, return result; } -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace solver diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp new file mode 100644 index 0000000000..1afb78de45 --- /dev/null +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp @@ -0,0 +1,153 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_2D 256 + +namespace miopen { + +namespace solver { + +namespace adaptiveavgpool { + +bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + return false; + } + else if(dtype == miopenHalf) + { + if(in_over_out < 2 && in_nelems >= 11075584) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + { + return true; + } + } + return false; +} + +bool AdaptiveAvgPoolBackward2d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const +{ + if(problem.GetInputGradDesc().GetNumDims() != 4 || + problem.GetOutputGradDesc().GetNumDims() != 4) + { + return false; + } + if(!IsOverRocmBwd2d(problem)) + { + return false; + } + return true; +} + +ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = 
problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_2D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolBackward2d", + build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto H = deref(params.inputGradDesc).GetLengths()[2]; + auto W = deref(params.inputGradDesc).GetLengths()[3]; + auto OH = deref(params.outputGradDesc).GetLengths()[2]; + auto OW = deref(params.outputGradDesc).GetLengths()[3]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + H, + W, + OH, + OW, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace adaptiveavgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp similarity index 81% rename from src/solver/avgpool/backward_avgpool_3d.cpp rename to src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp index 4815803ad3..51d815e281 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ 
b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp @@ -28,11 +28,11 @@ #include "miopen/execution_context.hpp" #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" -#include +#include -#include +#include #include -#include +#include #include #define LOCAL_SIZE_BWD_3D 256 @@ -41,9 +41,9 @@ namespace miopen { namespace solver { -namespace avgpool { +namespace adaptiveavgpool { -bool IsOverRocmBwd3d(const miopen::avgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd3d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { auto dtype = problem.GetInputGradDesc().GetType(); auto in_nelems = problem.GetInputGradDesc().GetElementSize(); @@ -79,8 +79,8 @@ bool IsOverRocmBwd3d(const miopen::avgpool::BwdProblemDescription& problem) return false; } -bool AvgPoolBackward3d::IsApplicable(const ExecutionContext&, - const miopen::avgpool::BwdProblemDescription& problem) const +bool AdaptiveAvgPoolBackward3d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { if(problem.GetInputGradDesc().GetNumDims() != 5 || problem.GetOutputGradDesc().GetNumDims() != 5) @@ -94,9 +94,9 @@ bool AvgPoolBackward3d::IsApplicable(const ExecutionContext&, return true; } -ConvSolution -AvgPoolBackward3d::GetSolution(const ExecutionContext& context, - const miopen::avgpool::BwdProblemDescription& problem) const +ConvSolution AdaptiveAvgPoolBackward3d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { std::ignore = context; @@ -114,12 +114,15 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_BWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward3d", build_params)); + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_3D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolBackward3d", + build_params)); result.invoker_factory = [](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); decltype(auto) kernel = handle_.Run(kernels.front()); @@ -145,17 +148,6 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.KD, - params.KH, - params.KW, - params.SD, - params.SH, - params.SW, - params.PD, - params.PH, - params.PW, - params.count_include_pad, - params.divisor_override, output_grad_tv, input_grad_tv); }; @@ -164,7 +156,7 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, return result; } -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace solver diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp new file mode 100644 index 0000000000..85bb5747f3 --- /dev/null +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp @@ -0,0 +1,145 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_FWD_2D 256 + +namespace miopen { + +namespace solver { + +namespace adaptiveavgpool { + +bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) +{ + auto dtype = problem.GetOutputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) + { + return true; + } + } + else if(dtype == miopenHalf) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) + { + return true; + } + } + return false; +} + +bool AdaptiveAvgPoolForward1d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const +{ + if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) + { + return false; + } + if(!IsOverRocmFwd2d(problem)) + { + return false; + } + return true; +} + +ConvSolution AdaptiveAvgPoolForward1d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = 
problem.GetOutputDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_2D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolForward1d", + build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + + size_t N = deref(params.inputDesc).GetLengths()[0]; + size_t C = deref(params.inputDesc).GetLengths()[1]; + size_t H = deref(params.inputDesc).GetLengths()[2]; + size_t W = deref(params.inputDesc).GetLengths()[3]; + size_t OH = deref(params.outputDesc).GetLengths()[2]; + size_t OW = deref(params.outputDesc).GetLengths()[3]; + + kernel(params.input, params.output, N, C, H, W, OH, OW, input_tv, output_tv); + }; + }; + + return result; +} + +} // namespace adaptiveavgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp similarity index 76% rename from src/solver/avgpool/forward_avgpool_2d.cpp rename to src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp index 1c51feb54b..d1afc40842 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp @@ -29,11 
+29,11 @@ #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" #include -#include +#include -#include +#include #include -#include +#include #include #define LOCAL_SIZE_FWD_2D 256 @@ -42,9 +42,9 @@ namespace miopen { namespace solver { -namespace avgpool { +namespace adaptiveavgpool { -bool IsOverRocmFwd2d(const miopen::avgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { auto dtype = problem.GetOutputDesc().GetType(); auto in_nelems = problem.GetInputDesc().GetElementSize(); @@ -76,8 +76,8 @@ bool IsOverRocmFwd2d(const miopen::avgpool::FwdProblemDescription& problem) return false; } -bool AvgPoolForward2d::IsApplicable(const ExecutionContext&, - const miopen::avgpool::FwdProblemDescription& problem) const +bool AdaptiveAvgPoolForward2d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) { @@ -90,9 +90,9 @@ bool AvgPoolForward2d::IsApplicable(const ExecutionContext&, return true; } -ConvSolution -AvgPoolForward2d::GetSolution(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const +ConvSolution AdaptiveAvgPoolForward2d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { std::ignore = context; @@ -110,12 +110,15 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_FWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward2d", build_params)); + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_2D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolForward2d", + build_params)); result.invoker_factory = [](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); decltype(auto) kernel = handle_.Run(kernels.front()); @@ -129,31 +132,14 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, size_t OH = deref(params.outputDesc).GetLengths()[2]; size_t OW = deref(params.outputDesc).GetLengths()[3]; - kernel(params.input, - params.output, - N, - C, - H, - W, - OH, - OW, - params.KH, - params.KW, - params.SH, - params.SW, - params.PH, - params.PW, - params.count_include_pad, - params.divisor_override, - input_tv, - output_tv); + kernel(params.input, params.output, N, C, H, W, OH, OW, input_tv, output_tv); }; }; return result; } -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace solver diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp similarity index 76% rename from src/solver/avgpool/forward_avgpool_3d.cpp rename to src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp index 6f70a07419..cf9bf5a9b9 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp @@ -28,11 +28,11 @@ #include "miopen/execution_context.hpp" #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" -#include +#include -#include +#include #include -#include +#include #include #define LOCAL_SIZE_FWD_3D 256 @@ -41,9 +41,9 @@ namespace miopen { namespace solver { -namespace avgpool { +namespace adaptiveavgpool { -bool 
IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd3d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { auto dtype = problem.GetOutputDesc().GetType(); auto in_nelems = problem.GetInputDesc().GetElementSize(); @@ -80,8 +80,8 @@ bool IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem) return false; } -bool AvgPoolForward3d::IsApplicable(const ExecutionContext&, - const miopen::avgpool::FwdProblemDescription& problem) const +bool AdaptiveAvgPoolForward3d::IsApplicable( + const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { if(problem.GetInputDesc().GetNumDims() != 5 || problem.GetOutputDesc().GetNumDims() != 5) { @@ -94,9 +94,9 @@ bool AvgPoolForward3d::IsApplicable(const ExecutionContext&, return true; } -ConvSolution -AvgPoolForward3d::GetSolution(const ExecutionContext& context, - const miopen::avgpool::FwdProblemDescription& problem) const +ConvSolution AdaptiveAvgPoolForward3d::GetSolution( + const ExecutionContext& context, + const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { std::ignore = context; @@ -114,12 +114,15 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_FWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward3d", build_params)); + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_3D}, + {N_total}, + "MIOpenAdaptiveAvgPool.cpp", + "AdaptiveAvgPoolForward3d", + build_params)); result.invoker_factory = [](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) params = raw_params.CastTo(); + decltype(auto) params = raw_params.CastTo(); decltype(auto) kernel = handle_.Run(kernels.front()); @@ -135,36 +138,14 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, auto OH = deref(params.outputDesc).GetLengths()[3]; auto OW = deref(params.outputDesc).GetLengths()[4]; - kernel(params.input, - params.output, - N, - C, - D, - H, - W, - OD, - OH, - OW, - params.KD, - params.KH, - params.KW, - params.SD, - params.SH, - params.SW, - params.PD, - params.PH, - params.PW, - params.count_include_pad, - params.divisor_override, - input_tv, - output_tv); + kernel(params.input, params.output, N, C, D, H, W, OD, OH, OW, input_tv, output_tv); }; }; return result; } -} // namespace avgpool +} // namespace adaptiveavgpool } // namespace solver From 2e131c7058cb58c25b470e2fd452b41335cf7e85 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 3 Oct 2024 16:13:11 +0700 Subject: [PATCH 11/38] small fix --- driver/CMakeLists.txt | 2 +- driver/adaptiveavgpool_driver.hpp | 490 +++++++++++++++ driver/avgpool_driver.hpp | 575 ------------------ ...{dm_avgpool.cpp => dm_adaptiveavgpool.cpp} | 14 +- driver/driver.hpp | 7 +- driver/mloAdaptiveAvgPoolHost.hpp | 337 ++++++++++ driver/mloAvgPoolHost.hpp | 438 ------------- include/miopen/miopen.h | 21 +- src/CMakeLists.txt | 2 +- .../adaptiveavgpool/problem_description.hpp | 120 ++++ src/kernels/MIOpenAdaptiveAvgPool.cpp | 461 +++++++------- src/kernels/tensor_view.hpp | 46 +- .../backward_adaptiveavgpool_1d.cpp | 62 
+- .../backward_adaptiveavgpool_2d.cpp | 36 +- .../forward_adaptiveavgpool_1d.cpp | 45 +- .../forward_adaptiveavgpool_2d.cpp | 26 +- test/cpu_adaptiveavgpool.hpp | 311 ++++++++++ test/cpu_avgpool.hpp | 426 ------------- .../{avgpool.cpp => adaptiveavgpool.cpp} | 56 +- test/gtest/adaptiveavgpool.hpp | 380 ++++++++++++ test/gtest/avgpool.hpp | 451 -------------- 21 files changed, 1969 insertions(+), 2337 deletions(-) create mode 100644 driver/adaptiveavgpool_driver.hpp delete mode 100644 driver/avgpool_driver.hpp rename driver/{dm_avgpool.cpp => dm_adaptiveavgpool.cpp} (81%) create mode 100644 driver/mloAdaptiveAvgPoolHost.hpp delete mode 100644 driver/mloAvgPoolHost.hpp create mode 100644 test/cpu_adaptiveavgpool.hpp delete mode 100644 test/cpu_avgpool.hpp rename test/gtest/{avgpool.cpp => adaptiveavgpool.cpp} (64%) create mode 100644 test/gtest/adaptiveavgpool.hpp delete mode 100644 test/gtest/avgpool.hpp diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 385580e2e1..4fd3c033db 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -32,7 +32,7 @@ add_executable(MIOpenDriver dm_activ.cpp dm_adam.cpp dm_addlayernorm.cpp - dm_avgpool.cpp + dm_adaptiveavgpool.cpp dm_bnorm.cpp dm_cat.cpp dm_conv.cpp diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp new file mode 100644 index 0000000000..fd86cf9eec --- /dev/null +++ b/driver/adaptiveavgpool_driver.hpp @@ -0,0 +1,490 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP +#define GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "mloAdaptiveAvgPoolHost.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#include +#include +#include +#include +#include + +template +class AdaptiveAvgPoolDriver : public Driver +{ +public: + AdaptiveAvgPoolDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&inputGradDesc); + miopenCreateTensorDescriptor(&outputGradDesc); + + data_type = miopen_type{}; + } + + std::vector ComputeStrides(std::vector input); + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + std::vector GetInputTensorDimsFromCmd(const char* param); + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~AdaptiveAvgPoolDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(inputGradDesc); + miopenDestroyTensorDescriptor(outputGradDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t inputGradDesc; + miopenTensorDescriptor_t outputGradDesc; + + std::unique_ptr input_dev; + std::unique_ptr output_dev; + std::unique_ptr input_grad_dev; + std::unique_ptr output_grad_dev; + + std::vector input; + std::vector output; + std::vector output_host; + 
std::vector input_grad; + std::vector input_grad_host; + std::vector output_grad; + + size_t N = 1, C = 1, D = 1, H = 1, W = 1, OD = 1, OH = 1, OW = 1; + + std::vector in_dim; + std::vector out_dim; + bool isContiguous; +}; + +template +int AdaptiveAvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + isContiguous = inflags.GetValueInt("is-contiguous") == 1 ? true : false; + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) +{ + std::string lengthsStr = inflags.GetValueStr(param); + + std::vector lengths; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = lengthsStr.find(',', pos); + while(new_pos != std::string::npos) + { + std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); + + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + return (lengths); +} + +template +int AdaptiveAvgPoolDriver::GetandSetData() +{ + in_dim = GetInputTensorDimsFromCmd("input_dims"); + std::vector in_stride = ComputeStrides(in_dim); + out_dim = GetInputTensorDimsFromCmd("output_dims"); + if(in_dim.size() != out_dim.size() + 2) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input and output tensor sizes do not match."); + } + N = in_dim[0]; + C = in_dim[1]; + std::vector out_dim_final = {N, C}; + if(in_dim.size() == 3) + { + H = in_dim[2]; + + OH = out_dim[0]; + out_dim_final.push_back(OH); + } + else if(in_dim.size() == 4) + { + H = in_dim[2]; + W = in_dim[3]; + + OH = out_dim[0]; + OW = out_dim[1]; + out_dim_final.push_back(OH); + out_dim_final.push_back(OW); + } + else if(in_dim.size() == 5) + { + D = in_dim[2]; + H = in_dim[3]; + W = in_dim[4]; + + OD = out_dim[0]; + 
OH = out_dim[1]; + OW = out_dim[2]; + out_dim_final.push_back(OD); + out_dim_final.push_back(OH); + out_dim_final.push_back(OW); + } + std::vector out_grad_stride = ComputeStrides(out_dim_final); + SetTensorNd(inputDesc, in_dim, in_stride, data_type); + SetTensorNd(outputDesc, out_dim_final, data_type); + SetTensorNd(outputGradDesc, out_dim_final, out_grad_stride, data_type); + SetTensorNd(inputGradDesc, in_dim, data_type); + + return miopenStatusSuccess; +} + +// Equivalent to: tensor.tranpose(0, -1).contiguous().tranpose(0, -1) incase contiguous = False +template +std::vector AdaptiveAvgPoolDriver::ComputeStrides(std::vector inputDim) +{ + if(!isContiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!isContiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +int AdaptiveAvgPoolDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AdaptiveAvgPool (Default=1)", "int"); + inflags.AddInputFlag( + "input_dims", + 'D', + "2,3,7,9,9", + "The dimensional lengths of the input tensor: N,C,D,H,W... Example: 2,3,7,9,9.", + "string"); + inflags.AddInputFlag( + "output_dims", + 'S', + "5,5,5", + "The dimensional lengths of the output tensor: OD,OH,OW,... 
Example: 5,5,5.", + "string"); + inflags.AddInputFlag("is-contiguous", 'c', "1", "is-contiguous (Default=1)", "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + input_grad_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + + input = std::vector(input_sz, static_cast(0)); + output = std::vector(output_sz, static_cast(0)); + output_host = std::vector(output_sz, static_cast(0)); + + input_grad = std::vector(input_sz, static_cast(0)); + input_grad_host = std::vector(input_sz, static_cast(0)); + output_grad = std::vector(output_sz, static_cast(0)); + + int status; + + for(int i = 0; i < input_sz; i++) + { + input[i] = prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + } + status = input_dev->ToGPU(q, input.data()); + + status |= output_dev->ToGPU(q, output.data()); + + status |= input_grad_dev->ToGPU(q, input_grad.data()); + + for(int i = 0; i < output_sz; i++) + { + output_grad[i] = prng::gen_A_to_B(static_cast(-1.0), static_cast(1.0)); + } + status |= output_grad_dev->ToGPU(q, output_grad.data()); + + if(status != 0) + std::cout << "Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float 
kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAdaptiveAvgPoolForward( + GetHandle(), inputDesc, input_dev->GetMem(), outputDesc, output_dev->GetMem()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Forward AdaptiveAvgPool Elapsed: %f ms\n", + t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward AdaptiveAvgPool Elapsed: %f ms\n", kernel_average_time); + } + + output_dev->FromGPU(GetStream(), output.data()); + + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::RunForwardCPU() +{ + if(in_dim.size() == 3) + { + mloAdaptiveAvgPoolForward1dRunHost( + inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, OH); + } + else if(in_dim.size() == 4) + { + mloAdaptiveAvgPoolForward2dRunHost( + inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, W, OH, OW); + } + else if(in_dim.size() == 5) + { + mloAdaptiveAvgPoolForward3dRunHost( + inputDesc, outputDesc, input.data(), output_host.data(), N, C, D, H, W, OD, OH, OW); + } + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::RunBackwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAdaptiveAvgPoolBackward(GetHandle(), + outputGradDesc, + output_grad_dev->GetMem(), + inputGradDesc, + input_grad_dev->GetMem()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = 
inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Backward AdaptiveAvgPool Elapsed: %f ms\n", + t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Backward AdaptiveAvgPool Elapsed: %f ms\n", kernel_average_time); + } + + input_grad_dev->FromGPU(GetStream(), input_grad.data()); + + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::RunBackwardCPU() +{ + if(in_dim.size() == 3) + { + mloAdaptiveAvgPoolBackward1dRunHost( + outputGradDesc, inputGradDesc, output_grad.data(), input_grad_host.data(), N, C, H, OH); + } + else if(in_dim.size() == 4) + { + mloAdaptiveAvgPoolBackward2dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + H, + W, + OH, + OW); + } + else if(in_dim.size() == 5) + { + mloAdaptiveAvgPoolBackward3dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW); + } + return miopenStatusSuccess; +} + +template +Tref AdaptiveAvgPoolDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int AdaptiveAvgPoolDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(output_host, output); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward AdaptiveAvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Forward AdaptiveAvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + + return miopenStatusSuccess; +} + +template +int AdaptiveAvgPoolDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(input_grad_host, input_grad); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Backward AdaptiveAvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Backward AdaptiveAvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp deleted file mode 100644 index ff7d04edd5..0000000000 --- a/driver/avgpool_driver.hpp +++ /dev/null @@ -1,575 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_AVGPOOL_DRIVER_HPP -#define GUARD_MIOPEN_AVGPOOL_DRIVER_HPP - -#include "InputFlags.hpp" -#include "driver.hpp" -#include "mloAvgPoolHost.hpp" -#include "random.hpp" -#include "tensor_driver.hpp" -#include "timer.hpp" - -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> - -#include -#include -#include -#include -#include - -template -class AvgPoolDriver : public Driver -{ -public: - AvgPoolDriver() : Driver() - { - miopenCreateTensorDescriptor(&inputDesc); - miopenCreateTensorDescriptor(&outputDesc); - miopenCreateTensorDescriptor(&inputGradDesc); - miopenCreateTensorDescriptor(&outputGradDesc); - - data_type = miopen_type{}; - } - - int AddCmdLineArgs() override; - int ParseCmdLineArgs(int argc, char* argv[]) override; - InputFlags& GetInputFlags() override { return inflags; } - - std::vector GetInputTensorDimsFromCmd(const char* param); - int GetandSetData() override; - - int AllocateBuffersAndCopy() override; - - int RunForwardGPU() override; - int RunForwardCPU(); - - int RunBackwardGPU() override; - int RunBackwardCPU(); - - Tref GetTolerance(); - int VerifyBackward() override; - int VerifyForward() override; - ~AvgPoolDriver() override - { - miopenDestroyTensorDescriptor(inputDesc); - miopenDestroyTensorDescriptor(outputDesc); - miopenDestroyTensorDescriptor(inputGradDesc); - miopenDestroyTensorDescriptor(outputGradDesc); - } - -private: - InputFlags 
inflags; - - int forw; - - miopenTensorDescriptor_t inputDesc; - miopenTensorDescriptor_t outputDesc; - miopenTensorDescriptor_t inputGradDesc; - miopenTensorDescriptor_t outputGradDesc; - - std::unique_ptr input_dev; - std::unique_ptr output_dev; - std::unique_ptr input_grad_dev; - std::unique_ptr output_grad_dev; - - std::vector input; - std::vector output; - std::vector output_host; - std::vector input_grad; - std::vector input_grad_host; - std::vector output_grad; - std::vector ksize; - std::vector stride; - std::vector padding; - - bool ceil_mode; - bool count_include_pad; - int32_t divisor_override; - int32_t N, C, D, H, W, OD, OH, OW; - - std::vector in_dim; -}; - -template -int AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) -{ - inflags.Parse(argc, argv); - - if(inflags.GetValueInt("time") == 1) - { - miopenEnableProfiling(GetHandle(), true); - } - return miopenStatusSuccess; -} - -template -std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) -{ - std::string lengthsStr = inflags.GetValueStr(param); - - std::vector lengths; - std::size_t pos = 0; - std::size_t new_pos; - - new_pos = lengthsStr.find(',', pos); - while(new_pos != std::string::npos) - { - std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); - - int len = std::stoi(sliceStr); - - lengths.push_back(len); - - pos = new_pos + 1; - new_pos = lengthsStr.find(',', pos); - }; - - std::string sliceStr = lengthsStr.substr(pos); - int len = std::stoi(sliceStr); - - lengths.push_back(len); - - return (lengths); -} - -template -int AvgPoolDriver::GetandSetData() -{ - in_dim = GetInputTensorDimsFromCmd("input_dims"); - int ksp_dim = in_dim.size() - 2; - ksize = GetInputTensorDimsFromCmd("kernel_size"); - stride = GetInputTensorDimsFromCmd("stride"); - padding = GetInputTensorDimsFromCmd("padding"); - - if(ksize.size() != ksp_dim) - { - int ref = ksp_dim - ksize.size(); - while((ref--) != 0) - ksize.push_back(ksize[0]); - } - if(stride.size() != ksp_dim) - { - int ref = 
ksp_dim - stride.size(); - while((ref--) != 0) - stride.push_back(stride[0]); - } - if(padding.size() != ksp_dim) - { - int ref = ksp_dim - padding.size(); - while((ref--) != 0) - padding.push_back(padding[0]); - } - - ceil_mode = static_cast(inflags.GetValueInt("ceil_mode")); - count_include_pad = static_cast(inflags.GetValueInt("count_include_pad")); - divisor_override = inflags.GetValueInt("divisor_override"); - - N = in_dim[0]; - C = in_dim[1]; - D = in_dim.size() == 5 ? in_dim[2] : 1; - H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; - W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; - - std::vector out_dim; - if(in_dim.size() == 5) - { - if(ceil_mode) - { - OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - else - { - OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - out_dim = std::vector{N, C, OD, OH, OW}; - } - else - { - if(ceil_mode) - { - OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - else - { - OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - out_dim = std::vector{N, C, OH, OW}; - } - SetTensorNd(inputDesc, in_dim, data_type); - SetTensorNd(outputDesc, out_dim, data_type); - SetTensorNd(outputGradDesc, out_dim, data_type); - SetTensorNd(inputGradDesc, in_dim, data_type); - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::AddCmdLineArgs() -{ - inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AvgPool (Default=1)", "int"); - 
inflags.AddInputFlag( - "input_dims", - 'D', - "2,3,7,9", - "The dimensional lengths of the input tensor: N,C,D1,D2,... Example: 2,3,7,9.", - "string"); - inflags.AddInputFlag( - "kernel_size", 'k', "1,1", "The size of the window D1,D2,... Example: 1,1.", "string"); - inflags.AddInputFlag( - "stride", - 's', - "1,1", - "The stride of the window. Default value is kernel_size D1,D2,... Example: 1,1.", - "string"); - inflags.AddInputFlag("padding", - 'p', - "0,0", - "Implicit zero padding to be added on both sides D1,D2,... Example: 0,0.", - "string"); - inflags.AddInputFlag("ceil_mode", - 'c', - "1", - "When 1, will use ceil instead of floor to compute the output shape.", - "int"); - inflags.AddInputFlag("count_include_pad", - 'P', - "0", - "When 1, will include the zero-padding in the averaging calculation.", - "int"); - inflags.AddInputFlag("divisor_override", - 'd', - "0", - "If specified, it will be used as divisor, otherwise size of the pooling " - "region will be used.", - "int"); - - inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); - inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); - inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); - inflags.AddInputFlag( - "wall", 'w', "0", "Wall-clock Time, Requires time == 1 (Default=0)", "int"); - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::AllocateBuffersAndCopy() -{ - size_t input_sz = GetTensorSize(inputDesc); - size_t output_sz = GetTensorSize(outputDesc); - - uint32_t ctx = 0; - - input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); - output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); - input_grad_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); - output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); - - input = std::vector(input_sz, static_cast(0)); - output = std::vector(output_sz, static_cast(0)); - output_host = std::vector(output_sz, 
static_cast(0)); - - input_grad = std::vector(input_sz, static_cast(0)); - input_grad_host = std::vector(input_sz, static_cast(0)); - output_grad = std::vector(output_sz, static_cast(0)); - - int status; - - for(int i = 0; i < input_sz; i++) - { - input[i] = prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); - } - status = input_dev->ToGPU(q, input.data()); - - status |= output_dev->ToGPU(q, output.data()); - - status |= input_grad_dev->ToGPU(q, input_grad.data()); - - for(int i = 0; i < output_sz; i++) - { - output_grad[i] = prng::gen_A_to_B(static_cast(-1.0), static_cast(1.0)); - } - status |= output_grad_dev->ToGPU(q, output_grad.data()); - - if(status != 0) - std::cout << "Error copying data to GPU\n" << std::endl; - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::RunForwardGPU() -{ - float kernel_total_time = 0.0; - float kernel_first_time = 0.0; - - Timer t; - START_TIME - - for(int i = 0; i < inflags.GetValueInt("iter"); i++) - { - miopenAvgPoolForward(GetHandle(), - inputDesc, - input_dev->GetMem(), - outputDesc, - output_dev->GetMem(), - ksize.size() == 3 ? ksize[0] : 0, - ksize.size() == 3 ? ksize[1] : ksize[0], - ksize.size() == 3 ? ksize[2] : ksize[1], - stride.size() == 3 ? stride[0] : 0, - stride.size() == 3 ? stride[1] : stride[0], - stride.size() == 3 ? stride[2] : stride[1], - padding.size() == 3 ? padding[0] : 0, - padding.size() == 3 ? padding[1] : padding[0], - padding.size() == 3 ? padding[2] : padding[1], - count_include_pad, - divisor_override); - - float time = 0.0; - miopenGetKernelTime(GetHandle(), &time); - kernel_total_time += time; - if(i == 0) - kernel_first_time = time; - } - - if(inflags.GetValueInt("time") == 1) - { - STOP_TIME - int iter = inflags.GetValueInt("iter"); - if(WALL_CLOCK) - printf("Wall-clock Time Forward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); - - float kernel_average_time = - iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - printf("GPU Kernel Time Forward AvgPool Elapsed: %f ms\n", kernel_average_time); - } - - output_dev->FromGPU(GetStream(), output.data()); - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::RunForwardCPU() -{ - if(in_dim.size() == 4) - { - mloAvgPoolForward2dRunHost(inputDesc, - outputDesc, - input.data(), - output_host.data(), - N, - C, - H, - W, - OH, - OW, - ksize.data(), - stride.data(), - padding.data(), - count_include_pad, - divisor_override); - } - else if(in_dim.size() == 5) - { - mloAvgPoolForward3dRunHost(inputDesc, - outputDesc, - input.data(), - output_host.data(), - N, - C, - D, - H, - W, - OD, - OH, - OW, - ksize.data(), - stride.data(), - padding.data(), - count_include_pad, - divisor_override); - } - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::RunBackwardGPU() -{ - float kernel_total_time = 0.0; - float kernel_first_time = 0.0; - - Timer t; - START_TIME - - for(int i = 0; i < inflags.GetValueInt("iter"); i++) - { - miopenAvgPoolBackward(GetHandle(), - outputGradDesc, - output_grad_dev->GetMem(), - inputGradDesc, - input_grad_dev->GetMem(), - ksize.size() == 3 ? ksize[0] : 0, - ksize.size() == 3 ? ksize[1] : ksize[0], - ksize.size() == 3 ? ksize[2] : ksize[1], - stride.size() == 3 ? stride[0] : 0, - stride.size() == 3 ? stride[1] : stride[0], - stride.size() == 3 ? stride[2] : stride[1], - padding.size() == 3 ? padding[0] : 0, - padding.size() == 3 ? padding[1] : padding[0], - padding.size() == 3 ? 
padding[2] : padding[1], - count_include_pad, - divisor_override); - - float time = 0.0; - miopenGetKernelTime(GetHandle(), &time); - kernel_total_time += time; - if(i == 0) - kernel_first_time = time; - } - - if(inflags.GetValueInt("time") == 1) - { - STOP_TIME - int iter = inflags.GetValueInt("iter"); - if(WALL_CLOCK) - printf("Wall-clock Time Backward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); - - float kernel_average_time = - iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - printf("GPU Kernel Time Backward AvgPool Elapsed: %f ms\n", kernel_average_time); - } - - input_grad_dev->FromGPU(GetStream(), input_grad.data()); - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::RunBackwardCPU() -{ - if(in_dim.size() == 4) - { - mloAvgPoolBackward2dRunHost(outputGradDesc, - inputGradDesc, - output_grad.data(), - input_grad_host.data(), - N, - C, - H, - W, - OH, - OW, - ksize.data(), - stride.data(), - padding.data(), - count_include_pad, - divisor_override); - } - else if(in_dim.size() == 5) - { - mloAvgPoolBackward3dRunHost(outputGradDesc, - inputGradDesc, - output_grad.data(), - input_grad_host.data(), - N, - C, - D, - H, - W, - OD, - OH, - OW, - ksize.data(), - stride.data(), - padding.data(), - count_include_pad, - divisor_override); - } - return miopenStatusSuccess; -} - -template -Tref AvgPoolDriver::GetTolerance() -{ - // Computation error of fp16 is ~2^13 (=8192) bigger than - // the one of fp32 because mantissa is shorter by 13 bits. - auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; - - // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
- if(std::is_same::value) - tolerance *= 8.0; - return tolerance; -} - -template -int AvgPoolDriver::VerifyForward() -{ - RunForwardCPU(); - const Tref tolerance = GetTolerance(); - auto error = miopen::rms_range(output_host, output); - - if(!std::isfinite(error) || error > tolerance) - { - std::cout << "Forward AvgPool FAILED: " << error << std::endl; - return EC_VerifyFwd; - } - else - { - printf("Forward AvgPool Verifies on CPU and GPU (err=%f)\n", error); - } - - return miopenStatusSuccess; -} - -template -int AvgPoolDriver::VerifyBackward() -{ - RunBackwardCPU(); - const Tref tolerance = GetTolerance(); - auto error = miopen::rms_range(input_grad_host, input_grad); - - if(!std::isfinite(error) || error > tolerance) - { - std::cout << "Backward AvgPool FAILED: " << error << std::endl; - return EC_VerifyFwd; - } - else - { - printf("Backward AvgPool Verifies on CPU and GPU (err=%f)\n", error); - } - return miopenStatusSuccess; -} - -#endif // GUARD_MIOPEN_AVGPOOL_DRIVER_HPP diff --git a/driver/dm_avgpool.cpp b/driver/dm_adaptiveavgpool.cpp similarity index 81% rename from driver/dm_avgpool.cpp rename to driver/dm_adaptiveavgpool.cpp index ec0e457056..b6e53ba17e 100644 --- a/driver/dm_avgpool.cpp +++ b/driver/dm_adaptiveavgpool.cpp @@ -24,16 +24,16 @@ * *******************************************************************************/ #include "registry_driver_maker.hpp" -#include "avgpool_driver.hpp" +#include "adaptiveavgpool_driver.hpp" static Driver* makeDriver(const std::string& base_arg) { - if(base_arg == "avgpool") - return new AvgPoolDriver(); - if(base_arg == "avgpoolfp16") - return new AvgPoolDriver(); - if(base_arg == "avgpoolbfp16") - return new AvgPoolDriver(); + if(base_arg == "adaptiveavgpool") + return new AdaptiveAvgPoolDriver(); + if(base_arg == "adaptiveavgpoolfp16") + return new AdaptiveAvgPoolDriver(); + if(base_arg == "adaptiveavgpoolbfp16") + return new AdaptiveAvgPoolDriver(); return nullptr; } diff --git a/driver/driver.hpp 
b/driver/driver.hpp index bd42f6ee13..15e20ad55d 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], avgpool[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], adaptiveavgpool[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -206,8 +206,9 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && - arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "avgpool" && - arg != "avgpoolfp16" && arg != "avgpoolbfp16" && arg != "--version") + arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && + arg != "adaptiveavgpool" && arg != "adaptiveavgpoolfp16" && arg != "adaptiveavgpoolbfp16" && + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp new file mode 100644 index 0000000000..1c45f16213 --- /dev/null +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -0,0 +1,337 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MLO_ADAPTIVEAVGPOOLHOST_H_ +#define MLO_ADAPTIVEAVGPOOLHOST_H_ + +#include +#include +#include + +template +int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t H, + size_t OH) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(outputDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nc = gid / OH, oh = gid % OH; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + + float sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) + { + sum += static_cast(input[input_tv.get_tensor_view_idx({n, c, ih})]); + } + + output[output_tv.get_tensor_view_idx({n, c, oh})] = static_cast(sum / kh); + } + return 0; +} + +template +int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncoh = gid / OW, ow = gid % OW; + size_t nc = ncoh / OH, oh = ncoh % OH; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + 
size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + + size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + + float divider = static_cast(kh * kw); + float sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) + { + for(size_t iw = w; iw < (w + kw); ++iw) + { + sum += static_cast(input[input_tv.get_tensor_view_idx({n, c, ih, iw})]); + } + } + + output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(sum / divider); + } + return 0; +} + +template +int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncodoh = gid / OW, ow = gid % OW; + size_t ncod = ncodoh / OH, oh = ncodoh % OH; + size_t nc = ncod / OD, od = ncod % OD; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t d = static_cast(std::floor(static_cast(od * D) / OD)); + size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + + size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + + float sum = 0; + for(size_t id = d; id < (d + kd); ++id) + { + for(size_t ih = h; ih < (h + kh); ++ih) + { + for(size_t iw = w; iw < (w + kw); ++iw) + { + sum += + static_cast(input[input_tv.get_tensor_view_idx({n, c, id, ih, iw})]); 
+ } + } + } + + output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = + static_cast(sum / (kd * kh * kw)); + } + return 0; +} + +template +int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t H, + size_t OH) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nc = gid / H, h = gid % H; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + float grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(std::floor(static_cast(ih * H) / OH)); + grad += + static_cast(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / + kh; + } + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = static_cast(grad); + } + return 0; +} + +template +int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nch = gid / W, w = gid 
% W; + size_t nc = nch / H, h = nch % H; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); + size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + + float grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(std::floor(static_cast(ih * H) / OH)); + for(size_t iw = ow; iw < (ow + kow); ++iw) + { + size_t kw = static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(std::floor(static_cast(iw * W) / OW)); + grad += static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / + (kh * kw); + } + } + + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); + } + return 0; +} + +template +int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncdh = gid / W, w = gid % W; + size_t ncd = ncdh / H, h = ncdh % H; + size_t nc = ncd / D, d = ncd % D; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return 0; + + size_t od = static_cast(std::floor(static_cast(d * OD) / D)); + size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t 
koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); + size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + + float grad = 0; + for(size_t id = od; id < (od + kod); ++id) + { + size_t kd = static_cast(std::ceil(static_cast((id + 1) * D) / OD)) - + static_cast(std::floor(static_cast(id * D) / OD)); + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(std::floor(static_cast(ih * H) / OH)); + for(size_t iw = ow; iw < (ow + kow); ++iw) + { + size_t kw = + static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(std::floor(static_cast(iw * W) / OW)); + grad += + static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / + (kd * kh * kw); + } + } + } + + input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); + } + return 0; +} + +#endif // MLO_ADAPTIVEAVGPOOLHOST_H_ diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp deleted file mode 100644 index 6980ce968e..0000000000 --- a/driver/mloAvgPoolHost.hpp +++ /dev/null @@ -1,438 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef MLO_AVGPOOLHOST_H_ -#define MLO_AVGPOOLHOST_H_ - -#include -#include - -template -int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, - const miopenTensorDescriptor_t outputDesc, - Tgpu* input, - Tcheck* output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = miopen::deref(inputDesc).GetLengths(); - auto numel = miopen::deref(outputDesc).GetElementSize(); - - auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); - auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); - - for(int32_t gid = 0; gid < numel; gid++) - { - int32_t ncoh = gid / OW, ow = gid % OW; - int32_t nc = ncoh / OH, oh = ncoh % OH; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; - - if(n >= N) - return 0; - - float m = 0; - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, h, w) - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - m += static_cast( - 
input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); - } - } - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - float val = m / divide_factor; - - output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = - static_cast(val); - } - return 0; -} - -template -int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, - const miopenTensorDescriptor_t outputDesc, - Tgpu* input, - Tcheck* output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = miopen::deref(inputDesc).GetLengths(); - auto numel = miopen::deref(outputDesc).GetElementSize(); - - auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); - auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); - - for(int32_t gid = 0; gid < numel; gid++) - { - int32_t ncodoh = gid / OW, ow = gid % OW; - int32_t ncod = ncodoh / OH, oh = ncodoh % OH; - int32_t nc = ncod / OD, od = ncod % OD; - int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; - - if(n >= N) - return 0; - float sum = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - 
for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, d, h, w) - int32_t d = od * sd - pd + kd; - if(d < 0 || d >= D) - continue; - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - sum += static_cast( - input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); - } - } - } - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - float val = sum / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = - static_cast(val); - } - return 0; -} - -template -int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, - const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, - Tcheck* input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = miopen::deref(inputGradDesc).GetLengths(); - auto numel = miopen::deref(inputGradDesc).GetElementSize(); - - auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); - auto input_grad_tv = 
miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); - - for(size_t gid = 0; gid < numel; gid++) - { - int32_t nch = gid / W, w = gid % W; - int32_t nc = nch / H, h = nch % H; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; - - if(n >= N) - return 0; - - float grad = 0; - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - - grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<4>(n, c, oh, ow))]) / - divide_factor; - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = - static_cast(grad); - } - return 0; -} - -template -int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, - const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, - Tcheck* input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = 
miopen::deref(inputGradDesc).GetLengths(); - auto numel = miopen::deref(inputGradDesc).GetElementSize(); - - auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); - auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); - - for(size_t gid = 0; gid < numel; gid++) - { - int32_t ncdh = gid / W, w = gid % W; - int32_t ncd = ncdh / H, h = ncdh % H; - int32_t nc = ncd / D, d = ncd % D; - int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; - - if(n >= N) - return 0; - - float grad = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t odsd = d + pd - kd; - if(odsd % sd != 0) - continue; - int32_t od = odsd / sd; - if(od < 0 || od >= OD) - continue; - - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - grad += 
static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<5>(n, c, od, oh, ow))]) / - divide_factor; - } - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = - static_cast(grad); - } - return 0; -} - -#endif // MLO_AVGPOOLHOST_H_ diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 57aeeb5d3b..2e44b62588 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7638,11 +7638,11 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, * @param output Data tensor output (output) * @return miopenStatus_t */ -MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output); +MIOPEN_EXPORT miopenStatus_t miopenAdaptiveAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output); /*! 
@brief Execute an adaptiveavgpool backward layer * @@ -7653,11 +7653,12 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, * @param input_grad Data tensor input grad (output) * @return miopenStatus_t */ -MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t outputGradDesc, - const void* output_grad, - const miopenTensorDescriptor_t inputGradDesc, - void* input_grad); +MIOPEN_EXPORT miopenStatus_t +miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad); /** @} */ // CLOSEOUT adaptiveavgpool DOXYGEN GROUP #endif // MIOPEN_BETA_API diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ae621b28ad..f46579e007 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -490,7 +490,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ${GPU_BATCHED_TRANSPOSE_KERNEL_HIP} ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_SOURCE} kernels/MIOpenAdam.cpp - kernels/MIOpenAvgPool.cpp + kernels/MIOpenAdaptiveAvgPool.cpp kernels/MIOpenCat.cpp kernels/MIOpenCheckNumerics.cpp kernels/MIOpenBatchNormActivBwdPerAct.cl diff --git a/src/include/miopen/adaptiveavgpool/problem_description.hpp b/src/include/miopen/adaptiveavgpool/problem_description.hpp index 53be89cd89..adec5759e7 100644 --- a/src/include/miopen/adaptiveavgpool/problem_description.hpp +++ b/src/include/miopen/adaptiveavgpool/problem_description.hpp @@ -42,6 +42,7 @@ struct FwdProblemDescription : ProblemDescriptionBase : inputDesc(inputDesc_), outputDesc(outputDesc_) { IsValidLength(); + IsValidDims(); } auto GetInputDesc() const { return inputDesc; } @@ -59,9 +60,68 @@ struct FwdProblemDescription : ProblemDescriptionBase "AdaptiveAvgPool: Input and output tensor sizes do not match."); } + if(input_dims == 3) + { + if(outputDesc.GetLengths()[2] > inputDesc.GetLengths()[2]) + { + 
MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input tensor sizes are too small compare to output " + "tensor sizes."); + } + } + else if(input_dims == 4) + { + if(outputDesc.GetLengths()[2] > inputDesc.GetLengths()[2] || + outputDesc.GetLengths()[3] > inputDesc.GetLengths()[3]) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input tensor sizes are too small compare to output " + "tensor sizes."); + } + } + else if(input_dims == 5) + { + if(outputDesc.GetLengths()[2] > inputDesc.GetLengths()[2] || + outputDesc.GetLengths()[3] > inputDesc.GetLengths()[3] || + outputDesc.GetLengths()[4] > inputDesc.GetLengths()[4]) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input tensor sizes are too small compare to output " + "tensor sizes."); + } + } + + return true; + } + + bool IsValidDims() const + { + if(inputDesc.GetLengths().size() > 5 || inputDesc.GetLengths().size() < 3) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Only 3D, 4D and 5D tensors are supported."); + } + return true; } + bool IsAllContiguous() const + { + auto isContiguous = [](TensorDescriptor td) { + size_t s = 1; + for(int i = td.GetNumDims() - 1; i >= 0; --i) + { + if(s != td.GetStrides()[i]) + { + return false; + } + s *= td.GetLengths()[i]; + } + return true; + }; + return isContiguous(inputDesc) && isContiguous(outputDesc); + } + NetworkConfig MakeNetworkConfig() const override; protected: @@ -76,6 +136,7 @@ struct BwdProblemDescription : ProblemDescriptionBase : outputGradDesc(outputGradDesc_), inputGradDesc(inputGradDesc_) { IsValidLength(); + IsValidDims(); } auto GetOutputGradDesc() const { return outputGradDesc; } @@ -93,9 +154,68 @@ struct BwdProblemDescription : ProblemDescriptionBase "AdaptiveAvgPool: Input grad and output grad tensor sizes do not match."); } + if(input_dims == 3) + { + if(outputGradDesc.GetLengths()[2] > inputGradDesc.GetLengths()[2]) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input grad tensor sizes are too 
small compare to " + "output grad tensor sizes."); + } + } + else if(input_dims == 4) + { + if(outputGradDesc.GetLengths()[2] > inputGradDesc.GetLengths()[2] || + outputGradDesc.GetLengths()[3] > inputGradDesc.GetLengths()[3]) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input grad tensor sizes are too small compare to " + "output grad tensor sizes."); + } + } + else if(input_dims == 5) + { + if(outputGradDesc.GetLengths()[2] > inputGradDesc.GetLengths()[2] || + outputGradDesc.GetLengths()[3] > inputGradDesc.GetLengths()[3] || + outputGradDesc.GetLengths()[4] > inputGradDesc.GetLengths()[4]) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input grad tensor sizes are too small compare to " + "output grad tensor sizes."); + } + } + + return true; + } + + bool IsValidDims() const + { + if(inputGradDesc.GetLengths().size() > 5 || inputGradDesc.GetLengths().size() < 3) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Only 3D, 4D and 5D tensors are supported."); + } + return true; } + bool IsAllContiguous() const + { + auto isContiguous = [](TensorDescriptor td) { + size_t s = 1; + for(int i = td.GetNumDims() - 1; i >= 0; --i) + { + if(s != td.GetStrides()[i]) + { + return false; + } + s *= td.GetLengths()[i]; + } + return true; + }; + return isContiguous(inputGradDesc) && isContiguous(outputGradDesc); + } + NetworkConfig MakeNetworkConfig() const override; protected: diff --git a/src/kernels/MIOpenAdaptiveAvgPool.cpp b/src/kernels/MIOpenAdaptiveAvgPool.cpp index d29a03ab1d..17877fdf0c 100644 --- a/src/kernels/MIOpenAdaptiveAvgPool.cpp +++ b/src/kernels/MIOpenAdaptiveAvgPool.cpp @@ -23,7 +23,6 @@ * SOFTWARE. 
* *******************************************************************************/ -#include #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS #include #include @@ -41,14 +40,14 @@ #endif template -__device__ void avgPoolForward1d(const TI* __restrict__ input, - TO* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t OH, - tensor_view_t<3> input_tv, - tensor_view_t<3> output_tv) +__device__ void adaptiveAvgPoolForward1d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> input_tv, + tensor_view_t<3> output_tv) { size_t gid = threadIdx.x + blockIdx.x * blockDim.x; size_t nc = gid / OH, oh = gid % OH; @@ -56,38 +55,38 @@ __device__ void avgPoolForward1d(const TI* __restrict__ input, if(n >= N) return; - int32_t h = (int32_t)floor((float)(oh * H) / OH); - int32_t kh = (int32_t)ceil((float)((oh + 1) * H) / OH) - h; + size_t h = static_cast(floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; - DTYPE_ACCURATE sum = 0; - for(int ih = h; ih < (h + kh); ++ih) + FLOAT_ACCUM sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) { - sum += GET_3D_VAL_AT(input, n, c, ih); + sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, ih})]); } - - SET_3D_VAL_AT(output, n, c, oh, sum / kh); + output[output_tv.get_tensor_view_idx({n, c, oh})] = CVT_ACCUM2FLOAT(sum / kh); } -extern "C" __global__ void AvgPoolForward1d(const INPUT_TYPE* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t OH, - tensor_view_t<3> input_tv, - tensor_view_t<3> output_tv) +extern "C" __global__ void AdaptiveAvgPoolForward1d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> input_tv, + tensor_view_t<3> output_tv) { - avgPoolForward1d(input, output, N, C, H, OH, input_tv, output_tv); + adaptiveAvgPoolForward1d( + input, output, N, C, H, 
OH, input_tv, output_tv); } template -__device__ void avgPoolBackward1d(const TI* __restrict__ output_grad, - TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t OH, - tensor_view_t<3> output_grad_tv, - tensor_view_t<3> input_grad_tv) +__device__ void adaptiveAvgPoolBackward1d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t OH, + tensor_view_t<3> output_grad_tv, + tensor_view_t<3> input_grad_tv) { size_t gid = threadIdx.x + blockIdx.x * blockDim.x; size_t nc = gid / H, h = gid % H; @@ -95,310 +94,276 @@ __device__ void avgPoolBackward1d(const TI* __restrict__ output_grad, if(n >= N) return; - int32_t oh = (int32_t)floor((float)(h * OH) / H); - int32_t koh = (int32_t)ceil((float)((h + 1) * OH) / H) - oh; + size_t oh = static_cast(floor(static_cast(h * OH) / H)); + size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; - DTYPE_ACCURATE grad = 0; - for(int ih = oh; ih < (oh + koh); ++ih) + FLOAT_ACCUM grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) { - int32_t kh = - (int32_t)ceil((float)((ih + 1) * H) / OH) - (int32_t)floor((float)(ih * H) / OH); - grad += GET_3D_VAL_AT(output_grad, n, c, ih) / kh; + size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / kh; } - - SET_3D_VAL_AT(input_grad, n, c, h, grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = CVT_ACCUM2FLOAT(grad); } -extern "C" __global__ void AvgPoolBackward1d(const INPUT_TYPE* __restrict__ output_grad, - OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t OH, - tensor_view_t<3> output_grad_tv, - tensor_view_t<3> input_grad_tv) + +extern "C" __global__ void AdaptiveAvgPoolBackward1d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t OH, + 
tensor_view_t<3> output_grad_tv, + tensor_view_t<3> input_grad_tv) { - avgPoolBackward1d( + adaptiveAvgPoolBackward1d( output_grad, input_grad, N, C, H, OH, output_grad_tv, input_grad_tv); } template -__device__ void avgPoolForward2d(const TI* __restrict__ input, - TO* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor_view_t<4> input_tv, - tensor_view_t<4> output_tv) +__device__ void adaptiveAvgPoolForward2d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncoh = gid / OW, ow = gid % OW; - int32_t nc = ncoh / OH, oh = ncoh % OH; - int32_t n = nc / C, c = nc % C; + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t ncoh = gid / OW, ow = gid % OW; + size_t nc = ncoh / OH, oh = ncoh % OH; + size_t n = nc / C, c = nc % C; if(n >= N) return; - size_t h = (size_t)floor((float)(oh * H) / OH); - size_t kh = (size_t)ceil((float)((oh + 1) * H) / OH) - h; + size_t h = static_cast(floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; - size_t w = (size_t)floor((float)(ow * W) / OW); - size_t kw = (size_t)ceil((float)((ow + 1) * W) / OW) - w; + size_t w = static_cast(floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; - FSTYPE divider = (FSTYPE)(kh * kw); - FSTYPE sum = 0; + FLOAT_ACCUM divider = static_cast(kh * kw); + FLOAT_ACCUM sum = 0; for(size_t ih = h; ih < (h + kh); ++ih) { for(size_t iw = w; iw < (w + kw); ++iw) { - sum += GET_4D_VAL_AT(input, n, c, ih, iw); + sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, ih, iw})]); } } - - SET_4D_VAL_AT(output, n, c, oh, ow, sum / divider); - - output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = CVT_ACCUM2FLOAT(val); + 
output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = CVT_ACCUM2FLOAT(sum / divider); } -extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor_view_t<4> input_tv, - tensor_view_t<4> output_tv) +extern "C" __global__ void AdaptiveAvgPoolForward2d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) { - avgPoolForward2d( + adaptiveAvgPoolForward2d( input, output, N, C, H, W, OH, OW, input_tv, output_tv); } template -__device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, - TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor_view_t<4> output_grad_tv, - tensor_view_t<4> input_grad_tv) +__device__ void adaptiveAvgPoolBackward2d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t nch = gid / W, w = gid % W; - int32_t nc = nch / H, h = nch % H; - int32_t n = nc / C, c = nc % C; + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t nch = gid / W, w = gid % W; + size_t nc = nch / H, h = nch % H; + size_t n = nc / C, c = nc % C; if(n >= N) return; - size_t oh = (size_t)floor((float)(h * OH) / H); - size_t koh = (size_t)ceil((float)((h + 1) * OH) / H) - oh; + size_t oh = static_cast(floor(static_cast(h * OH) / H)); + size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; - size_t ow = (size_t)floor((float)(w * OW) / W); - size_t kow = (size_t)ceil((float)((w + 1) * OW) / W) - ow; + size_t ow = static_cast(floor(static_cast(w * OW) / W)); + size_t kow = 
static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; FLOAT_ACCUM grad = 0; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = (size_t)ceil((float)((ih + 1) * H) / OH) - (size_t)floor((float)(ih * H) / OH); + size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); for(size_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = - (size_t)ceil((float)((iw + 1) * W) / OW) - (size_t)floor((float)(iw * W) / OW); - grad += (FSTYPE)(GET_4D_VAL_AT(output_grad, n, c, ih, iw)) / (kh * kw); + size_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(floor(static_cast(iw * W) / OW)); + grad += + CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / + (kh * kw); } } - SET_4D_VAL_AT(input_grad, n, c, h, w, grad); - - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = - CVT_ACCUM2FLOAT(grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = CVT_ACCUM2FLOAT(grad); } -extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, - OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor_view_t<4> output_grad_tv, - tensor_view_t<4> input_grad_tv) +extern "C" __global__ void AdaptiveAvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) { - avgPoolBackward2d( + adaptiveAvgPoolBackward2d( output_grad, input_grad, N, C, H, W, OH, OW, output_grad_tv, input_grad_tv); } -// __kernel void AdaptiveAvgpool2dBackward1x1OutputNHWC(const __global DTYPE_PTR output_grad, -// __global DTYPE_PTR input_grad, -// const int32_t N, -// const int32_t C, -// const int32_t HW, -// const int32_t output_grad_off, -// const int32_t input_grad_off) -// { -// /* VSIZE 2 and 16 is 
fastest but don't know why */ -// #define VSIZE 2 -// size_t gid = get_global_id(0) * VSIZE; -// size_t c = gid % C; -// size_t n = gid / C; -// if(n >= N) -// return; - -// __global DTYPE_VEC_PTR(VSIZE) output_grad_vec = -// (__global DTYPE_VEC_PTR(VSIZE))(output_grad + n * C + c + output_grad_off); - -// DTYPE_VEC(VSIZE) output_grad_v = GET(output_grad_vec, 0) / HW; - -// __global DTYPE_VEC_PTR(VSIZE) input_grad_vec = -// (__global DTYPE_VEC_PTR(VSIZE))(input_grad + n * C * HW + c + input_grad_off); - -// for(size_t i = 0; i < HW; ++i) -// { -// SET(input_grad_vec, i * C / VSIZE, output_grad_v); -// } -// #undef VSIZE -// } - template -__device__ void avgPoolForward3d(const TI* __restrict__ input, - TO* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor_view_t<5> input_tv, - tensor_view_t<5> output_tv) +__device__ void adaptiveAvgPoolForward3d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncodoh = gid / OW, ow = gid % OW; - int32_t ncod = ncodoh / OH, oh = ncodoh % OH; - int32_t nc = ncod / OD, od = ncod % OD; - int32_t n = nc / C, c = nc % C; + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t ncodoh = gid / OW, ow = gid % OW; + size_t ncod = ncodoh / OH, oh = ncodoh % OH; + size_t nc = ncod / OD, od = ncod % OD; + size_t n = nc / C, c = nc % C; if(n >= N) return; - int32_t d = (int32_t)floor((float)(od * D) / OD); - int32_t kd = (int32_t)ceil((float)((od + 1) * D) / OD) - d; + size_t d = static_cast(floor(static_cast(od * D) / OD)); + size_t kd = static_cast(ceil(static_cast((od + 1) * D) / OD)) - d; - int32_t h = (int32_t)floor((float)(oh * H) / OH); - int32_t kh = (int32_t)ceil((float)((oh + 1) * H) / OH) - h; + size_t h = 
static_cast(floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; - int32_t w = (int32_t)floor((float)(ow * W) / OW); - int32_t kw = (int32_t)ceil((float)((ow + 1) * W) / OW) - w; + size_t w = static_cast(floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; - DTYPE_ACCURATE sum = 0; - for(int32_t id = d; id < (d + kd); ++id) + FLOAT_ACCUM sum = 0; + for(size_t id = d; id < (d + kd); ++id) { - for(int32_t ih = h; ih < (h + kh); ++ih) + for(size_t ih = h; ih < (h + kh); ++ih) { - for(int32_t iw = w; iw < (w + kw); ++iw) + for(size_t iw = w; iw < (w + kw); ++iw) { - sum += GET_5D_VAL_AT(input, n, c, id, ih, iw); + sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, id, ih, iw})]); } } } - output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = - CVT_ACCUM2FLOAT(val); - SET_5D_VAL_AT(output, n, c, od, oh, ow, sum / (kd * kh * kw)); + output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = + CVT_ACCUM2FLOAT(sum / (kd * kh * kw)); } -extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor_view_t<5> input_tv, - tensor_view_t<5> output_tv) +extern "C" __global__ void AdaptiveAvgPoolForward3d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) { - avgPoolForward3d( + adaptiveAvgPoolForward3d( input, output, N, C, D, H, W, OD, OH, OW, input_tv, output_tv); } template -__device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, - TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor_view_t<5> 
output_grad_tv, - tensor_view_t<5> input_grad_tv) +__device__ void adaptiveAvgPoolBackward3d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncdh = gid / W, w = gid % W; - int32_t ncd = ncdh / H, h = ncdh % H; - int32_t nc = ncd / D, d = ncd % D; - int32_t n = nc / C, c = nc % C; + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + size_t ncdh = gid / W, w = gid % W; + size_t ncd = ncdh / H, h = ncdh % H; + size_t nc = ncd / D, d = ncd % D; + size_t n = nc / C, c = nc % C; if(n >= N) return; - int32_t od = (int32_t)floor((float)(d * OD) / D); - int32_t kod = (int32_t)ceil((float)((d + 1) * OD) / D) - od; + size_t od = static_cast(floor(static_cast(d * OD) / D)); + size_t kod = static_cast(ceil(static_cast((d + 1) * OD) / D)) - od; - int32_t oh = (int32_t)floor((float)(h * OH) / H); - int32_t koh = (int32_t)ceil((float)((h + 1) * OH) / H) - oh; + size_t oh = static_cast(floor(static_cast(h * OH) / H)); + size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; - int32_t ow = (int32_t)floor((float)(w * OW) / W); - int32_t kow = (int32_t)ceil((float)((w + 1) * OW) / W) - ow; + size_t ow = static_cast(floor(static_cast(w * OW) / W)); + size_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; - DTYPE_ACCURATE grad = 0; - for(int32_t id = od; id < (od + kod); ++id) + FLOAT_ACCUM grad = 0; + for(size_t id = od; id < (od + kod); ++id) { - int32_t kd = - (int32_t)ceil((float)((id + 1) * D) / OD) - (int32_t)floor((float)(id * D) / OD); - for(int32_t ih = oh; ih < (oh + koh); ++ih) + size_t kd = static_cast(ceil(static_cast((id + 1) * D) / OD)) - + static_cast(floor(static_cast(id * D) / OD)); + for(size_t ih = oh; ih < (oh + koh); ++ih) { - int32_t kh = - (int32_t)ceil((float)((ih + 1) * H) / OH) - 
(int32_t)floor((float)(ih * H) / OH); - for(int32_t iw = ow; iw < (ow + kow); ++iw) + size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); + for(size_t iw = ow; iw < (ow + kow); ++iw) { - int32_t kw = (int32_t)ceil((float)((iw + 1) * W) / OW) - - (int32_t)floor((float)(iw * W) / OW); - grad += GET_5D_VAL_AT(output_grad, n, c, id, ih, iw) / (kd * kh * kw); + size_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(floor(static_cast(iw * W) / OW)); + grad += CVT_FLOAT2ACCUM( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / + (kd * kh * kw); } } } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = - CVT_ACCUM2FLOAT(grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = CVT_ACCUM2FLOAT(grad); } -extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, - OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor_view_t<5> output_grad_tv, - tensor_view_t<5> input_grad_tv) +extern "C" __global__ void AdaptiveAvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) { - avgPoolBackward3d( + adaptiveAvgPoolBackward3d( output_grad, input_grad, N, C, D, H, W, OD, OH, OW, output_grad_tv, input_grad_tv); } diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d64dbf21f9..12394dbde6 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -27,6 +27,8 @@ #ifndef GUARD_TENSOR_VIEW_HPP #define GUARD_TENSOR_VIEW_HPP +#include + template struct tensor_layout_t; @@ -47,7 +49,6 @@ struct tensor_view_t uint64_t stride[N]; uint64_t size[N]; }; - template struct 
tensor_layout_t { @@ -72,44 +73,13 @@ struct tensor_layout_t } } - constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) - { - static_assert(N == 5); - layout[0] = n; - layout[1] = c; - layout[2] = d; - layout[3] = h; - layout[4] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w) - { - static_assert(N == 4); - layout[0] = n; - layout[1] = c; - layout[2] = h; - layout[3] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) - { - static_assert(N == 3); - layout[0] = n; - layout[1] = h; - layout[2] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t w) - { - static_assert(N == 2); - layout[0] = n; - layout[1] = w; - } - - constexpr tensor_layout_t(uint64_t n) + constexpr tensor_layout_t(std::initializer_list layout_) { - static_assert(N == 1); - layout[0] = n; + static_assert(N > 0); + for(auto i = 0; i < N; ++i) + { + layout[i] = layout_.begin()[i]; + } } uint64_t layout[N]; diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp index 1afb78de45..e97c9ec0a9 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_BWD_2D 256 +#define LOCAL_SIZE_BWD_1D 256 namespace miopen { @@ -43,29 +43,17 @@ namespace solver { namespace adaptiveavgpool { -bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd1d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { - auto dtype = problem.GetInputGradDesc().GetType(); - auto in_nelems = problem.GetInputGradDesc().GetElementSize(); - auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); - auto mul_nc = - problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; - auto in_over_out = static_cast(in_nelems) / out_nelems; - - 
if(dtype == miopenFloat) + if(!problem.IsAllContiguous()) { - return false; + return true; } - else if(dtype == miopenHalf) + else { - if(in_over_out < 2 && in_nelems >= 11075584) - { - return true; - } - } - else if(dtype == miopenBFloat16) - { - if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + auto mul_nc = problem.GetOutputGradDesc().GetLengths()[0] * + problem.GetOutputGradDesc().GetLengths()[1]; + if(mul_nc < 141312) { return true; } @@ -73,22 +61,22 @@ bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& probl return false; } -bool AdaptiveAvgPoolBackward2d::IsApplicable( +bool AdaptiveAvgPoolBackward1d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { - if(problem.GetInputGradDesc().GetNumDims() != 4 || - problem.GetOutputGradDesc().GetNumDims() != 4) + if(problem.GetInputGradDesc().GetNumDims() != 3 || + problem.GetOutputGradDesc().GetNumDims() != 3) { return false; } - if(!IsOverRocmBwd2d(problem)) + if(!IsOverRocmBwd1d(problem)) { return false; } return true; } -ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( +ConvSolution AdaptiveAvgPoolBackward1d::GetSolution( const ExecutionContext& context, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { @@ -108,10 +96,10 @@ ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_2D}, + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_1D}, {N_total}, "MIOpenAdaptiveAvgPool.cpp", - "AdaptiveAvgPoolBackward2d", + "AdaptiveAvgPoolBackward1d", build_params)); result.invoker_factory = [](const std::vector& kernels) { @@ -120,26 +108,16 @@ ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( decltype(auto) kernel = handle_.Run(kernels.front()); - auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); - auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); + auto input_grad_tv = get_inner_expanded_tv<3>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<3>(deref(params.outputGradDesc)); auto N = deref(params.inputGradDesc).GetLengths()[0]; auto C = deref(params.inputGradDesc).GetLengths()[1]; auto H = deref(params.inputGradDesc).GetLengths()[2]; - auto W = deref(params.inputGradDesc).GetLengths()[3]; auto OH = deref(params.outputGradDesc).GetLengths()[2]; - auto OW = deref(params.outputGradDesc).GetLengths()[3]; - - kernel(params.output_grad, - params.input_grad, - N, - C, - H, - W, - OH, - OW, - output_grad_tv, - input_grad_tv); + + kernel( + params.output_grad, params.input_grad, N, C, H, OH, output_grad_tv, input_grad_tv); }; }; diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp index 1afb78de45..dd8aeb9902 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp @@ -45,29 +45,33 @@ namespace adaptiveavgpool { bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { - auto dtype = problem.GetInputGradDesc().GetType(); - auto in_nelems = problem.GetInputGradDesc().GetElementSize(); - auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); - auto mul_nc = - 
problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; - auto in_over_out = static_cast(in_nelems) / out_nelems; - - if(dtype == miopenFloat) + if(problem.IsAllContiguous()) { return false; } - else if(dtype == miopenHalf) + else { - if(in_over_out < 2 && in_nelems >= 11075584) + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) { - return true; + if(in_nelems > 3801600) + return true; } - } - else if(dtype == miopenBFloat16) - { - if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + else if(dtype == miopenHalf) + { + if(in_over_out == 1 || (in_over_out >= 1024 && in_over_out <= 4096)) + return true; + } + else if(dtype == miopenBFloat16) { - return true; + if(in_over_out < 13 || (in_over_out >= 1024 && in_over_out <= 4096)) + { + return true; + } } } return false; diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp index 85bb5747f3..3ad93574de 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_FWD_2D 256 +#define LOCAL_SIZE_FWD_1D 256 namespace miopen { @@ -43,34 +43,15 @@ namespace solver { namespace adaptiveavgpool { -bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd1d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { - auto dtype = problem.GetOutputDesc().GetType(); - auto in_nelems = problem.GetInputDesc().GetElementSize(); - auto out_nelems = problem.GetOutputDesc().GetElementSize(); - auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_nelems = 
problem.GetInputDesc().GetLengths()[-1]; + auto out_nelems = problem.GetOutputDesc().GetLengths()[-1]; auto in_over_out = static_cast(in_nelems) / out_nelems; - if(dtype == miopenFloat) + if(in_over_out < 56) { - if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) - { - return true; - } - } - else if(dtype == miopenHalf) - { - if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) - { - return true; - } - } - else if(dtype == miopenBFloat16) - { - if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) - { - return true; - } + return true; } return false; } @@ -78,11 +59,11 @@ bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& probl bool AdaptiveAvgPoolForward1d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { - if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) + if(problem.GetInputDesc().GetNumDims() != 3 || problem.GetOutputDesc().GetNumDims() != 3) { return false; } - if(!IsOverRocmFwd2d(problem)) + if(!IsOverRocmFwd1d(problem)) { return false; } @@ -109,7 +90,7 @@ ConvSolution AdaptiveAvgPoolForward1d::GetSolution( {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; - result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_2D}, + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_1D}, {N_total}, "MIOpenAdaptiveAvgPool.cpp", "AdaptiveAvgPoolForward1d", @@ -121,17 +102,15 @@ ConvSolution AdaptiveAvgPoolForward1d::GetSolution( decltype(auto) kernel = handle_.Run(kernels.front()); - auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); - auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc)); size_t N = deref(params.inputDesc).GetLengths()[0]; size_t C = deref(params.inputDesc).GetLengths()[1]; size_t H = deref(params.inputDesc).GetLengths()[2]; - size_t W = deref(params.inputDesc).GetLengths()[3]; size_t OH = deref(params.outputDesc).GetLengths()[2]; - size_t OW = deref(params.outputDesc).GetLengths()[3]; - kernel(params.input, params.output, N, C, H, W, OH, OW, input_tv, output_tv); + kernel(params.input, params.output, N, C, H, OH, input_tv, output_tv); }; }; diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp index d1afc40842..92c120494e 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp @@ -28,7 +28,6 @@ #include "miopen/execution_context.hpp" #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" -#include #include #include @@ -46,32 +45,19 @@ namespace adaptiveavgpool { bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { - auto dtype = problem.GetOutputDesc().GetType(); - auto in_nelems = problem.GetInputDesc().GetElementSize(); - auto out_nelems = problem.GetOutputDesc().GetElementSize(); - auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + 
auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); auto in_over_out = static_cast(in_nelems) / out_nelems; - if(dtype == miopenFloat) + if(problem.IsAllContiguous()) { - if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) - { + if((in_over_out < 13) || (in_over_out >= 100 && in_over_out <= 112)) return true; - } } - else if(dtype == miopenHalf) + else { - if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) - { + if(in_over_out < 248) return true; - } - } - else if(dtype == miopenBFloat16) - { - if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) - { - return true; - } } return false; } diff --git a/test/cpu_adaptiveavgpool.hpp b/test/cpu_adaptiveavgpool.hpp new file mode 100644 index 0000000000..4b6dd99dda --- /dev/null +++ b/test/cpu_adaptiveavgpool.hpp @@ -0,0 +1,311 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_CPU_AVGPOOL_HPP +#define GUARD_CPU_AVGPOOL_HPP + +#include "tensor_holder.hpp" +#include + +template +void cpu_adaptiveavgpool_forward_1d( + tensor input, tensor& output, size_t N, size_t C, size_t H, size_t OH) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<3>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<3>(output.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nc = gid / OH, oh = gid % OH; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + + float sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) + { + sum += static_cast(input[input_tv.get_tensor_view_idx({n, c, ih})]); + } + + output[output_tv.get_tensor_view_idx({n, c, oh})] = static_cast(sum / kh); + } +} + +template +void cpu_adaptiveavgpool_forward_2d(tensor input, + tensor& output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncoh = gid / OW, ow = gid % OW; + size_t nc = ncoh / OH, oh = ncoh % OH; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(std::ceil(static_cast((oh 
+ 1) * H) / OH)) - h; + + size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + + float divider = static_cast(kh * kw); + float sum = 0; + for(size_t ih = h; ih < (h + kh); ++ih) + { + for(size_t iw = w; iw < (w + kw); ++iw) + { + sum += static_cast(input[input_tv.get_tensor_view_idx({n, c, ih, iw})]); + } + } + + output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(sum / divider); + } +} + +template +void cpu_adaptiveavgpool_forward_3d(tensor input, + tensor& output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncodoh = gid / OW, ow = gid % OW; + size_t ncod = ncodoh / OH, oh = ncodoh % OH; + size_t nc = ncod / OD, od = ncod % OD; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t d = static_cast(std::floor(static_cast(od * D) / OD)); + size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; + + size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); + size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + + size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); + size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + + float sum = 0; + for(size_t id = d; id < (d + kd); ++id) + { + for(size_t ih = h; ih < (h + kh); ++ih) + { + for(size_t iw = w; iw < (w + kw); ++iw) + { + sum += + static_cast(input[input_tv.get_tensor_view_idx({n, c, id, ih, iw})]); + } + } + } + + output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = + static_cast(sum / (kd * kh * kw)); + } +} + +template +void cpu_adaptiveavgpool_backward_1d( + tensor output_grad, tensor& input_grad, 
size_t N, size_t C, size_t H, size_t OH) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<3>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<3>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nc = gid / H, h = gid % H; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + float grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(std::floor(static_cast(ih * H) / OH)); + grad += + static_cast(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / + kh; + } + + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = static_cast(grad); + } +} + +template +void cpu_adaptiveavgpool_backward_2d(tensor output_grad, + tensor& input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t nch = gid / W, w = gid % W; + size_t nc = nch / H, h = nch % H; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); + size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + + float grad = 0; + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + 
static_cast(std::floor(static_cast(ih * H) / OH)); + for(size_t iw = ow; iw < (ow + kow); ++iw) + { + size_t kw = static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(std::floor(static_cast(iw * W) / OW)); + grad += static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / + (kh * kw); + } + } + + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); + } +} + +template +void cpu_adaptiveavgpool_backward_3d(tensor output_grad, + tensor& input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + size_t ncdh = gid / W, w = gid % W; + size_t ncd = ncdh / H, h = ncdh % H; + size_t nc = ncd / D, d = ncd % D; + size_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + size_t od = static_cast(std::floor(static_cast(d * OD) / D)); + size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; + + size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); + size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + + size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); + size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + + float grad = 0; + for(size_t id = od; id < (od + kod); ++id) + { + size_t kd = static_cast(std::ceil(static_cast((id + 1) * D) / OD)) - + static_cast(std::floor(static_cast(id * D) / OD)); + for(size_t ih = oh; ih < (oh + koh); ++ih) + { + size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(std::floor(static_cast(ih * H) / OH)); + for(size_t iw = ow; iw < (ow + kow); ++iw) + { + size_t kw = + static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - + 
static_cast(std::floor(static_cast(iw * W) / OW)); + grad += + static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / + (kd * kh * kw); + } + } + } + + input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); + } +} + +#endif diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp deleted file mode 100644 index 5b91033633..0000000000 --- a/test/cpu_avgpool.hpp +++ /dev/null @@ -1,426 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_CPU_AVGPOOL_HPP -#define GUARD_CPU_AVGPOOL_HPP - -#include "tensor_holder.hpp" -#include - -template -void cpu_avgpool_forward_2d(tensor input, - tensor& output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = input.desc.GetLengths(); - auto numel = output.desc.GetElementSize(); - - auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); - auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); - - for(int32_t gid = 0; gid < numel; gid++) - { - int32_t ncoh = gid / OW, ow = gid % OW; - int32_t nc = ncoh / OH, oh = ncoh % OH; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; - - if(n >= N) - return; - - float m = 0; - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, h, w) - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - m += static_cast( - input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); - } - } - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - float val = m / 
divide_factor; - - output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = - static_cast(val); - } -} - -template -void cpu_avgpool_forward_3d(tensor input, - tensor& output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = input.desc.GetLengths(); - auto numel = output.desc.GetElementSize(); - - auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); - auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); - - for(int32_t gid = 0; gid < numel; gid++) - { - int32_t ncodoh = gid / OW, ow = gid % OW; - int32_t ncod = ncodoh / OH, oh = ncodoh % OH; - int32_t nc = ncod / OD, od = ncod % OD; - int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; - - if(n >= N) - return; - float sum = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - // input idx : (n, c, d, h, w) - int32_t d = od * sd - pd + kd; - if(d < 0 || d >= D) - continue; - int32_t h = oh * sh - ph + r; - if(h < 0 || h >= H) - continue; - int32_t w = ow * sw - pw + s; - if(w < 0 || w >= W) - continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - sum += static_cast( - input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); - } - } - } - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = 
max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - float val = sum / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = - static_cast(val); - } -} - -template -void cpu_avgpool_backward_2d(tensor output_grad, - tensor& input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = input_grad.desc.GetLengths(); - auto numel = input_grad.desc.GetElementSize(); - - auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); - auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); - - for(size_t gid = 0; gid < numel; gid++) - { - int32_t nch = gid / W, w = gid % W; - int32_t nc = nch / H, h = nch % H; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; - - if(n >= N) - return; - - float grad = 0; - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (hend - hstart) * (wend - wstart); - - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, H); - wend = min(wend, W); - 
- int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (hend - hstart) * (wend - wstart); - } - } - - grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<4>(n, c, oh, ow))]) / - divide_factor; - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = - static_cast(grad); - } -} - -template -void cpu_avgpool_backward_3d(tensor output_grad, - tensor& input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, - bool count_include_pad, - int32_t divisor_override) -{ - auto dims = input_grad.desc.GetLengths(); - auto numel = input_grad.desc.GetElementSize(); - - auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); - auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); - - for(size_t gid = 0; gid < numel; gid++) - { - int32_t ncdh = gid / W, w = gid % W; - int32_t ncd = ncdh / H, h = ncdh % H; - int32_t nc = ncd / D, d = ncd % D; - int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; - - if(n >= N) - return; - - float grad = 0; - for(int32_t kd = 0; kd < KD; ++kd) - { - for(int32_t r = 0; r < R; ++r) - { - for(int32_t s = 0; s < S; ++s) - { - int32_t odsd = d + pd - kd; - if(odsd % sd != 0) - continue; - int32_t od = odsd / sd; - if(od < 0 || od >= OD) - continue; - - int32_t ohsh = h + ph - r; - if(ohsh % sh != 0) - continue; - int32_t oh = ohsh / sh; - if(oh < 0 || oh >= OH) - continue; - - int32_t owsw = w + pw - s; - if(owsw % sw != 0) - continue; - int32_t ow = owsw / sw; - if(ow < 0 || ow >= OW) - continue; - - 
int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - int32_t divide_factor; - if(divisor_override != 0) - { - divide_factor = divisor_override; - } - else - { - if(count_include_pad) - { - divide_factor = pool_size; - } - else - { - divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); - } - } - grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<5>(n, c, od, oh, ow))]) / - divide_factor; - } - } - } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = - static_cast(grad); - } -} - -#endif diff --git a/test/gtest/avgpool.cpp b/test/gtest/adaptiveavgpool.cpp similarity index 64% rename from test/gtest/avgpool.cpp rename to test/gtest/adaptiveavgpool.cpp index 3ab32be510..a548ada4cd 100644 --- a/test/gtest/avgpool.cpp +++ b/test/gtest/adaptiveavgpool.cpp @@ -23,13 +23,13 @@ * SOFTWARE. 
* *******************************************************************************/ -#include "avgpool.hpp" +#include "adaptiveavgpool.hpp" #include MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) -namespace avgpool { +namespace adaptiveavgpool { std::string GetFloatArg() { @@ -41,35 +41,35 @@ std::string GetFloatArg() return tmp; } -struct GPU_Avgpool_fwd_FP32 : AvgPoolTestFwd +struct GPU_AdaptiveAvgpool_fwd_FP32 : AdaptiveAvgPoolTestFwd { }; -struct GPU_Avgpool_fwd_FP16 : AvgPoolTestFwd +struct GPU_AdaptiveAvgpool_fwd_FP16 : AdaptiveAvgPoolTestFwd { }; -struct GPU_Avgpool_fwd_BFP16 : AvgPoolTestFwd +struct GPU_AdaptiveAvgpool_fwd_BFP16 : AdaptiveAvgPoolTestFwd { }; -struct GPU_Avgpool_bwd_FP32 : AvgPoolTestBwd +struct GPU_AdaptiveAvgpool_bwd_FP32 : AdaptiveAvgPoolTestBwd { }; -struct GPU_Avgpool_bwd_FP16 : AvgPoolTestBwd +struct GPU_AdaptiveAvgpool_bwd_FP16 : AdaptiveAvgPoolTestBwd { }; -struct GPU_Avgpool_bwd_BFP16 : AvgPoolTestBwd +struct GPU_AdaptiveAvgpool_bwd_BFP16 : AdaptiveAvgPoolTestBwd { }; -} // namespace avgpool -using namespace avgpool; +} // namespace adaptiveavgpool +using namespace adaptiveavgpool; // FORWARD TEST -TEST_P(GPU_Avgpool_fwd_FP32, AvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgpool_fwd_FP32, AdaptiveAvgPoolTestFwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -83,7 +83,7 @@ TEST_P(GPU_Avgpool_fwd_FP32, AvgPoolTestFwd) } }; -TEST_P(GPU_Avgpool_fwd_FP16, AvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgpool_fwd_FP16, AdaptiveAvgPoolTestFwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -97,7 +97,7 @@ TEST_P(GPU_Avgpool_fwd_FP16, AvgPoolTestFwd) } }; -TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgpool_fwd_BFP16, AdaptiveAvgPoolTestFwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -112,17 
+112,17 @@ TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) }; INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_fwd_FP32, - testing::ValuesIn(AvgPoolTestConfigsFwdFp32())); + GPU_AdaptiveAvgpool_fwd_FP32, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp32())); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_fwd_FP16, - testing::ValuesIn(AvgPoolTestConfigsFwdFp16())); + GPU_AdaptiveAvgpool_fwd_FP16, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp16())); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_fwd_BFP16, - testing::ValuesIn(AvgPoolTestConfigsFwdBfp16())); + GPU_AdaptiveAvgpool_fwd_BFP16, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdBfp16())); // BACKWARD TEST -TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgpool_bwd_FP32, AdaptiveAvgPoolTestBwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -136,7 +136,7 @@ TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) } }; -TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgpool_bwd_FP16, AdaptiveAvgPoolTestBwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -150,7 +150,7 @@ TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) } }; -TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgpool_bwd_BFP16, AdaptiveAvgPoolTestBwd) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -165,11 +165,11 @@ TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) }; INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_bwd_FP32, - testing::ValuesIn(AvgPoolTestConfigsBwdFp32())); + GPU_AdaptiveAvgpool_bwd_FP32, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp32())); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_bwd_FP16, - testing::ValuesIn(AvgPoolTestConfigsBwdFp16())); + GPU_AdaptiveAvgpool_bwd_FP16, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp16())); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_bwd_BFP16, - 
testing::ValuesIn(AvgPoolTestConfigsBwdBfp16())); + GPU_AdaptiveAvgpool_bwd_BFP16, + testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdBfp16())); diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp new file mode 100644 index 0000000000..8c172e4494 --- /dev/null +++ b/test/gtest/adaptiveavgpool.hpp @@ -0,0 +1,380 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "cpu_adaptiveavgpool.hpp" +#include "get_handle.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include +#include +#include + +template +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct AdaptiveAvgPoolTestCase +{ + std::vector input_dims; + std::vector output_dims; + bool is_contiguous = true; + + friend std::ostream& operator<<(std::ostream& os, const AdaptiveAvgPoolTestCase& tc) + { + return os << " input_dims:" << tc.input_dims << " output_dims:" << tc.output_dims + << "is_contiguous:" << tc.is_contiguous; + } + + std::vector GetInput() const { return input_dims; } + std::vector GetOutput() const { return output_dims; } + + std::vector ComputeStrides(std::vector inputDim) const + { + if(!is_contiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!is_contiguous) + std::swap(strides.front(), strides.back()); + return strides; + } +}; + +inline std::vector AdaptiveAvgPoolTestConfigsFwdFp32() +{ + return { + {{64, 768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 78, 17, 17}, {10, 10}, false}, + {{64, 78, 17, 17}, {10, 10}, true}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, false}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, true}, + }; +} + +inline std::vector AdaptiveAvgPoolTestConfigsFwdFp16() +{ + return { + {{64, 768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 78, 17, 17}, {10, 10}, false}, + {{64, 78, 17, 17}, {10, 10}, true}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, false}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, true}, + }; +} + +inline std::vector AdaptiveAvgPoolTestConfigsFwdBfp16() +{ + return { + {{64, 
768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 78, 17, 17}, {10, 10}, false}, + {{64, 78, 17, 17}, {10, 10}, true}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, false}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, true}, + }; +} + +inline std::vector AdaptiveAvgPoolTestConfigsBwdFp32() +{ + return { + {{64, 768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 78, 17, 17}, {10, 10}, false}, + {{64, 78, 17, 17}, {10, 10}, true}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, false}, + {{6, 18, 18, 18, 18}, {5, 5, 5}, true}, + }; +} + +inline std::vector AdaptiveAvgPoolTestConfigsBwdFp16() +{ + return { + {{64, 768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 28, 35, 35}, {7, 7}, false}, + {{64, 28, 35, 35}, {7, 7}, true}, + {{6, 28, 35, 35, 35}, {10, 10, 10}, false}, + {{6, 28, 35, 35, 35}, {10, 10, 10}, true}, + }; +} + +inline std::vector AdaptiveAvgPoolTestConfigsBwdBfp16() +{ + return { + {{64, 768, 17}, {10}, false}, + {{64, 768, 17}, {10}, true}, + {{64, 208, 9, 9}, {7, 7}, false}, + {{64, 208, 9, 9}, {7, 7}, true}, + {{6, 18, 12, 12, 12}, {5, 5, 5}, false}, + {{6, 18, 12, 12, 12}, {5, 5, 5}, true}, + }; +} + +// FORWARD TEST +template +struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + adaptiveavgpool_config = GetParam(); + auto in_dim = adaptiveavgpool_config.GetInput(); + auto in_strides = adaptiveavgpool_config.ComputeStrides(in_dim); + auto out_dim = adaptiveavgpool_config.GetOutput(); + N = in_dim[0]; + C = in_dim[1]; + std::vector out_dim_final = {N, C}; + if(in_dim.size() == 3) + { + D = 1; + H = in_dim[2]; + W = 1; + + OD = 1; + OH = out_dim[0]; + OW = 1; + out_dim_final.push_back(OH); + } + else if(in_dim.size() == 4) + { + D = 1; + H = in_dim[2]; + W = in_dim[3]; + + OD = 1; + OH = out_dim[0]; + OW = out_dim[1]; + out_dim_final.push_back(OH); + out_dim_final.push_back(OW); + } + else if(in_dim.size() == 5) + { + D = in_dim[2]; + H = in_dim[3]; + W = 
in_dim[4]; + + OD = out_dim[0]; + OH = out_dim[1]; + OW = out_dim[2]; + out_dim_final.push_back(OD); + out_dim_final.push_back(OH); + out_dim_final.push_back(OW); + } + + auto gen_input_value = [](auto...) { + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + input = tensor{in_dim, in_strides}.generate(gen_input_value); + + output = tensor{out_dim_final}; + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + ref_output = tensor{out_dim_final}; + std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); + + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + auto dims = input.desc.GetNumDims(); + if(dims == 3) + { + cpu_adaptiveavgpool_forward_1d(input, ref_output, N, C, H, OH); + } + else if(dims == 4) + { + cpu_adaptiveavgpool_forward_2d(input, ref_output, N, C, H, W, OH, OW); + } + else if(dims == 5) + { + cpu_adaptiveavgpool_forward_3d(input, ref_output, N, C, D, H, W, OD, OH, OW); + } + status = miopen::AdaptiveAvgPoolForward( + handle, input.desc, input_dev.get(), output.desc, output_dev.get()); + fflush(stdout); + ASSERT_EQ(status, miopenStatusSuccess); + + output.data = handle.Read(output_dev, output.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + + auto error = miopen::rms_range(ref_output, output); + + ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); + EXPECT_LT(error, threshold * 10); + } + AdaptiveAvgPoolTestCase adaptiveavgpool_config; + + tensor input; + tensor output; + tensor ref_output; + + size_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; +}; + +// BACKWARD TEST +template +struct AdaptiveAvgPoolTestBwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = 
get_handle(); + adaptiveavgpool_config = GetParam(); + auto in_grad_dim = adaptiveavgpool_config.GetInput(); + auto out_grad_dim = adaptiveavgpool_config.GetOutput(); + N = in_grad_dim[0]; + C = in_grad_dim[1]; + std::vector out_grad_dim_final = {N, C}; + + if(in_grad_dim.size() == 3) + { + D = 1; + H = in_grad_dim[2]; + W = 1; + + OD = 1; + OH = out_grad_dim[0]; + OW = 1; + out_grad_dim_final.push_back(OH); + } + else if(in_grad_dim.size() == 4) + { + D = 1; + H = in_grad_dim[2]; + W = in_grad_dim[3]; + + OD = 1; + OH = out_grad_dim[0]; + OW = out_grad_dim[1]; + out_grad_dim_final.push_back(OH); + out_grad_dim_final.push_back(OW); + } + else if(in_grad_dim.size() == 5) + { + D = in_grad_dim[2]; + H = in_grad_dim[3]; + W = in_grad_dim[4]; + + OD = out_grad_dim[0]; + OH = out_grad_dim[1]; + OW = out_grad_dim[2]; + out_grad_dim_final.push_back(OD); + out_grad_dim_final.push_back(OH); + out_grad_dim_final.push_back(OW); + } + auto out_grad_strides = adaptiveavgpool_config.ComputeStrides(out_grad_dim_final); + + auto gen_output_grad_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + output_grad = + tensor{out_grad_dim_final, out_grad_strides}.generate(gen_output_grad_value); + + input_grad = tensor{in_grad_dim}; + std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits::quiet_NaN()); + + ref_input_grad = tensor{in_grad_dim}; + std::fill( + ref_input_grad.begin(), ref_input_grad.end(), std::numeric_limits::quiet_NaN()); + + output_grad_dev = handle.Write(output_grad.data); + input_grad_dev = handle.Write(input_grad.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + auto dims = input_grad.desc.GetNumDims(); + if(dims == 3) + { + cpu_adaptiveavgpool_backward_1d(output_grad, ref_input_grad, N, C, H, OH); + } + else if(dims == 4) + { + cpu_adaptiveavgpool_backward_2d(output_grad, ref_input_grad, N, C, H, W, OH, OW); + } + else if(dims == 5) + { + cpu_adaptiveavgpool_backward_3d( + output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); + } + status = miopen::AdaptiveAvgPoolBackward( + handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get()); + + ASSERT_EQ(status, miopenStatusSuccess); + + input_grad.data = handle.Read(input_grad_dev, input_grad.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + auto error = miopen::rms_range(ref_input_grad, input_grad); + ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); + EXPECT_LT(error, threshold * 10); + } + AdaptiveAvgPoolTestCase adaptiveavgpool_config; + + tensor output_grad; + tensor input_grad; + tensor ref_input_grad; + + size_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr output_grad_dev; + miopen::Allocator::ManageDataPtr input_grad_dev; +}; diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp deleted file mode 100644 index 94898d32b6..0000000000 --- a/test/gtest/avgpool.hpp +++ /dev/null @@ -1,451 +0,0 @@ 
-/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#include "../driver/tensor_driver.hpp" -#include "cpu_avgpool.hpp" -#include "get_handle.hpp" -#include "random.hpp" -#include "tensor_holder.hpp" -#include "verify.hpp" -#include -#include -#include -#include -#include - -template -inline std::ostream& operator<<(std::ostream& os, const std::vector& v) -{ - os << '{'; - for(int i = 0; i < v.size(); ++i) - { - if(i != 0) - os << ','; - os << v[i]; - } - os << '}'; - return os; -} - -struct AvgPoolTestCase -{ - std::vector input_dims; - std::vector kernel_size; - std::vector stride; - std::vector padding; - bool ceil_mode; - bool count_include_pad; - int32_t divisor_override; - - friend std::ostream& operator<<(std::ostream& os, const AvgPoolTestCase& tc) - { - return os << " input_dims:" << tc.input_dims << " kernel_size:" << tc.kernel_size - << " stride:" << tc.stride << " padding:" << tc.padding - << " ceil_mode:" << tc.ceil_mode << " count_include_pad:" << tc.count_include_pad - << " divisor_override:" << tc.divisor_override; - } - - std::vector GetInput() const { return input_dims; } -}; - -inline std::vector AvgPoolTestConfigsFwdFp32() -{ - return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - }; -} - -inline std::vector AvgPoolTestConfigsFwdFp16() -{ - return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - }; -} - -inline std::vector AvgPoolTestConfigsFwdBfp16() -{ - return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - }; -} - -inline std::vector AvgPoolTestConfigsBwdFp32() -{ - return { - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - }; -} - -inline std::vector AvgPoolTestConfigsBwdFp16() -{ - 
return { - {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 288, 35, 35, 35}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0}, - }; -} - -inline std::vector AvgPoolTestConfigsBwdBfp16() -{ - return { - {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - }; -} - -// FORWARD TEST -template -struct AvgPoolTestFwd : public ::testing::TestWithParam -{ -protected: - void SetUp() override - { - auto&& handle = get_handle(); - avgpool_config = GetParam(); - auto in_dim = avgpool_config.GetInput(); - N = in_dim[0]; - C = in_dim[1]; - D = in_dim.size() == 5 ? in_dim[2] : 1; - H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; - W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; - ksize = tensor{in_dim.size() - 2}; - ksize.data = avgpool_config.kernel_size; - stride = tensor{in_dim.size() - 2}; - stride.data = avgpool_config.stride; - padding = tensor{in_dim.size() - 2}; - padding.data = avgpool_config.padding; - ceil_mode = avgpool_config.ceil_mode; - count_include_pad = avgpool_config.count_include_pad; - divisor_override = avgpool_config.divisor_override; - - auto gen_input_value = [](auto...) 
{ - return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); - }; - input = tensor{in_dim}.generate(gen_input_value); - - std::vector out_dim; - if(in_dim.size() == 5) - { - if(ceil_mode) - { - OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - else - { - OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - out_dim = {N, C, OD, OH, OW}; - } - else - { - if(ceil_mode) - { - OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - else - { - OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - out_dim = {N, C, OH, OW}; - } - - output = tensor{out_dim}; - std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); - - ref_output = tensor{out_dim}; - std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); - - input_dev = handle.Write(input.data); - output_dev = handle.Write(output.data); - ksize_dev = handle.Write(ksize.data); - stride_dev = handle.Write(stride.data); - padding_dev = handle.Write(padding.data); - } - - void RunTest() - { - auto&& handle = get_handle(); - miopenStatus_t status; - - auto dims = input.desc.GetNumDims(); - if(dims == 4) - { - cpu_avgpool_forward_2d(input, - ref_output, - N, - C, - H, - W, - OH, - OW, - ksize, - stride, - padding, - count_include_pad, - divisor_override); - } - else if(dims == 5) - { - cpu_avgpool_forward_3d(input, - ref_output, - N, - C, - D, - H, - W, - OD, - OH, - OW, - ksize, - stride, - 
padding, - count_include_pad, - divisor_override); - } - status = miopen::AvgPoolForward(handle, - input.desc, - input_dev.get(), - output.desc, - output_dev.get(), - ksize.GetSize() == 3 ? ksize[0] : 0, - ksize.GetSize() == 3 ? ksize[1] : ksize[0], - ksize.GetSize() == 3 ? ksize[2] : ksize[1], - stride.GetSize() == 3 ? stride[0] : 0, - stride.GetSize() == 3 ? stride[1] : stride[0], - stride.GetSize() == 3 ? stride[2] : stride[1], - padding.GetSize() == 3 ? padding[0] : 0, - padding.GetSize() == 3 ? padding[1] : padding[0], - padding.GetSize() == 3 ? padding[2] : padding[1], - count_include_pad, - divisor_override); - fflush(stdout); - ASSERT_EQ(status, miopenStatusSuccess); - - output.data = handle.Read(output_dev, output.data.size()); - } - - void Verify() - { - double threshold = std::numeric_limits::epsilon(); - - auto error = miopen::rms_range(ref_output, output); - - ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); - EXPECT_LT(error, threshold * 10); - } - AvgPoolTestCase avgpool_config; - - tensor input; - tensor output; - tensor ref_output; - tensor ksize; - tensor stride; - tensor padding; - - bool ceil_mode; - bool count_include_pad; - int32_t divisor_override; - int32_t N, C, D, H, W, OD, OH, OW; - - miopen::Allocator::ManageDataPtr input_dev; - miopen::Allocator::ManageDataPtr output_dev; - miopen::Allocator::ManageDataPtr ksize_dev; - miopen::Allocator::ManageDataPtr stride_dev; - miopen::Allocator::ManageDataPtr padding_dev; -}; - -// BACKWARD TEST -template -struct AvgPoolTestBwd : public ::testing::TestWithParam -{ -protected: - void SetUp() override - { - auto&& handle = get_handle(); - avgpool_config = GetParam(); - auto in_grad_dim = avgpool_config.GetInput(); - N = in_grad_dim[0]; - C = in_grad_dim[1]; - D = in_grad_dim.size() == 5 ? in_grad_dim[2] : 1; - H = in_grad_dim.size() == 5 ? in_grad_dim[3] : in_grad_dim[2]; - W = in_grad_dim.size() == 5 ? 
in_grad_dim[4] : in_grad_dim[3]; - ksize = tensor{in_grad_dim.size() - 2}; - ksize.data = avgpool_config.kernel_size; - stride = tensor{in_grad_dim.size() - 2}; - stride.data = avgpool_config.stride; - padding = tensor{in_grad_dim.size() - 2}; - padding.data = avgpool_config.padding; - ceil_mode = avgpool_config.ceil_mode; - count_include_pad = avgpool_config.count_include_pad; - divisor_override = avgpool_config.divisor_override; - - std::vector out_grad_dim; - if(in_grad_dim.size() == 5) - { - if(ceil_mode) - { - OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - else - { - OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; - } - out_grad_dim = {N, C, OD, OH, OW}; - } - else - { - if(ceil_mode) - { - OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - else - { - OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; - OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; - } - out_grad_dim = {N, C, OH, OW}; - } - auto gen_output_grad_value = [](auto...) 
{ - return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); - }; - output_grad = tensor{out_grad_dim}.generate(gen_output_grad_value); - - input_grad = tensor{in_grad_dim}; - std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits::quiet_NaN()); - - ref_input_grad = tensor{in_grad_dim}; - std::fill( - ref_input_grad.begin(), ref_input_grad.end(), std::numeric_limits::quiet_NaN()); - - output_grad_dev = handle.Write(output_grad.data); - input_grad_dev = handle.Write(input_grad.data); - ksize_dev = handle.Write(ksize.data); - stride_dev = handle.Write(stride.data); - padding_dev = handle.Write(padding.data); - } - - void RunTest() - { - auto&& handle = get_handle(); - - miopenStatus_t status; - - auto dims = input_grad.desc.GetNumDims(); - if(dims == 4) - { - cpu_avgpool_backward_2d(output_grad, - ref_input_grad, - N, - C, - H, - W, - OH, - OW, - ksize, - stride, - padding, - count_include_pad, - divisor_override); - } - else if(dims == 5) - { - cpu_avgpool_backward_3d(output_grad, - ref_input_grad, - N, - C, - D, - H, - W, - OD, - OH, - OW, - ksize, - stride, - padding, - count_include_pad, - divisor_override); - } - status = miopen::AvgPoolBackward(handle, - output_grad.desc, - output_grad_dev.get(), - input_grad.desc, - input_grad_dev.get(), - ksize.GetSize() == 3 ? ksize[0] : 0, - ksize.GetSize() == 3 ? ksize[1] : ksize[0], - ksize.GetSize() == 3 ? ksize[2] : ksize[1], - stride.GetSize() == 3 ? stride[0] : 0, - stride.GetSize() == 3 ? stride[1] : stride[0], - stride.GetSize() == 3 ? stride[2] : stride[1], - padding.GetSize() == 3 ? padding[0] : 0, - padding.GetSize() == 3 ? padding[1] : padding[0], - padding.GetSize() == 3 ? 
padding[2] : padding[1], - count_include_pad, - divisor_override); - - ASSERT_EQ(status, miopenStatusSuccess); - - input_grad.data = handle.Read(input_grad_dev, input_grad.data.size()); - } - - void Verify() - { - double threshold = std::numeric_limits::epsilon(); - auto error = miopen::rms_range(ref_input_grad, input_grad); - ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); - EXPECT_LT(error, threshold * 10); - } - AvgPoolTestCase avgpool_config; - - tensor output_grad; - tensor input_grad; - tensor ref_input_grad; - tensor ksize; - tensor stride; - tensor padding; - - bool ceil_mode; - bool count_include_pad; - int32_t divisor_override; - int32_t N, C, D, H, W, OD, OH, OW; - - miopen::Allocator::ManageDataPtr output_grad_dev; - miopen::Allocator::ManageDataPtr input_grad_dev; - miopen::Allocator::ManageDataPtr ksize_dev; - miopen::Allocator::ManageDataPtr stride_dev; - miopen::Allocator::ManageDataPtr padding_dev; -}; From 7d3a0a69e7dd3a566939e3a0ad9daa086d460169 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 3 Oct 2024 17:04:52 +0700 Subject: [PATCH 12/38] add isOverRocm3d --- driver/adaptiveavgpool_driver.hpp | 26 ++++++++-------- .../miopen/adaptiveavgpool/solvers.hpp | 1 + .../backward_adaptiveavgpool_3d.cpp | 31 +++---------------- .../forward_adaptiveavgpool_3d.cpp | 30 ++++-------------- test/gtest/adaptiveavgpool.hpp | 13 +++----- 5 files changed, 30 insertions(+), 71 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index fd86cf9eec..1a9b1b6242 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -56,12 +56,12 @@ class AdaptiveAvgPoolDriver : public Driver data_type = miopen_type{}; } - std::vector ComputeStrides(std::vector input); + std::vector ComputeStrides(std::vector input); int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; InputFlags& GetInputFlags() override { return inflags; } - 
std::vector GetInputTensorDimsFromCmd(const char* param); + std::vector GetInputTensorDimsFromCmd(const char* param); int GetandSetData() override; int AllocateBuffersAndCopy() override; @@ -107,8 +107,8 @@ class AdaptiveAvgPoolDriver : public Driver size_t N = 1, C = 1, D = 1, H = 1, W = 1, OD = 1, OH = 1, OW = 1; - std::vector in_dim; - std::vector out_dim; + std::vector in_dim; + std::vector out_dim; bool isContiguous; }; @@ -126,11 +126,11 @@ int AdaptiveAvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) } template -std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) +std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) { std::string lengthsStr = inflags.GetValueStr(param); - std::vector lengths; + std::vector lengths; std::size_t pos = 0; std::size_t new_pos; @@ -150,7 +150,7 @@ std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(co std::string sliceStr = lengthsStr.substr(pos); int len = std::stoi(sliceStr); - lengths.push_back(len); + lengths.push_back(static_cast(len)); return (lengths); } @@ -158,9 +158,9 @@ std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(co template int AdaptiveAvgPoolDriver::GetandSetData() { - in_dim = GetInputTensorDimsFromCmd("input_dims"); - std::vector in_stride = ComputeStrides(in_dim); - out_dim = GetInputTensorDimsFromCmd("output_dims"); + in_dim = GetInputTensorDimsFromCmd("input_dims"); + std::vector in_stride = ComputeStrides(in_dim); + out_dim = GetInputTensorDimsFromCmd("output_dims"); if(in_dim.size() != out_dim.size() + 2) { MIOPEN_THROW(miopenStatusBadParm, @@ -199,7 +199,7 @@ int AdaptiveAvgPoolDriver::GetandSetData() out_dim_final.push_back(OH); out_dim_final.push_back(OW); } - std::vector out_grad_stride = ComputeStrides(out_dim_final); + std::vector out_grad_stride = ComputeStrides(out_dim_final); SetTensorNd(inputDesc, in_dim, in_stride, data_type); SetTensorNd(outputDesc, out_dim_final, data_type); SetTensorNd(outputGradDesc, 
out_dim_final, out_grad_stride, data_type); @@ -210,11 +210,11 @@ int AdaptiveAvgPoolDriver::GetandSetData() // Equivalent to: tensor.tranpose(0, -1).contiguous().tranpose(0, -1) incase contiguous = False template -std::vector AdaptiveAvgPoolDriver::ComputeStrides(std::vector inputDim) +std::vector AdaptiveAvgPoolDriver::ComputeStrides(std::vector inputDim) { if(!isContiguous) std::swap(inputDim.front(), inputDim.back()); - std::vector strides(inputDim.size()); + std::vector strides(inputDim.size()); strides.back() = 1; for(int i = inputDim.size() - 2; i >= 0; --i) strides[i] = strides[i + 1] * inputDim[i + 1]; diff --git a/src/include/miopen/adaptiveavgpool/solvers.hpp b/src/include/miopen/adaptiveavgpool/solvers.hpp index 25f08f3345..ce2419527a 100644 --- a/src/include/miopen/adaptiveavgpool/solvers.hpp +++ b/src/include/miopen/adaptiveavgpool/solvers.hpp @@ -32,6 +32,7 @@ #include #include "miopen/kernel_build_params.hpp" #include "miopen/kernel_info.hpp" +#include "miopen/mlo_internal.hpp" namespace miopen { diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp index 51d815e281..b45d024c0b 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp @@ -45,36 +45,15 @@ namespace adaptiveavgpool { bool IsOverRocmBwd3d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { - auto dtype = problem.GetInputGradDesc().GetType(); - auto in_nelems = problem.GetInputGradDesc().GetElementSize(); - auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); - auto mul_nc = - problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; - auto N = problem.GetOutputGradDesc().GetLengths()[0]; - auto in_over_out = static_cast(in_nelems) / out_nelems; - - if(dtype == miopenFloat) + if(!problem.IsAllContiguous()) { - if((in_over_out < 2 && out_nelems <= 12582912) || (in_over_out <= 
8 && N >= 6)) - { - return true; - } - return false; - } - else if(dtype == miopenHalf) - { - if((in_over_out < 2 && mul_nc < 8192) || (8 > in_over_out && out_nelems >= 29052108)) - { - return true; - } + return true; } - else if(dtype == miopenBFloat16) + else { - if((1 <= in_over_out && in_over_out < 2 && in_nelems >= 4194304) || - (in_over_out <= 8 && in_nelems >= 944111616)) - { + if((problem.GetInputGradDesc().GetElementSize() / + problem.GetOutputGradDesc().GetElementSize()) == 1) return true; - } } return false; } diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp index cf9bf5a9b9..481805cfa4 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp @@ -45,37 +45,19 @@ namespace adaptiveavgpool { bool IsOverRocmFwd3d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { - auto dtype = problem.GetOutputDesc().GetType(); - auto in_nelems = problem.GetInputDesc().GetElementSize(); - auto out_nelems = problem.GetOutputDesc().GetElementSize(); - auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; - auto N = problem.GetOutputDesc().GetLengths()[0]; + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); auto in_over_out = static_cast(in_nelems) / out_nelems; - std::cout << "in_over_out: " << in_over_out << std::endl; - std::cout << "in_nelems: " << in_nelems << std::endl; - std::cout << "out_nelems: " << out_nelems << std::endl; - - if(dtype == miopenFloat) - { - if(in_over_out < 2 || in_over_out >= 262144 || (out_nelems >= 10125000 && N > 4)) - { - return true; - } - } - else if(dtype == miopenHalf) + if(problem.IsAllContiguous()) { - if(in_nelems >= 201326592 || (in_over_out < 2 && mul_nc < 8192)) - { + if(in_over_out <= 98) return true; - } } - else if(dtype == miopenBFloat16) + else { - 
if((out_nelems >= 5971968 && in_over_out < 2) || out_nelems >= 74088000) - { + if(in_over_out < 8000) return true; - } } return false; } diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index 8c172e4494..8e2213dbf8 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -117,10 +117,9 @@ inline std::vector AdaptiveAvgPoolTestConfigsBwdFp32() return { {{64, 768, 17}, {10}, false}, {{64, 768, 17}, {10}, true}, - {{64, 78, 17, 17}, {10, 10}, false}, - {{64, 78, 17, 17}, {10, 10}, true}, + {{64, 206, 17, 17}, {10, 10}, false}, {{6, 18, 18, 18, 18}, {5, 5, 5}, false}, - {{6, 18, 18, 18, 18}, {5, 5, 5}, true}, + {{6, 18, 18, 18, 18}, {18, 18, 18}, true}, }; } @@ -129,10 +128,9 @@ inline std::vector AdaptiveAvgPoolTestConfigsBwdFp16() return { {{64, 768, 17}, {10}, false}, {{64, 768, 17}, {10}, true}, - {{64, 28, 35, 35}, {7, 7}, false}, - {{64, 28, 35, 35}, {7, 7}, true}, + {{64, 28, 35, 35}, {35, 35}, false}, {{6, 28, 35, 35, 35}, {10, 10, 10}, false}, - {{6, 28, 35, 35, 35}, {10, 10, 10}, true}, + {{6, 28, 35, 35, 35}, {35, 35, 35}, true}, }; } @@ -142,9 +140,8 @@ inline std::vector AdaptiveAvgPoolTestConfigsBwdBfp16() {{64, 768, 17}, {10}, false}, {{64, 768, 17}, {10}, true}, {{64, 208, 9, 9}, {7, 7}, false}, - {{64, 208, 9, 9}, {7, 7}, true}, {{6, 18, 12, 12, 12}, {5, 5, 5}, false}, - {{6, 18, 12, 12, 12}, {5, 5, 5}, true}, + {{6, 18, 12, 12, 12}, {12, 12, 12}, true}, }; } From dfbb6c71b58e1cbedb789238c9568f8f4afde770 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 3 Oct 2024 17:10:52 +0700 Subject: [PATCH 13/38] add const Tgpu --- driver/mloAdaptiveAvgPoolHost.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 1c45f16213..8bd435f415 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -33,7 +33,7 @@ template int32_t mloAdaptiveAvgPoolForward1dRunHost(const 
miopenTensorDescriptor_t inputDesc, const miopenTensorDescriptor_t outputDesc, - Tgpu* input, + const Tgpu* input, Tcheck* output, size_t N, size_t C, @@ -71,7 +71,7 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD template int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, const miopenTensorDescriptor_t outputDesc, - Tgpu* input, + const Tgpu* input, Tcheck* output, size_t N, size_t C, @@ -119,7 +119,7 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD template int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, const miopenTensorDescriptor_t outputDesc, - Tgpu* input, + const Tgpu* input, Tcheck* output, size_t N, size_t C, @@ -177,7 +177,7 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD template int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outputGradDesc, const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, + const Tgpu* output_grad, Tcheck* input_grad, size_t N, size_t C, @@ -218,7 +218,7 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu template int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, + const Tgpu* output_grad, Tcheck* input_grad, size_t N, size_t C, @@ -271,7 +271,7 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu template int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, + const Tgpu* output_grad, Tcheck* input_grad, size_t N, size_t C, From 88179b049d2840c3a084a09f3ce96c8a2a0aed0f Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 13:32:44 +0700 Subject: [PATCH 14/38] add resolve comment Rocm PR --- src/adaptiveavgpool.cpp | 4 ++++ 
src/adaptiveavgpool_api.cpp | 20 +++++++++---------- src/include/miopen/adaptiveavgpool.hpp | 5 +++++ .../backward_adaptiveavgpool_1d.cpp | 4 ++++ .../backward_adaptiveavgpool_2d.cpp | 4 ++++ .../backward_adaptiveavgpool_3d.cpp | 4 ++++ .../forward_adaptiveavgpool_1d.cpp | 4 ++++ .../forward_adaptiveavgpool_2d.cpp | 4 ++++ .../forward_adaptiveavgpool_3d.cpp | 4 ++++ test/gtest/adaptiveavgpool.hpp | 4 ++-- 10 files changed, 45 insertions(+), 12 deletions(-) diff --git a/src/adaptiveavgpool.cpp b/src/adaptiveavgpool.cpp index fee382a4d1..f5ff954740 100644 --- a/src/adaptiveavgpool.cpp +++ b/src/adaptiveavgpool.cpp @@ -33,6 +33,8 @@ namespace miopen { +namespace adaptiveavgpool { + miopenStatus_t AdaptiveAvgPoolForward(Handle& handle, const TensorDescriptor& inputDesc, ConstData_t input, @@ -91,4 +93,6 @@ miopenStatus_t AdaptiveAvgPoolBackward(Handle& handle, return miopenStatusSuccess; } +} // namespace adaptiveavgpool + } // namespace miopen diff --git a/src/adaptiveavgpool_api.cpp b/src/adaptiveavgpool_api.cpp index a9159258f9..c183386a6a 100644 --- a/src/adaptiveavgpool_api.cpp +++ b/src/adaptiveavgpool_api.cpp @@ -85,11 +85,11 @@ extern "C" miopenStatus_t miopenAdaptiveAvgPoolForward(miopenHandle_t handle, LogCmdAdaptiveAvgPool(inputDesc, outputDesc, true); return miopen::try_([&] { - miopen::AdaptiveAvgPoolForward(miopen::deref(handle), - miopen::deref(inputDesc), - DataCast(input), - miopen::deref(outputDesc), - DataCast(output)); + miopen::adaptiveavgpool::AdaptiveAvgPoolForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output)); }); } @@ -104,10 +104,10 @@ miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, LogCmdAdaptiveAvgPool(inputGradDesc, outputGradDesc, false); return miopen::try_([&] { - miopen::AdaptiveAvgPoolBackward(miopen::deref(handle), - miopen::deref(outputGradDesc), - DataCast(output_grad), - miopen::deref(inputGradDesc), - DataCast(input_grad)); + 
miopen::adaptiveavgpool::AdaptiveAvgPoolBackward(miopen::deref(handle), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(inputGradDesc), + DataCast(input_grad)); }); } diff --git a/src/include/miopen/adaptiveavgpool.hpp b/src/include/miopen/adaptiveavgpool.hpp index 9f38a62d94..9902befb99 100644 --- a/src/include/miopen/adaptiveavgpool.hpp +++ b/src/include/miopen/adaptiveavgpool.hpp @@ -34,6 +34,8 @@ namespace miopen { struct Handle; struct TensorDescriptor; +namespace adaptiveavgpool { + MIOPEN_INTERNALS_EXPORT miopenStatus_t AdaptiveAvgPoolForward(Handle& handle, const TensorDescriptor& inputDesc, ConstData_t input, @@ -46,5 +48,8 @@ AdaptiveAvgPoolBackward(Handle& handle, ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad); + +} // namespace adaptiveavgpool + } // namespace miopen #endif // _MIOPEN_ADAPTIVEAVGPOOL_HPP_ diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp index e97c9ec0a9..19dfa7d5f9 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp @@ -73,6 +73,10 @@ bool AdaptiveAvgPoolBackward1d::IsApplicable( { return false; } + if(!(problem.GetInputGradDesc().GetType() == miopenFloat || + problem.GetInputGradDesc().GetType() == miopenHalf || + problem.GetInputGradDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp index dd8aeb9902..bc813dd7bf 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp @@ -89,6 +89,10 @@ bool AdaptiveAvgPoolBackward2d::IsApplicable( { return false; } + if(!(problem.GetInputGradDesc().GetType() == miopenFloat || + problem.GetInputGradDesc().GetType() == miopenHalf || + 
problem.GetInputGradDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp index b45d024c0b..d2073f4304 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp @@ -70,6 +70,10 @@ bool AdaptiveAvgPoolBackward3d::IsApplicable( { return false; } + if(!(problem.GetInputGradDesc().GetType() == miopenFloat || + problem.GetInputGradDesc().GetType() == miopenHalf || + problem.GetInputGradDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp index 3ad93574de..1dc63c5858 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp @@ -67,6 +67,10 @@ bool AdaptiveAvgPoolForward1d::IsApplicable( { return false; } + if(!(problem.GetInputDesc().GetType() == miopenFloat || + problem.GetInputDesc().GetType() == miopenHalf || + problem.GetInputDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp index 92c120494e..623485634a 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp @@ -73,6 +73,10 @@ bool AdaptiveAvgPoolForward2d::IsApplicable( { return false; } + if(!(problem.GetInputDesc().GetType() == miopenFloat || + problem.GetInputDesc().GetType() == miopenHalf || + problem.GetInputDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp index 481805cfa4..b4081849eb 100644 --- 
a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp @@ -73,6 +73,10 @@ bool AdaptiveAvgPoolForward3d::IsApplicable( { return false; } + if(!(problem.GetInputDesc().GetType() == miopenFloat || + problem.GetInputDesc().GetType() == miopenHalf || + problem.GetInputDesc().GetType() == miopenBFloat16)) + return false; return true; } diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index 8e2213dbf8..7f01813331 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -230,7 +230,7 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam(input, ref_output, N, C, D, H, W, OD, OH, OW); } - status = miopen::AdaptiveAvgPoolForward( + status = miopen::adaptiveavgpool::AdaptiveAvgPoolForward( handle, input.desc, input_dev.get(), output.desc, output_dev.get()); fflush(stdout); ASSERT_EQ(status, miopenStatusSuccess); @@ -349,7 +349,7 @@ struct AdaptiveAvgPoolTestBwd : public ::testing::TestWithParam( output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); } - status = miopen::AdaptiveAvgPoolBackward( + status = miopen::adaptiveavgpool::AdaptiveAvgPoolBackward( handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get()); ASSERT_EQ(status, miopenStatusSuccess); From d2b2d1f3c5353d3882b0c8f0feba89524b79220c Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 13:38:28 +0700 Subject: [PATCH 15/38] add issametype --- .../adaptiveavgpool/problem_description.hpp | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/include/miopen/adaptiveavgpool/problem_description.hpp b/src/include/miopen/adaptiveavgpool/problem_description.hpp index adec5759e7..2fda5f111b 100644 --- a/src/include/miopen/adaptiveavgpool/problem_description.hpp +++ b/src/include/miopen/adaptiveavgpool/problem_description.hpp @@ -43,6 +43,7 @@ struct FwdProblemDescription : ProblemDescriptionBase { IsValidLength(); IsValidDims(); 
+ IsSameType(); } auto GetInputDesc() const { return inputDesc; } @@ -122,6 +123,17 @@ struct FwdProblemDescription : ProblemDescriptionBase return isContiguous(inputDesc) && isContiguous(outputDesc); } + bool IsSameType() const + { + if(inputDesc.GetType() != outputDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input and output tensor types do not match."); + } + + return true; + } + NetworkConfig MakeNetworkConfig() const override; protected: @@ -137,6 +149,7 @@ struct BwdProblemDescription : ProblemDescriptionBase { IsValidLength(); IsValidDims(); + IsSameType(); } auto GetOutputGradDesc() const { return outputGradDesc; } @@ -216,6 +229,17 @@ struct BwdProblemDescription : ProblemDescriptionBase return isContiguous(inputGradDesc) && isContiguous(outputGradDesc); } + bool IsSameType() const + { + if(inputGradDesc.GetType() != outputGradDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "AdaptiveAvgPool: Input grad and output grad tensor types do not match."); + } + + return true; + } + NetworkConfig MakeNetworkConfig() const override; protected: From 0dc61de31075ddebd4712e95f00869ff877aaec7 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 13:46:28 +0700 Subject: [PATCH 16/38] rm ENV gtest --- test/gtest/adaptiveavgpool.cpp | 125 ++++++--------------------------- 1 file changed, 22 insertions(+), 103 deletions(-) diff --git a/test/gtest/adaptiveavgpool.cpp b/test/gtest/adaptiveavgpool.cpp index a548ada4cd..e12e327500 100644 --- a/test/gtest/adaptiveavgpool.cpp +++ b/test/gtest/adaptiveavgpool.cpp @@ -24,91 +24,30 @@ * *******************************************************************************/ #include "adaptiveavgpool.hpp" -#include - -MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) -MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) - -namespace adaptiveavgpool { - -std::string GetFloatArg() -{ - const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); - if(tmp.empty()) - { - return ""; - } - return tmp; -} - -struct 
GPU_AdaptiveAvgpool_fwd_FP32 : AdaptiveAvgPoolTestFwd -{ -}; - -struct GPU_AdaptiveAvgpool_fwd_FP16 : AdaptiveAvgPoolTestFwd -{ -}; - -struct GPU_AdaptiveAvgpool_fwd_BFP16 : AdaptiveAvgPoolTestFwd -{ -}; - -struct GPU_AdaptiveAvgpool_bwd_FP32 : AdaptiveAvgPoolTestBwd -{ -}; - -struct GPU_AdaptiveAvgpool_bwd_FP16 : AdaptiveAvgPoolTestBwd -{ -}; - -struct GPU_AdaptiveAvgpool_bwd_BFP16 : AdaptiveAvgPoolTestBwd -{ -}; - -} // namespace adaptiveavgpool -using namespace adaptiveavgpool; +#include "gtest/gtest.h" +using float16 = half_float::half; // FORWARD TEST +using GPU_AdaptiveAvgpool_fwd_FP32 = AdaptiveAvgPoolTestFwd; +using GPU_AdaptiveAvgpool_fwd_FP16 = AdaptiveAvgPoolTestFwd; +using GPU_AdaptiveAvgpool_fwd_BFP16 = AdaptiveAvgPoolTestFwd; + TEST_P(GPU_AdaptiveAvgpool_fwd_FP32, AdaptiveAvgPoolTestFwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_AdaptiveAvgpool_fwd_FP16, AdaptiveAvgPoolTestFwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_AdaptiveAvgpool_fwd_BFP16, AdaptiveAvgPoolTestFwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Smoke, @@ -122,46 +61,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdBfp16())); // BACKWARD TEST +using GPU_AdaptiveAvgpool_bwd_FP32 = AdaptiveAvgPoolTestBwd; +using GPU_AdaptiveAvgpool_bwd_FP16 = AdaptiveAvgPoolTestBwd; +using GPU_AdaptiveAvgpool_bwd_BFP16 = AdaptiveAvgPoolTestBwd; + TEST_P(GPU_AdaptiveAvgpool_bwd_FP32, AdaptiveAvgPoolTestBwd) { - if(!MIOPEN_TEST_ALL || - 
(env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_AdaptiveAvgpool_bwd_FP16, AdaptiveAvgPoolTestBwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_AdaptiveAvgpool_bwd_BFP16, AdaptiveAvgPoolTestBwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Smoke, From 194ab403b47bf314044c90169a70e68956c33237 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 13:57:34 +0700 Subject: [PATCH 17/38] rm magic number in driver test --- driver/adaptiveavgpool_driver.hpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index 1a9b1b6242..3a74a0aef5 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -438,13 +438,7 @@ int AdaptiveAvgPoolDriver::RunBackwardCPU() template Tref AdaptiveAvgPoolDriver::GetTolerance() { - // Computation error of fp16 is ~2^13 (=8192) bigger than - // the one of fp32 because mantissa is shorter by 13 bits. - auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; - - // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
- if(std::is_same::value) - tolerance *= 8.0; + Tref tolerance = std::numeric_limits::epsilon() * 10; return tolerance; } From 36b0662ebb17441223b77cbcbfafe4b02ff5b513 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 16:25:10 +0700 Subject: [PATCH 18/38] change to AddTensorFlag --- driver/adaptiveavgpool_driver.hpp | 53 ++++++------------------------- 1 file changed, 10 insertions(+), 43 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index 3a74a0aef5..afe2161720 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -61,7 +61,6 @@ class AdaptiveAvgPoolDriver : public Driver int ParseCmdLineArgs(int argc, char* argv[]) override; InputFlags& GetInputFlags() override { return inflags; } - std::vector GetInputTensorDimsFromCmd(const char* param); int GetandSetData() override; int AllocateBuffersAndCopy() override; @@ -107,8 +106,8 @@ class AdaptiveAvgPoolDriver : public Driver size_t N = 1, C = 1, D = 1, H = 1, W = 1, OD = 1, OH = 1, OW = 1; - std::vector in_dim; - std::vector out_dim; + std::vector in_dim; + std::vector out_dim; bool isContiguous; }; @@ -125,42 +124,12 @@ int AdaptiveAvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) return miopenStatusSuccess; } -template -std::vector AdaptiveAvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) -{ - std::string lengthsStr = inflags.GetValueStr(param); - - std::vector lengths; - std::size_t pos = 0; - std::size_t new_pos; - - new_pos = lengthsStr.find(',', pos); - while(new_pos != std::string::npos) - { - std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); - - int len = std::stoi(sliceStr); - - lengths.push_back(len); - - pos = new_pos + 1; - new_pos = lengthsStr.find(',', pos); - }; - - std::string sliceStr = lengthsStr.substr(pos); - int len = std::stoi(sliceStr); - - lengths.push_back(static_cast(len)); - - return (lengths); -} - template int AdaptiveAvgPoolDriver::GetandSetData() { - in_dim = 
GetInputTensorDimsFromCmd("input_dims"); + in_dim = inflags.GetValueTensor("input_dims").lengths; std::vector in_stride = ComputeStrides(in_dim); - out_dim = GetInputTensorDimsFromCmd("output_dims"); + out_dim = inflags.GetValueTensor("output_dims").lengths; if(in_dim.size() != out_dim.size() + 2) { MIOPEN_THROW(miopenStatusBadParm, @@ -227,18 +196,16 @@ template int AdaptiveAvgPoolDriver::AddCmdLineArgs() { inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AdaptiveAvgPool (Default=1)", "int"); - inflags.AddInputFlag( + inflags.AddTensorFlag( "input_dims", 'D', - "2,3,7,9,9", - "The dimensional lengths of the input tensor: N,C,D,H,W... Example: 2,3,7,9,9.", - "string"); - inflags.AddInputFlag( + "2x3x7x9x9", + "The dimensional lengths of the input tensor: N,C,D,H,W... Example: 2,3,7,9,9."); + inflags.AddTensorFlag( "output_dims", 'S', - "5,5,5", - "The dimensional lengths of the output tensor: OD,OH,OW,... Example: 5,5,5.", - "string"); + "5x5x5", + "The dimensional lengths of the output tensor: OD,OH,OW,... 
Example: 5,5,5."); inflags.AddInputFlag("is-contiguous", 'c', "1", "is-contiguous (Default=1)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); From 99ff4bb5996cf21aa7ccc706130f3761d4332aac Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 17:23:41 +0700 Subject: [PATCH 19/38] small fix --- driver/adaptiveavgpool_driver.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index afe2161720..724397a9e6 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -56,7 +56,7 @@ class AdaptiveAvgPoolDriver : public Driver data_type = miopen_type{}; } - std::vector ComputeStrides(std::vector input); + std::vector ComputeStrides(std::vector input); int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; InputFlags& GetInputFlags() override { return inflags; } @@ -127,17 +127,17 @@ int AdaptiveAvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) template int AdaptiveAvgPoolDriver::GetandSetData() { - in_dim = inflags.GetValueTensor("input_dims").lengths; - std::vector in_stride = ComputeStrides(in_dim); - out_dim = inflags.GetValueTensor("output_dims").lengths; + in_dim = inflags.GetValueTensor("input_dims").lengths; + std::vector in_stride = ComputeStrides(in_dim); + out_dim = inflags.GetValueTensor("output_dims").lengths; if(in_dim.size() != out_dim.size() + 2) { MIOPEN_THROW(miopenStatusBadParm, "AdaptiveAvgPool: Input and output tensor sizes do not match."); } - N = in_dim[0]; - C = in_dim[1]; - std::vector out_dim_final = {N, C}; + N = in_dim[0]; + C = in_dim[1]; + std::vector out_dim_final = {N, C}; if(in_dim.size() == 3) { H = in_dim[2]; @@ -168,7 +168,7 @@ int AdaptiveAvgPoolDriver::GetandSetData() out_dim_final.push_back(OH); out_dim_final.push_back(OW); } - 
std::vector out_grad_stride = ComputeStrides(out_dim_final); + std::vector out_grad_stride = ComputeStrides(out_dim_final); SetTensorNd(inputDesc, in_dim, in_stride, data_type); SetTensorNd(outputDesc, out_dim_final, data_type); SetTensorNd(outputGradDesc, out_dim_final, out_grad_stride, data_type); @@ -179,11 +179,11 @@ int AdaptiveAvgPoolDriver::GetandSetData() // Equivalent to: tensor.tranpose(0, -1).contiguous().tranpose(0, -1) incase contiguous = False template -std::vector AdaptiveAvgPoolDriver::ComputeStrides(std::vector inputDim) +std::vector AdaptiveAvgPoolDriver::ComputeStrides(std::vector inputDim) { if(!isContiguous) std::swap(inputDim.front(), inputDim.back()); - std::vector strides(inputDim.size()); + std::vector strides(inputDim.size()); strides.back() = 1; for(int i = inputDim.size() - 2; i >= 0; --i) strides[i] = strides[i + 1] * inputDim[i + 1]; From 42b54a7b97166d6d9bc48f94f8f8b3306d273d01 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 10:28:12 +0700 Subject: [PATCH 20/38] fix driver as comments --- driver/CMakeLists.txt | 2 +- driver/adaptiveavgpool_driver.hpp | 145 ++++++++++++++--------- driver/mloAdaptiveAvgPoolHost.hpp | 29 ++--- src/include/miopen/tensor_view_utils.hpp | 1 - 4 files changed, 104 insertions(+), 73 deletions(-) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 3eeb7d4d42..ea309e87ed 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -31,8 +31,8 @@ add_executable(MIOpenDriver conv_common.cpp dm_activ.cpp dm_adam.cpp - dm_addlayernorm.cpp dm_adaptiveavgpool.cpp + dm_addlayernorm.cpp dm_bnorm.cpp dm_cat.cpp dm_conv.cpp diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index 724397a9e6..65badc5fa2 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -23,8 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -#ifndef GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP -#define GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP +#pragma once #include "InputFlags.hpp" #include "driver.hpp" @@ -169,10 +168,16 @@ int AdaptiveAvgPoolDriver::GetandSetData() out_dim_final.push_back(OW); } std::vector out_grad_stride = ComputeStrides(out_dim_final); - SetTensorNd(inputDesc, in_dim, in_stride, data_type); - SetTensorNd(outputDesc, out_dim_final, data_type); - SetTensorNd(outputGradDesc, out_dim_final, out_grad_stride, data_type); - SetTensorNd(inputGradDesc, in_dim, data_type); + if(SetTensorNd(inputDesc, in_dim, in_stride, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input_dims") + "."); + if(SetTensorNd(outputDesc, out_dim_final, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing output tensor: " + inflags.GetValueStr("output_dims") + "."); + if(SetTensorNd(outputGradDesc, out_dim_final, out_grad_stride, data_type) != + miopenStatusSuccess) + MIOPEN_THROW("Error parsing output grad tensor: " + inflags.GetValueStr("output_dims") + + "."); + if(SetTensorNd(inputGradDesc, in_dim, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input grad tensor: " + inflags.GetValueStr("input_dims") + "."); return miopenStatusSuccess; } @@ -200,12 +205,12 @@ int AdaptiveAvgPoolDriver::AddCmdLineArgs() "input_dims", 'D', "2x3x7x9x9", - "The dimensional lengths of the input tensor: N,C,D,H,W... Example: 2,3,7,9,9."); + "The dimensional lengths of the input tensor: N,C,D,H,W... Example: 2x3x7x9x9."); inflags.AddTensorFlag( "output_dims", 'S', "5x5x5", - "The dimensional lengths of the output tensor: OD,OH,OW,... Example: 5,5,5."); + "The dimensional lengths of the output tensor: OD,OH,OW,... 
Example: 5x5x5."); inflags.AddInputFlag("is-contiguous", 'c', "1", "is-contiguous (Default=1)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); @@ -272,8 +277,9 @@ int AdaptiveAvgPoolDriver::RunForwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenAdaptiveAvgPoolForward( + auto status = miopenAdaptiveAvgPoolForward( GetHandle(), inputDesc, input_dev->GetMem(), outputDesc, output_dev->GetMem()); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenAdaptiveAvgPoolForward"); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -287,15 +293,21 @@ int AdaptiveAvgPoolDriver::RunForwardGPU() STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) - printf("Wall-clock Time Forward AdaptiveAvgPool Elapsed: %f ms\n", - t.gettime_ms() / iter); + std::cout << "Wall-clock Time Forward AdaptiveAvgPool Elapsed: " + << t.gettime_ms() / iter << " ms" << std::endl; float kernel_average_time = iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - printf("GPU Kernel Time Forward AdaptiveAvgPool Elapsed: %f ms\n", kernel_average_time); + std::cout << "GPU Kernel Time Forward AdaptiveAvgPool Elapsed: " << kernel_average_time + << " ms" << std::endl; } - output_dev->FromGPU(GetStream(), output.data()); + if(output_dev->FromGPU(GetStream(), output.data()) != 0) + { + std::cerr << "Error copying (output_dev) from GPU, size: " << output_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } return miopenStatusSuccess; } @@ -303,22 +315,30 @@ int AdaptiveAvgPoolDriver::RunForwardGPU() template int AdaptiveAvgPoolDriver::RunForwardCPU() { + int status = miopenStatusSuccess; + if(in_dim.size() == 3) { - mloAdaptiveAvgPoolForward1dRunHost( + status = mloAdaptiveAvgPoolForward1dRunHost( inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, OH); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolForward1dRunHost"); } else if(in_dim.size() == 4) { - mloAdaptiveAvgPoolForward2dRunHost( + status = mloAdaptiveAvgPoolForward2dRunHost( inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, W, OH, OW); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolForward2dRunHost"); } else if(in_dim.size() == 5) { - mloAdaptiveAvgPoolForward3dRunHost( + status = mloAdaptiveAvgPoolForward3dRunHost( inputDesc, outputDesc, input.data(), output_host.data(), N, C, D, H, W, OD, OH, OW); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolForward3dRunHost"); } - return miopenStatusSuccess; + return status; } template @@ -332,11 +352,12 @@ int AdaptiveAvgPoolDriver::RunBackwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenAdaptiveAvgPoolBackward(GetHandle(), - outputGradDesc, - output_grad_dev->GetMem(), - inputGradDesc, - input_grad_dev->GetMem()); + auto status = miopenAdaptiveAvgPoolBackward(GetHandle(), + outputGradDesc, + 
output_grad_dev->GetMem(), + inputGradDesc, + input_grad_dev->GetMem()); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenAdaptiveAvgPoolBackward"); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -350,15 +371,21 @@ int AdaptiveAvgPoolDriver::RunBackwardGPU() STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) - printf("Wall-clock Time Backward AdaptiveAvgPool Elapsed: %f ms\n", - t.gettime_ms() / iter); + std::cout << "Wall-clock Time Backward AdaptiveAvgPool Elapsed: " + << t.gettime_ms() / iter << " ms" << std::endl; float kernel_average_time = iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - printf("GPU Kernel Time Backward AdaptiveAvgPool Elapsed: %f ms\n", kernel_average_time); + std::cout << "GPU Kernel Time Backward AdaptiveAvgPool Elapsed: " << kernel_average_time + << " ms" << std::endl; } - input_grad_dev->FromGPU(GetStream(), input_grad.data()); + if(input_grad_dev->FromGPU(GetStream(), input_grad.data()) != 0) + { + std::cerr << "Error copying (input_grad_dev) from GPU, size: " << input_grad_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } return miopenStatusSuccess; } @@ -366,40 +393,48 @@ int AdaptiveAvgPoolDriver::RunBackwardGPU() template int AdaptiveAvgPoolDriver::RunBackwardCPU() { + int status = miopenStatusSuccess; + if(in_dim.size() == 3) { - mloAdaptiveAvgPoolBackward1dRunHost( + status = mloAdaptiveAvgPoolBackward1dRunHost( outputGradDesc, inputGradDesc, output_grad.data(), input_grad_host.data(), N, C, H, OH); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolBackward1dRunHost"); } else if(in_dim.size() == 4) { - mloAdaptiveAvgPoolBackward2dRunHost(outputGradDesc, - inputGradDesc, - output_grad.data(), - input_grad_host.data(), - N, - C, - H, - W, - OH, - OW); + status = mloAdaptiveAvgPoolBackward2dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + H, + W, + OH, 
+ OW); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolBackward2dRunHost"); } else if(in_dim.size() == 5) { - mloAdaptiveAvgPoolBackward3dRunHost(outputGradDesc, - inputGradDesc, - output_grad.data(), - input_grad_host.data(), - N, - C, - D, - H, - W, - OD, - OH, - OW); + status = mloAdaptiveAvgPoolBackward3dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW); + MIOPEN_THROW_IF(status != miopenStatusSuccess, + "Error in mloAdaptiveAvgPoolBackward3dRunHost"); } - return miopenStatusSuccess; + return status; } template @@ -423,7 +458,8 @@ int AdaptiveAvgPoolDriver::VerifyForward() } else { - printf("Forward AdaptiveAvgPool Verifies on CPU and GPU (err=%f)\n", error); + std::cout << "Forward AdaptiveAvgPool Verifies on CPU and GPU (err=" << error << ")" + << std::endl; } return miopenStatusSuccess; @@ -439,13 +475,12 @@ int AdaptiveAvgPoolDriver::VerifyBackward() if(!std::isfinite(error) || error > tolerance) { std::cout << "Backward AdaptiveAvgPool FAILED: " << error << std::endl; - return EC_VerifyFwd; + return EC_VerifyBwd; } else { - printf("Backward AdaptiveAvgPool Verifies on CPU and GPU (err=%f)\n", error); + std::cout << "Backward AdaptiveAvgPool Verifies on CPU and GPU (err=" << error << ")" + << std::endl; } return miopenStatusSuccess; } - -#endif // GUARD_MIOPEN_ADAPTIVEAVGPOOL_DRIVER_HPP diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 8bd435f415..21e758f494 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -23,8 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -#ifndef MLO_ADAPTIVEAVGPOOLHOST_H_ -#define MLO_ADAPTIVEAVGPOOLHOST_H_ +#pragma once #include #include @@ -52,7 +51,7 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD size_t n = nc / C, c = nc % C; if(n >= N) - return 0; + return miopenStatusSuccess; size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; @@ -65,7 +64,7 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD output[output_tv.get_tensor_view_idx({n, c, oh})] = static_cast(sum / kh); } - return 0; + return miopenStatusSuccess; } template @@ -93,7 +92,7 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD size_t n = nc / C, c = nc % C; if(n >= N) - return 0; + return miopenStatusSuccess; size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; @@ -113,7 +112,7 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(sum / divider); } - return 0; + return miopenStatusSuccess; } template @@ -144,7 +143,7 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD size_t n = nc / C, c = nc % C; if(n >= N) - return 0; + return miopenStatusSuccess; size_t d = static_cast(std::floor(static_cast(od * D) / OD)); size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; @@ -171,7 +170,7 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = static_cast(sum / (kd * kh * kw)); } - return 0; + return miopenStatusSuccess; } template @@ -196,7 +195,7 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu size_t n = nc / C, c = nc % C; if(n >= 
N) - return 0; + return miopenStatusSuccess; size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -212,7 +211,7 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = static_cast(grad); } - return 0; + return miopenStatusSuccess; } template @@ -240,7 +239,7 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu size_t n = nc / C, c = nc % C; if(n >= N) - return 0; + return miopenStatusSuccess; size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -265,7 +264,7 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); } - return 0; + return miopenStatusSuccess; } template @@ -296,7 +295,7 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu size_t n = nc / C, c = nc % C; if(n >= N) - return 0; + return miopenStatusSuccess; size_t od = static_cast(std::floor(static_cast(d * OD) / D)); size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; @@ -331,7 +330,5 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); } - return 0; + return miopenStatusSuccess; } - -#endif // MLO_ADAPTIVEAVGPOOLHOST_H_ diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index d4f3aa4163..1b095affb7 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -30,7 +30,6 @@ #include #include #include "../../kernels/tensor_view.hpp" -#include "miopen/tensor.hpp" namespace miopen { From 22ae98aa8e06d6c82bbf977ea69ba4ab7a1534f9 Mon Sep 17 00:00:00 2001 From: 
hieule88 Date: Mon, 7 Oct 2024 10:55:35 +0700 Subject: [PATCH 21/38] for to parford --- driver/mloAdaptiveAvgPoolHost.hpp | 49 ++++++++----------------------- 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 21e758f494..38088cf09e 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -28,6 +28,7 @@ #include #include #include +#include <../test/ford.hpp> template int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputDesc, @@ -45,14 +46,10 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD auto input_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(inputDesc)); auto output_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(outputDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nc = gid / OH, oh = gid % OH; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; @@ -63,7 +60,7 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD } output[output_tv.get_tensor_view_idx({n, c, oh})] = static_cast(sum / kh); - } + }); return miopenStatusSuccess; } @@ -85,15 +82,11 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncoh = gid / OW, ow = gid % OW; size_t nc = ncoh / OH, oh = ncoh % OH; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - 
h; @@ -111,7 +104,7 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD } output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(sum / divider); - } + }); return miopenStatusSuccess; } @@ -135,16 +128,12 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncodoh = gid / OW, ow = gid % OW; size_t ncod = ncodoh / OH, oh = ncodoh % OH; size_t nc = ncod / OD, od = ncod % OD; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t d = static_cast(std::floor(static_cast(od * D) / OD)); size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; @@ -169,7 +158,7 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = static_cast(sum / (kd * kh * kw)); - } + }); return miopenStatusSuccess; } @@ -189,14 +178,10 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu auto output_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(outputGradDesc)); auto input_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(inputGradDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nc = gid / H, h = gid % H; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -210,7 +195,7 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu kh; } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = static_cast(grad); - } + }); return miopenStatusSuccess; } @@ -232,15 +217,11 @@ int32_t 
mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nch = gid / W, w = gid % W; size_t nc = nch / H, h = nch % H; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -263,7 +244,7 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); - } + }); return miopenStatusSuccess; } @@ -287,16 +268,12 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncdh = gid / W, w = gid % W; size_t ncd = ncdh / H, h = ncdh % H; size_t nc = ncd / D, d = ncd % D; size_t n = nc / C, c = nc % C; - if(n >= N) - return miopenStatusSuccess; - size_t od = static_cast(std::floor(static_cast(d * OD) / D)); size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; @@ -329,6 +306,6 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu } input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); - } + }); return miopenStatusSuccess; } From 6ef45f6f51a90b120c15d5d84684c3894c4f31c7 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 14:02:38 +0700 Subject: [PATCH 22/38] resolved comments --- src/CMakeLists.txt | 6 +- src/include/miopen/adaptiveavgpool.hpp | 6 +- 
.../miopen/adaptiveavgpool/invoke_params.hpp | 2 +- .../adaptiveavgpool/problem_description.hpp | 31 +- .../miopen/adaptiveavgpool/solvers.hpp | 14 +- src/kernels/MIOpenAdaptiveAvgPool.cpp | 297 +++++++++--------- src/kernels/tensor_view.hpp | 1 + src/solver.cpp | 2 +- .../backward_adaptiveavgpool_1d.cpp | 18 +- .../backward_adaptiveavgpool_2d.cpp | 22 +- .../backward_adaptiveavgpool_3d.cpp | 26 +- .../forward_adaptiveavgpool_1d.cpp | 18 +- .../forward_adaptiveavgpool_2d.cpp | 22 +- .../forward_adaptiveavgpool_3d.cpp | 26 +- test/cpu_adaptiveavgpool.hpp | 54 +--- test/gtest/adaptiveavgpool.hpp | 8 +- 16 files changed, 246 insertions(+), 307 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 416f5ef4e3..4c219e3d2c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -87,10 +87,10 @@ set( MIOpen_Source activ_api.cpp adam/problem_description.cpp adam_api.cpp - addlayernorm_api.cpp - api/find2_0_commons.cpp adaptiveavgpool_api.cpp adaptiveavgpool/problem_description.cpp + addlayernorm_api.cpp + api/find2_0_commons.cpp batch_norm.cpp batch_norm_api.cpp batchnorm/problem_description.cpp @@ -659,8 +659,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN list(APPEND MIOpen_Source activ.cpp adam.cpp - addlayernorm.cpp adaptiveavgpool.cpp + addlayernorm.cpp cat.cpp groupnorm.cpp getitem.cpp diff --git a/src/include/miopen/adaptiveavgpool.hpp b/src/include/miopen/adaptiveavgpool.hpp index 9902befb99..7f04af7b8d 100644 --- a/src/include/miopen/adaptiveavgpool.hpp +++ b/src/include/miopen/adaptiveavgpool.hpp @@ -23,10 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -#include -#ifndef MIOPEN_ADAPTIVEAVGPOOL_HPP_ -#define MIOPEN_ADAPTIVEAVGPOOL_HPP_ - +#pragma once #include namespace miopen { @@ -52,4 +49,3 @@ AdaptiveAvgPoolBackward(Handle& handle, } // namespace adaptiveavgpool } // namespace miopen -#endif // _MIOPEN_ADAPTIVEAVGPOOL_HPP_ diff --git a/src/include/miopen/adaptiveavgpool/invoke_params.hpp b/src/include/miopen/adaptiveavgpool/invoke_params.hpp index e97a66a427..b9a30f7236 100644 --- a/src/include/miopen/adaptiveavgpool/invoke_params.hpp +++ b/src/include/miopen/adaptiveavgpool/invoke_params.hpp @@ -26,7 +26,7 @@ #pragma once -#include "miopen/common.hpp" +#include #include #include diff --git a/src/include/miopen/adaptiveavgpool/problem_description.hpp b/src/include/miopen/adaptiveavgpool/problem_description.hpp index 2fda5f111b..d8b112e46e 100644 --- a/src/include/miopen/adaptiveavgpool/problem_description.hpp +++ b/src/include/miopen/adaptiveavgpool/problem_description.hpp @@ -106,22 +106,7 @@ struct FwdProblemDescription : ProblemDescriptionBase return true; } - bool IsAllContiguous() const - { - auto isContiguous = [](TensorDescriptor td) { - size_t s = 1; - for(int i = td.GetNumDims() - 1; i >= 0; --i) - { - if(s != td.GetStrides()[i]) - { - return false; - } - s *= td.GetLengths()[i]; - } - return true; - }; - return isContiguous(inputDesc) && isContiguous(outputDesc); - } + bool IsAllContiguous() const { return inputDesc.IsContiguous() && outputDesc.IsContiguous(); } bool IsSameType() const { @@ -214,19 +199,7 @@ struct BwdProblemDescription : ProblemDescriptionBase bool IsAllContiguous() const { - auto isContiguous = [](TensorDescriptor td) { - size_t s = 1; - for(int i = td.GetNumDims() - 1; i >= 0; --i) - { - if(s != td.GetStrides()[i]) - { - return false; - } - s *= td.GetLengths()[i]; - } - return true; - }; - return isContiguous(inputGradDesc) && isContiguous(outputGradDesc); + return inputGradDesc.IsContiguous() && 
outputGradDesc.IsContiguous(); } bool IsSameType() const diff --git a/src/include/miopen/adaptiveavgpool/solvers.hpp b/src/include/miopen/adaptiveavgpool/solvers.hpp index ce2419527a..980bb1a330 100644 --- a/src/include/miopen/adaptiveavgpool/solvers.hpp +++ b/src/include/miopen/adaptiveavgpool/solvers.hpp @@ -26,18 +26,20 @@ #pragma once -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" +#include +#include #include #include -#include "miopen/kernel_build_params.hpp" -#include "miopen/kernel_info.hpp" -#include "miopen/mlo_internal.hpp" +#include +#include +#include namespace miopen { namespace solver { +namespace adaptiveavgpool { + const auto make_hip_kernel = [](std::vector localsize, std::vector gridsize, std::string kernel_file, @@ -53,8 +55,6 @@ const auto make_hip_kernel = [](std::vector localsize, build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; }; -namespace adaptiveavgpool { - using AdaptiveAvgPoolForward = NonTunableSolverBase; diff --git a/src/kernels/MIOpenAdaptiveAvgPool.cpp b/src/kernels/MIOpenAdaptiveAvgPool.cpp index 17877fdf0c..8d26ea0301 100644 --- a/src/kernels/MIOpenAdaptiveAvgPool.cpp +++ b/src/kernels/MIOpenAdaptiveAvgPool.cpp @@ -31,35 +31,27 @@ #include "float_types.h" #include "tensor_view.hpp" -#ifndef INPUT_TYPE -#define INPUT_TYPE float -#endif - -#ifndef OUTPUT_TYPE -#define OUTPUT_TYPE float -#endif - template __device__ void adaptiveAvgPoolForward1d(const TI* __restrict__ input, TO* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t OH, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t OH, tensor_view_t<3> input_tv, tensor_view_t<3> output_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t nc = gid / OH, oh = gid % OH; - size_t n = nc / C, c = nc % C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t nc = gid / OH, oh = gid % OH; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t h = 
static_cast(floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); + uint64_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; FLOAT_ACCUM sum = 0; - for(size_t ih = h; ih < (h + kh); ++ih) + for(uint64_t ih = h; ih < (h + kh); ++ih) { sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, ih})]); } @@ -67,10 +59,10 @@ __device__ void adaptiveAvgPoolForward1d(const TI* __restrict__ input, } extern "C" __global__ void AdaptiveAvgPoolForward1d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t OH, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t OH, tensor_view_t<3> input_tv, tensor_view_t<3> output_tv) { @@ -81,27 +73,27 @@ extern "C" __global__ void AdaptiveAvgPoolForward1d(const INPUT_TYPE* __restrict template __device__ void adaptiveAvgPoolBackward1d(const TI* __restrict__ output_grad, TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t OH, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t OH, tensor_view_t<3> output_grad_tv, tensor_view_t<3> input_grad_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t nc = gid / H, h = gid % H; - size_t n = nc / C, c = nc % C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t nc = gid / H, h = gid % H; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t oh = static_cast(floor(static_cast(h * OH) / H)); - size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); + uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; FLOAT_ACCUM grad = 0; - for(size_t ih = oh; ih < (oh + koh); ++ih) + for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); + uint64_t kh = 
static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / kh; } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = CVT_ACCUM2FLOAT(grad); @@ -109,10 +101,10 @@ __device__ void adaptiveAvgPoolBackward1d(const TI* __restrict__ output_grad, extern "C" __global__ void AdaptiveAvgPoolBackward1d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t OH, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t OH, tensor_view_t<3> output_grad_tv, tensor_view_t<3> input_grad_tv) { @@ -123,34 +115,34 @@ extern "C" __global__ void AdaptiveAvgPoolBackward1d(const INPUT_TYPE* __restric template __device__ void adaptiveAvgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t W, + uint64_t OH, + uint64_t OW, tensor_view_t<4> input_tv, tensor_view_t<4> output_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t ncoh = gid / OW, ow = gid % OW; - size_t nc = ncoh / OH, oh = ncoh % OH; - size_t n = nc / C, c = nc % C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t ncoh = gid / OW, ow = gid % OW; + uint64_t nc = ncoh / OH, oh = ncoh % OH; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t h = static_cast(floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); + uint64_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; - size_t w = static_cast(floor(static_cast(ow * W) / OW)); - size_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; + uint64_t w = static_cast(floor(static_cast(ow * W) / OW)); + uint64_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; 
FLOAT_ACCUM divider = static_cast(kh * kw); FLOAT_ACCUM sum = 0; - for(size_t ih = h; ih < (h + kh); ++ih) + for(uint64_t ih = h; ih < (h + kh); ++ih) { - for(size_t iw = w; iw < (w + kw); ++iw) + for(uint64_t iw = w; iw < (w + kw); ++iw) { sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, ih, iw})]); } @@ -160,12 +152,12 @@ __device__ void adaptiveAvgPoolForward2d(const TI* __restrict__ input, extern "C" __global__ void AdaptiveAvgPoolForward2d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t W, + uint64_t OH, + uint64_t OW, tensor_view_t<4> input_tv, tensor_view_t<4> output_tv) { @@ -176,38 +168,38 @@ extern "C" __global__ void AdaptiveAvgPoolForward2d(const INPUT_TYPE* __restrict template __device__ void adaptiveAvgPoolBackward2d(const TI* __restrict__ output_grad, TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t W, + uint64_t OH, + uint64_t OW, tensor_view_t<4> output_grad_tv, tensor_view_t<4> input_grad_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t nch = gid / W, w = gid % W; - size_t nc = nch / H, h = nch % H; - size_t n = nc / C, c = nc % C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t nch = gid / W, w = gid % W; + uint64_t nc = nch / H, h = nch % H; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t oh = static_cast(floor(static_cast(h * OH) / H)); - size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); + uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; - size_t ow = static_cast(floor(static_cast(w * OW) / W)); - size_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; + uint64_t ow = static_cast(floor(static_cast(w * OW) / W)); 
+ uint64_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; FLOAT_ACCUM grad = 0; - for(size_t ih = oh; ih < (oh + koh); ++ih) + for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); - for(size_t iw = ow; iw < (ow + kow); ++iw) + uint64_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); + for(uint64_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(floor(static_cast(iw * W) / OW)); + uint64_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(floor(static_cast(iw * W) / OW)); grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / (kh * kw); @@ -219,12 +211,12 @@ __device__ void adaptiveAvgPoolBackward2d(const TI* __restrict__ output_grad, extern "C" __global__ void AdaptiveAvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t H, + uint64_t W, + uint64_t OH, + uint64_t OW, tensor_view_t<4> output_grad_tv, tensor_view_t<4> input_grad_tv) { @@ -235,40 +227,40 @@ extern "C" __global__ void AdaptiveAvgPoolBackward2d(const INPUT_TYPE* __restric template __device__ void adaptiveAvgPoolForward3d(const TI* __restrict__ input, TO* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t D, + uint64_t H, + uint64_t W, + uint64_t OD, + uint64_t OH, + uint64_t OW, tensor_view_t<5> input_tv, tensor_view_t<5> output_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t ncodoh = gid / OW, ow = gid % OW; - size_t ncod = ncodoh / OH, oh = ncodoh % OH; - size_t nc = ncod / OD, od = ncod % OD; - size_t n = nc / C, c = nc % 
C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t ncodoh = gid / OW, ow = gid % OW; + uint64_t ncod = ncodoh / OH, oh = ncodoh % OH; + uint64_t nc = ncod / OD, od = ncod % OD; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t d = static_cast(floor(static_cast(od * D) / OD)); - size_t kd = static_cast(ceil(static_cast((od + 1) * D) / OD)) - d; + uint64_t d = static_cast(floor(static_cast(od * D) / OD)); + uint64_t kd = static_cast(ceil(static_cast((od + 1) * D) / OD)) - d; - size_t h = static_cast(floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); + uint64_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; - size_t w = static_cast(floor(static_cast(ow * W) / OW)); - size_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; + uint64_t w = static_cast(floor(static_cast(ow * W) / OW)); + uint64_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; FLOAT_ACCUM sum = 0; - for(size_t id = d; id < (d + kd); ++id) + for(uint64_t id = d; id < (d + kd); ++id) { - for(size_t ih = h; ih < (h + kh); ++ih) + for(uint64_t ih = h; ih < (h + kh); ++ih) { - for(size_t iw = w; iw < (w + kw); ++iw) + for(uint64_t iw = w; iw < (w + kw); ++iw) { sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, id, ih, iw})]); } @@ -281,14 +273,14 @@ __device__ void adaptiveAvgPoolForward3d(const TI* __restrict__ input, extern "C" __global__ void AdaptiveAvgPoolForward3d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t D, + uint64_t H, + uint64_t W, + uint64_t OD, + uint64_t OH, + uint64_t OW, tensor_view_t<5> input_tv, tensor_view_t<5> output_tv) { @@ -299,48 +291,49 @@ extern "C" __global__ void AdaptiveAvgPoolForward3d(const INPUT_TYPE* __restrict template 
__device__ void adaptiveAvgPoolBackward3d(const TI* __restrict__ output_grad, TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t D, + uint64_t H, + uint64_t W, + uint64_t OD, + uint64_t OH, + uint64_t OW, tensor_view_t<5> output_grad_tv, tensor_view_t<5> input_grad_tv) { - size_t gid = threadIdx.x + blockIdx.x * blockDim.x; - size_t ncdh = gid / W, w = gid % W; - size_t ncd = ncdh / H, h = ncdh % H; - size_t nc = ncd / D, d = ncd % D; - size_t n = nc / C, c = nc % C; + uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + uint64_t ncdh = gid / W, w = gid % W; + uint64_t ncd = ncdh / H, h = ncdh % H; + uint64_t nc = ncd / D, d = ncd % D; + uint64_t n = nc / C, c = nc % C; if(n >= N) return; - size_t od = static_cast(floor(static_cast(d * OD) / D)); - size_t kod = static_cast(ceil(static_cast((d + 1) * OD) / D)) - od; + uint64_t od = static_cast(floor(static_cast(d * OD) / D)); + uint64_t kod = static_cast(ceil(static_cast((d + 1) * OD) / D)) - od; - size_t oh = static_cast(floor(static_cast(h * OH) / H)); - size_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); + uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; - size_t ow = static_cast(floor(static_cast(w * OW) / W)); - size_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; + uint64_t ow = static_cast(floor(static_cast(w * OW) / W)); + uint64_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; FLOAT_ACCUM grad = 0; - for(size_t id = od; id < (od + kod); ++id) + for(uint64_t id = od; id < (od + kod); ++id) { - size_t kd = static_cast(ceil(static_cast((id + 1) * D) / OD)) - - static_cast(floor(static_cast(id * D) / OD)); - for(size_t ih = oh; ih < (oh + koh); ++ih) + uint64_t kd = static_cast(ceil(static_cast((id + 1) * D) / OD)) - + static_cast(floor(static_cast(id * D) / OD)); + 
for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); - for(size_t iw = ow; iw < (ow + kow); ++iw) + uint64_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - + static_cast(floor(static_cast(ih * H) / OH)); + for(uint64_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(floor(static_cast(iw * W) / OW)); + uint64_t kw = + static_cast(ceil(static_cast((iw + 1) * W) / OW)) - + static_cast(floor(static_cast(iw * W) / OW)); grad += CVT_FLOAT2ACCUM( output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / (kd * kh * kw); @@ -353,14 +346,14 @@ __device__ void adaptiveAvgPoolBackward3d(const TI* __restrict__ output_grad, extern "C" __global__ void AdaptiveAvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, + uint64_t N, + uint64_t C, + uint64_t D, + uint64_t H, + uint64_t W, + uint64_t OD, + uint64_t OH, + uint64_t OW, tensor_view_t<5> output_grad_tv, tensor_view_t<5> input_grad_tv) { diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index 12394dbde6..c9357dd729 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -49,6 +49,7 @@ struct tensor_view_t uint64_t stride[N]; uint64_t size[N]; }; + template struct tensor_layout_t { diff --git a/src/solver.cpp b/src/solver.cpp index 90ea7c263e..62ba83cda2 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -24,9 +24,9 @@ * *******************************************************************************/ -#include "miopen/adaptiveavgpool/solvers.hpp" #include #include +#include #include #include #include diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp index 
19dfa7d5f9..700029db10 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -90,7 +90,7 @@ ConvSolution AdaptiveAvgPoolBackward1d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); auto dtype = problem.GetInputGradDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -115,10 +115,10 @@ ConvSolution AdaptiveAvgPoolBackward1d::GetSolution( auto input_grad_tv = get_inner_expanded_tv<3>(deref(params.inputGradDesc)); auto output_grad_tv = get_inner_expanded_tv<3>(deref(params.outputGradDesc)); - auto N = deref(params.inputGradDesc).GetLengths()[0]; - auto C = deref(params.inputGradDesc).GetLengths()[1]; - auto H = deref(params.inputGradDesc).GetLengths()[2]; - auto OH = deref(params.outputGradDesc).GetLengths()[2]; + uint64_t N = deref(params.inputGradDesc).GetLengths()[0]; + uint64_t C = deref(params.inputGradDesc).GetLengths()[1]; + uint64_t H = deref(params.inputGradDesc).GetLengths()[2]; + uint64_t OH = deref(params.outputGradDesc).GetLengths()[2]; kernel( params.output_grad, params.input_grad, N, C, H, OH, output_grad_tv, input_grad_tv); diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp index bc813dd7bf..8d3e78eb27 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp +++ 
b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -106,7 +106,7 @@ ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); auto dtype = problem.GetInputGradDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -131,12 +131,12 @@ ConvSolution AdaptiveAvgPoolBackward2d::GetSolution( auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); - auto N = deref(params.inputGradDesc).GetLengths()[0]; - auto C = deref(params.inputGradDesc).GetLengths()[1]; - auto H = deref(params.inputGradDesc).GetLengths()[2]; - auto W = deref(params.inputGradDesc).GetLengths()[3]; - auto OH = deref(params.outputGradDesc).GetLengths()[2]; - auto OW = deref(params.outputGradDesc).GetLengths()[3]; + uint64_t N = deref(params.inputGradDesc).GetLengths()[0]; + uint64_t C = deref(params.inputGradDesc).GetLengths()[1]; + uint64_t H = deref(params.inputGradDesc).GetLengths()[2]; + uint64_t W = deref(params.inputGradDesc).GetLengths()[3]; + uint64_t OH = deref(params.outputGradDesc).GetLengths()[2]; + uint64_t OW = deref(params.outputGradDesc).GetLengths()[3]; kernel(params.output_grad, params.input_grad, diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp index d2073f4304..4918f2c970 100644 --- 
a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -87,7 +87,7 @@ ConvSolution AdaptiveAvgPoolBackward3d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); auto dtype = problem.GetInputGradDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -112,14 +112,14 @@ ConvSolution AdaptiveAvgPoolBackward3d::GetSolution( auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc)); auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc)); - auto N = deref(params.inputGradDesc).GetLengths()[0]; - auto C = deref(params.inputGradDesc).GetLengths()[1]; - auto D = deref(params.inputGradDesc).GetLengths()[2]; - auto H = deref(params.inputGradDesc).GetLengths()[3]; - auto W = deref(params.inputGradDesc).GetLengths()[4]; - auto OD = deref(params.outputGradDesc).GetLengths()[2]; - auto OH = deref(params.outputGradDesc).GetLengths()[3]; - auto OW = deref(params.outputGradDesc).GetLengths()[4]; + uint64_t N = deref(params.inputGradDesc).GetLengths()[0]; + uint64_t C = deref(params.inputGradDesc).GetLengths()[1]; + uint64_t D = deref(params.inputGradDesc).GetLengths()[2]; + uint64_t H = deref(params.inputGradDesc).GetLengths()[3]; + uint64_t W = deref(params.inputGradDesc).GetLengths()[4]; + uint64_t OD = deref(params.outputGradDesc).GetLengths()[2]; + uint64_t OH = 
deref(params.outputGradDesc).GetLengths()[3]; + uint64_t OW = deref(params.outputGradDesc).GetLengths()[4]; kernel(params.output_grad, params.input_grad, diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp index 1dc63c5858..f50bd5a56f 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -84,7 +84,7 @@ ConvSolution AdaptiveAvgPoolForward1d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); auto dtype = problem.GetOutputDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -109,10 +109,10 @@ ConvSolution AdaptiveAvgPoolForward1d::GetSolution( auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc)); auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc)); - size_t N = deref(params.inputDesc).GetLengths()[0]; - size_t C = deref(params.inputDesc).GetLengths()[1]; - size_t H = deref(params.inputDesc).GetLengths()[2]; - size_t OH = deref(params.outputDesc).GetLengths()[2]; + uint64_t N = deref(params.inputDesc).GetLengths()[0]; + uint64_t C = deref(params.inputDesc).GetLengths()[1]; + uint64_t H = deref(params.inputDesc).GetLengths()[2]; + uint64_t OH = deref(params.outputDesc).GetLengths()[2]; kernel(params.input, params.output, N, C, H, OH, input_tv, output_tv); }; diff --git 
a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp index 623485634a..ff62625dcd 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -90,7 +90,7 @@ ConvSolution AdaptiveAvgPoolForward2d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); auto dtype = problem.GetOutputDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -115,12 +115,12 @@ ConvSolution AdaptiveAvgPoolForward2d::GetSolution( auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); - size_t N = deref(params.inputDesc).GetLengths()[0]; - size_t C = deref(params.inputDesc).GetLengths()[1]; - size_t H = deref(params.inputDesc).GetLengths()[2]; - size_t W = deref(params.inputDesc).GetLengths()[3]; - size_t OH = deref(params.outputDesc).GetLengths()[2]; - size_t OW = deref(params.outputDesc).GetLengths()[3]; + uint64_t N = deref(params.inputDesc).GetLengths()[0]; + uint64_t C = deref(params.inputDesc).GetLengths()[1]; + uint64_t H = deref(params.inputDesc).GetLengths()[2]; + uint64_t W = deref(params.inputDesc).GetLengths()[3]; + uint64_t OH = deref(params.outputDesc).GetLengths()[2]; + uint64_t OW = deref(params.outputDesc).GetLengths()[3]; kernel(params.input, params.output, N, C, H, W, OH, OW, input_tv, 
output_tv); }; diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp index b4081849eb..2c31e96f24 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -90,7 +90,7 @@ ConvSolution AdaptiveAvgPoolForward3d::GetSolution( auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); auto dtype = problem.GetOutputDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -115,14 +115,14 @@ ConvSolution AdaptiveAvgPoolForward3d::GetSolution( auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc)); auto output_tv = get_inner_expanded_tv<5>(deref(params.outputDesc)); - auto N = deref(params.inputDesc).GetLengths()[0]; - auto C = deref(params.inputDesc).GetLengths()[1]; - auto D = deref(params.inputDesc).GetLengths()[2]; - auto H = deref(params.inputDesc).GetLengths()[3]; - auto W = deref(params.inputDesc).GetLengths()[4]; - auto OD = deref(params.outputDesc).GetLengths()[2]; - auto OH = deref(params.outputDesc).GetLengths()[3]; - auto OW = deref(params.outputDesc).GetLengths()[4]; + uint64_t N = deref(params.inputDesc).GetLengths()[0]; + uint64_t C = deref(params.inputDesc).GetLengths()[1]; + uint64_t D = deref(params.inputDesc).GetLengths()[2]; + uint64_t H = deref(params.inputDesc).GetLengths()[3]; + uint64_t W = deref(params.inputDesc).GetLengths()[4]; + uint64_t 
OD = deref(params.outputDesc).GetLengths()[2]; + uint64_t OH = deref(params.outputDesc).GetLengths()[3]; + uint64_t OW = deref(params.outputDesc).GetLengths()[4]; kernel(params.input, params.output, N, C, D, H, W, OD, OH, OW, input_tv, output_tv); }; diff --git a/test/cpu_adaptiveavgpool.hpp b/test/cpu_adaptiveavgpool.hpp index 4b6dd99dda..955cdbb3b7 100644 --- a/test/cpu_adaptiveavgpool.hpp +++ b/test/cpu_adaptiveavgpool.hpp @@ -23,11 +23,11 @@ * SOFTWARE. * *******************************************************************************/ -#ifndef GUARD_CPU_AVGPOOL_HPP -#define GUARD_CPU_AVGPOOL_HPP +#pragma once #include "tensor_holder.hpp" #include +#include "ford.hpp" template void cpu_adaptiveavgpool_forward_1d( @@ -39,14 +39,10 @@ void cpu_adaptiveavgpool_forward_1d( auto input_tv = miopen::get_inner_expanded_tv<3>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<3>(output.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nc = gid / OH, oh = gid % OH; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; @@ -57,7 +53,7 @@ void cpu_adaptiveavgpool_forward_1d( } output[output_tv.get_tensor_view_idx({n, c, oh})] = static_cast(sum / kh); - } + }); } template @@ -76,15 +72,11 @@ void cpu_adaptiveavgpool_forward_2d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncoh = gid / OW, ow = gid % OW; size_t nc = ncoh / OH, oh = ncoh % OH; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; @@ -102,7 +94,7 @@ void cpu_adaptiveavgpool_forward_2d(tensor input, } 
output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(sum / divider); - } + }); } template @@ -123,16 +115,12 @@ void cpu_adaptiveavgpool_forward_3d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncodoh = gid / OW, ow = gid % OW; size_t ncod = ncodoh / OH, oh = ncodoh % OH; size_t nc = ncod / OD, od = ncod % OD; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t d = static_cast(std::floor(static_cast(od * D) / OD)); size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; @@ -157,7 +145,7 @@ void cpu_adaptiveavgpool_forward_3d(tensor input, output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = static_cast(sum / (kd * kh * kw)); - } + }); } template @@ -170,14 +158,10 @@ void cpu_adaptiveavgpool_backward_1d( auto output_grad_tv = miopen::get_inner_expanded_tv<3>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<3>(input_grad.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nc = gid / H, h = gid % H; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -192,7 +176,7 @@ void cpu_adaptiveavgpool_backward_1d( } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = static_cast(grad); - } + }); } template @@ -211,15 +195,11 @@ void cpu_adaptiveavgpool_backward_2d(tensor output_grad, auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t nch = gid / W, w = gid % W; size_t nc = nch / H, h = nch % H; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t oh 
= static_cast(std::floor(static_cast(h * OH) / H)); size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; @@ -242,7 +222,7 @@ void cpu_adaptiveavgpool_backward_2d(tensor output_grad, } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); - } + }); } template @@ -263,16 +243,12 @@ void cpu_adaptiveavgpool_backward_3d(tensor output_grad, auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); - for(size_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](size_t gid) { size_t ncdh = gid / W, w = gid % W; size_t ncd = ncdh / H, h = ncdh % H; size_t nc = ncd / D, d = ncd % D; size_t n = nc / C, c = nc % C; - if(n >= N) - return; - size_t od = static_cast(std::floor(static_cast(d * OD) / D)); size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; @@ -305,7 +281,5 @@ void cpu_adaptiveavgpool_backward_3d(tensor output_grad, } input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); - } + }); } - -#endif diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index 7f01813331..d7d493ed27 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -232,7 +232,6 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam(output_dev, output.data.size()); @@ -245,7 +244,8 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam::epsilon(); auto error = miopen::rms_range(ref_input_grad, input_grad); ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); - EXPECT_LT(error, threshold * 10); + EXPECT_LT(error, threshold * 10) + << "Error backward Input Gradient beyond 10xthreshold : " << error + << " Tolerance: " << threshold * 10; } AdaptiveAvgPoolTestCase adaptiveavgpool_config; From 796adb177aa9b2862a4193c2d9090427c73fd3d7 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 14:36:25 +0700 
Subject: [PATCH 23/38] rm large number of cast in kernel --- src/adaptiveavgpool/problem_description.cpp | 12 ++-- src/kernels/MIOpenAdaptiveAvgPool.cpp | 67 +++++++++------------ 2 files changed, 34 insertions(+), 45 deletions(-) diff --git a/src/adaptiveavgpool/problem_description.cpp b/src/adaptiveavgpool/problem_description.cpp index ec3b9cf636..148a67e299 100644 --- a/src/adaptiveavgpool/problem_description.cpp +++ b/src/adaptiveavgpool/problem_description.cpp @@ -47,10 +47,8 @@ inline std::ostream& operator<<(std::ostream& os, const std::vector& v) NetworkConfig FwdProblemDescription::MakeNetworkConfig() const { - auto input_size = inputDesc.GetLengths(); - auto output_size = outputDesc.GetLengths(); - auto input_stride = inputDesc.GetStrides(); - auto output_stride = outputDesc.GetStrides(); + auto input_size = inputDesc.GetLengths(); + auto output_size = outputDesc.GetLengths(); auto input_dtype = inputDesc.GetType(); @@ -60,8 +58,7 @@ NetworkConfig FwdProblemDescription::MakeNetworkConfig() const ss << "-input_dtype" << input_dtype; ss << "-Is" << input_size; ss << "-Os" << output_size; - ss << "-Si" << input_stride; - ss << "-So" << output_stride; + ss << "-Con" << IsAllContiguous(); return NetworkConfig{ss.str()}; } @@ -81,8 +78,7 @@ NetworkConfig BwdProblemDescription::MakeNetworkConfig() const ss << "-input_dtype" << input_dtype; ss << "-dIs" << input_grad_size; ss << "-dOs" << output_grad_size; - ss << "-dSi" << input_grad_stride; - ss << "-dSo" << output_grad_stride; + ss << "-Con" << IsAllContiguous(); return NetworkConfig{ss.str()}; } diff --git a/src/kernels/MIOpenAdaptiveAvgPool.cpp b/src/kernels/MIOpenAdaptiveAvgPool.cpp index 8d26ea0301..273ec99087 100644 --- a/src/kernels/MIOpenAdaptiveAvgPool.cpp +++ b/src/kernels/MIOpenAdaptiveAvgPool.cpp @@ -47,8 +47,8 @@ __device__ void adaptiveAvgPoolForward1d(const TI* __restrict__ input, if(n >= N) return; - uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); - uint64_t kh = 
static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = oh * H / OH; + uint64_t kh = (((oh + 1) * H + OH - 1) / OH) - h; FLOAT_ACCUM sum = 0; for(uint64_t ih = h; ih < (h + kh); ++ih) @@ -86,14 +86,13 @@ __device__ void adaptiveAvgPoolBackward1d(const TI* __restrict__ output_grad, if(n >= N) return; - uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); - uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = (h * OH) / H; + uint64_t koh = (((h + 1) * OH + H - 1) / H) - oh; FLOAT_ACCUM grad = 0; for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - uint64_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); + uint64_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / kh; } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h})] = CVT_ACCUM2FLOAT(grad); @@ -132,11 +131,11 @@ __device__ void adaptiveAvgPoolForward2d(const TI* __restrict__ input, if(n >= N) return; - uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); - uint64_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = (oh * H) / OH; + uint64_t kh = (((oh + 1) * H + OH - 1) / OH) - h; - uint64_t w = static_cast(floor(static_cast(ow * W) / OW)); - uint64_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; + uint64_t w = (ow * W) / OW; + uint64_t kw = (((ow + 1) * W + OW - 1) / OW) - w; FLOAT_ACCUM divider = static_cast(kh * kw); FLOAT_ACCUM sum = 0; @@ -185,21 +184,19 @@ __device__ void adaptiveAvgPoolBackward2d(const TI* __restrict__ output_grad, if(n >= N) return; - uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); - uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = (h * OH) / H; + uint64_t koh = ((h + 1) * OH + H - 1) / H - oh; - uint64_t ow = static_cast(floor(static_cast(w * OW) / W)); - uint64_t kow = static_cast(ceil(static_cast((w 
+ 1) * OW) / W)) - ow; + uint64_t ow = (w * OW) / W; + uint64_t kow = ((w + 1) * OW + W - 1) / W - ow; FLOAT_ACCUM grad = 0; for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - uint64_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); + uint64_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(uint64_t iw = ow; iw < (ow + kow); ++iw) { - uint64_t kw = static_cast(ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(floor(static_cast(iw * W) / OW)); + uint64_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / (kh * kw); @@ -246,14 +243,14 @@ __device__ void adaptiveAvgPoolForward3d(const TI* __restrict__ input, if(n >= N) return; - uint64_t d = static_cast(floor(static_cast(od * D) / OD)); - uint64_t kd = static_cast(ceil(static_cast((od + 1) * D) / OD)) - d; + uint64_t d = (od * D) / OD; + uint64_t kd = ((od + 1) * D + OD - 1) / OD - d; - uint64_t h = static_cast(floor(static_cast(oh * H) / OH)); - uint64_t kh = static_cast(ceil(static_cast((oh + 1) * H) / OH)) - h; + uint64_t h = (oh * H) / OH; + uint64_t kh = ((oh + 1) * H + OH - 1) / OH - h; - uint64_t w = static_cast(floor(static_cast(ow * W) / OW)); - uint64_t kw = static_cast(ceil(static_cast((ow + 1) * W) / OW)) - w; + uint64_t w = (ow * W) / OW; + uint64_t kw = ((ow + 1) * W + OW - 1) / OW - w; FLOAT_ACCUM sum = 0; for(uint64_t id = d; id < (d + kd); ++id) @@ -311,29 +308,25 @@ __device__ void adaptiveAvgPoolBackward3d(const TI* __restrict__ output_grad, if(n >= N) return; - uint64_t od = static_cast(floor(static_cast(d * OD) / D)); - uint64_t kod = static_cast(ceil(static_cast((d + 1) * OD) / D)) - od; + uint64_t od = (d * OD) / D; + uint64_t kod = ((d + 1) * OD + D - 1) / D - od; - uint64_t oh = static_cast(floor(static_cast(h * OH) / H)); - uint64_t koh = static_cast(ceil(static_cast((h + 1) * OH) / H)) - oh; + uint64_t oh = (h * OH) / H; + uint64_t koh = 
((h + 1) * OH + H - 1) / H - oh; - uint64_t ow = static_cast(floor(static_cast(w * OW) / W)); - uint64_t kow = static_cast(ceil(static_cast((w + 1) * OW) / W)) - ow; + uint64_t ow = (w * OW) / W; + uint64_t kow = ((w + 1) * OW + W - 1) / W - ow; FLOAT_ACCUM grad = 0; for(uint64_t id = od; id < (od + kod); ++id) { - uint64_t kd = static_cast(ceil(static_cast((id + 1) * D) / OD)) - - static_cast(floor(static_cast(id * D) / OD)); + uint64_t kd = ((id + 1) * D + OD - 1) / OD - (id * D) / OD; for(uint64_t ih = oh; ih < (oh + koh); ++ih) { - uint64_t kh = static_cast(ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(floor(static_cast(ih * H) / OH)); + uint64_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(uint64_t iw = ow; iw < (ow + kow); ++iw) { - uint64_t kw = - static_cast(ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(floor(static_cast(iw * W) / OW)); + uint64_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += CVT_FLOAT2ACCUM( output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / (kd * kh * kw); From 93985d4003d40389f105253824b56fba60b2664f Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 17:33:35 +0700 Subject: [PATCH 24/38] resolved comments --- driver/adaptiveavgpool_driver.hpp | 3 +++ test/gtest/adaptiveavgpool.hpp | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index 65badc5fa2..f179ff5b5b 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -261,7 +261,10 @@ int AdaptiveAvgPoolDriver::AllocateBuffersAndCopy() status |= output_grad_dev->ToGPU(q, output_grad.data()); if(status != 0) + { std::cout << "Error copying data to GPU\n" << std::endl; + return miopenStatusAllocFailed; + } return miopenStatusSuccess; } diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index d7d493ed27..d4e5f1829e 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ 
b/test/gtest/adaptiveavgpool.hpp @@ -228,7 +228,7 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam(input, ref_output, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_forward_3d(input, ref_output, N, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolForward( handle, input.desc, input_dev.get(), output.desc, output_dev.get()); @@ -346,8 +346,7 @@ struct AdaptiveAvgPoolTestBwd : public ::testing::TestWithParam( - output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_backward_3d(output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolBackward( handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get()); From 7795a19d84fc3f2136d12fb10031fb1c60026543 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 17:41:03 +0700 Subject: [PATCH 25/38] add T --- test/gtest/adaptiveavgpool.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index d4e5f1829e..58b82cdf4e 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -220,15 +220,15 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam(input, ref_output, N, C, H, OH); } else if(dims == 4) { - cpu_adaptiveavgpool_forward_2d(input, ref_output, N, C, H, W, OH, OW); + cpu_adaptiveavgpool_forward_2d(input, ref_output, N, C, H, W, OH, OW); } else if(dims == 5) { - cpu_adaptiveavgpool_forward_3d(input, ref_output, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_forward_3d(input, ref_output, N, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolForward( handle, input.desc, input_dev.get(), output.desc, output_dev.get()); @@ -338,15 +338,16 @@ struct AdaptiveAvgPoolTestBwd : public ::testing::TestWithParam(output_grad, ref_input_grad, N, C, H, OH); } else if(dims == 4) { - cpu_adaptiveavgpool_backward_2d(output_grad, 
ref_input_grad, N, C, H, W, OH, OW); + cpu_adaptiveavgpool_backward_2d(output_grad, ref_input_grad, N, C, H, W, OH, OW); } else if(dims == 5) { - cpu_adaptiveavgpool_backward_3d(output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_backward_3d( + output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolBackward( handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get()); From 58bcf1e1a161714156a46a786d728579c9a7092f Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 7 Oct 2024 18:28:35 +0700 Subject: [PATCH 26/38] reorder --- include/miopen/miopen.h | 82 ++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index a1dcc49bd4..cae252b9a2 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7826,47 +7826,6 @@ MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, // CLOSEOUT RELU DOXYGEN GROUP #endif // MIOPEN_BETA_API -#ifdef MIOPEN_BETA_API -// adaptiveavgpool APIs -/** @addtogroup adaptiveavgpool - * - * @{ - */ - -/*! @brief Execute an adaptiveavgpool forward layer - * - * @param handle MIOpen handle (input) - * @param inputDesc Tensor descriptor for input tensor (input) - * @param input Data tensor input (input) - * @param outputDesc Tensor descriptor for output tensor (input) - * @param output Data tensor output (output) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t miopenAdaptiveAvgPoolForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output); - -/*! 
@brief Execute an adaptiveavgpool backward layer - * - * @param handle MIOpen handle (input) - * @param outputGradDesc Tensor descriptor for output grad tensor (input) - * @param output_grad Data tensor output grad (input) - * @param inputGradDesc Tensor descriptor for input grad tensor (input) - * @param input_grad Data tensor input grad (output) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t -miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t outputGradDesc, - const void* output_grad, - const miopenTensorDescriptor_t inputGradDesc, - void* input_grad); -/** @} */ -// CLOSEOUT adaptiveavgpool DOXYGEN GROUP -#endif // MIOPEN_BETA_API - #ifdef MIOPEN_BETA_API /*! @ingroup LossFunction @@ -7963,6 +7922,47 @@ MIOPEN_EXPORT miopenStatus_t miopenSoftMarginLossBackward(miopenHandle_t handle, // CLOSEOUT LossFunction DOXYGEN GROUP #endif +#ifdef MIOPEN_BETA_API +// adaptiveavgpool APIs +/** @addtogroup adaptiveavgpool + * + * @{ + */ + +/*! @brief Execute an adaptiveavgpool forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAdaptiveAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output); + +/*! 
@brief Execute an adaptiveavgpool backward layer + * + * @param handle MIOpen handle (input) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param output_grad Data tensor output grad (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param input_grad Data tensor input grad (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad); +/** @} */ +// CLOSEOUT adaptiveavgpool DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif From 335f1d596a89598b0ec3cadf32054656a55327cb Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 8 Oct 2024 10:38:23 +0700 Subject: [PATCH 27/38] small fix --- src/include/miopen/solver_id.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index a2b5a4214f..8559acaf49 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -63,7 +63,8 @@ enum class Primitive ReLU, Kthvalue, SoftMarginLoss, - MultiMarginLoss AdaptiveAvgPool, + MultiMarginLoss, + AdaptiveAvgPool, }; struct MIOPEN_INTERNALS_EXPORT Id From 702443bdf356932bfbf2eb1f138367951c93a1e6 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 16 Oct 2024 13:58:51 +0700 Subject: [PATCH 28/38] rerun CI --- test/gtest/adaptiveavgpool.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/gtest/adaptiveavgpool.cpp b/test/gtest/adaptiveavgpool.cpp index e12e327500..2a49a2d7eb 100644 --- a/test/gtest/adaptiveavgpool.cpp +++ b/test/gtest/adaptiveavgpool.cpp @@ -50,13 +50,13 @@ TEST_P(GPU_AdaptiveAvgpool_fwd_BFP16, AdaptiveAvgPoolTestFwd) Verify(); }; -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_fwd_FP32, 
testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp32())); -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_fwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp16())); -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_fwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdBfp16())); @@ -83,12 +83,12 @@ TEST_P(GPU_AdaptiveAvgpool_bwd_BFP16, AdaptiveAvgPoolTestBwd) Verify(); }; -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_bwd_FP32, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp32())); -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_bwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp16())); -INSTANTIATE_TEST_SUITE_P(Smoke, +INSTANTIATE_TEST_SUITE_P(Full, GPU_AdaptiveAvgpool_bwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdBfp16())); From bd72a3ef4b851769ffc168937480bb2be2ec6978 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 17 Oct 2024 15:45:37 +0700 Subject: [PATCH 29/38] rerun CI --- test/gtest/adaptiveavgpool.cpp | 36 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/test/gtest/adaptiveavgpool.cpp b/test/gtest/adaptiveavgpool.cpp index 2a49a2d7eb..b36051f40a 100644 --- a/test/gtest/adaptiveavgpool.cpp +++ b/test/gtest/adaptiveavgpool.cpp @@ -28,67 +28,67 @@ using float16 = half_float::half; // FORWARD TEST -using GPU_AdaptiveAvgpool_fwd_FP32 = AdaptiveAvgPoolTestFwd; -using GPU_AdaptiveAvgpool_fwd_FP16 = AdaptiveAvgPoolTestFwd; -using GPU_AdaptiveAvgpool_fwd_BFP16 = AdaptiveAvgPoolTestFwd; +using GPU_AdaptiveAvgPool_fwd_FP32 = AdaptiveAvgPoolTestFwd; +using GPU_AdaptiveAvgPool_fwd_FP16 = AdaptiveAvgPoolTestFwd; +using GPU_AdaptiveAvgPool_fwd_BFP16 = AdaptiveAvgPoolTestFwd; -TEST_P(GPU_AdaptiveAvgpool_fwd_FP32, AdaptiveAvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgPool_fwd_FP32, AdaptiveAvgPoolTestFwd) { RunTest(); Verify(); }; 
-TEST_P(GPU_AdaptiveAvgpool_fwd_FP16, AdaptiveAvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgPool_fwd_FP16, AdaptiveAvgPoolTestFwd) { RunTest(); Verify(); }; -TEST_P(GPU_AdaptiveAvgpool_fwd_BFP16, AdaptiveAvgPoolTestFwd) +TEST_P(GPU_AdaptiveAvgPool_fwd_BFP16, AdaptiveAvgPoolTestFwd) { RunTest(); Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_fwd_FP32, + GPU_AdaptiveAvgPool_fwd_FP32, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp32())); INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_fwd_FP16, + GPU_AdaptiveAvgPool_fwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp16())); INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_fwd_BFP16, + GPU_AdaptiveAvgPool_fwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdBfp16())); // BACKWARD TEST -using GPU_AdaptiveAvgpool_bwd_FP32 = AdaptiveAvgPoolTestBwd; -using GPU_AdaptiveAvgpool_bwd_FP16 = AdaptiveAvgPoolTestBwd; -using GPU_AdaptiveAvgpool_bwd_BFP16 = AdaptiveAvgPoolTestBwd; +using GPU_AdaptiveAvgPool_bwd_FP32 = AdaptiveAvgPoolTestBwd; +using GPU_AdaptiveAvgPool_bwd_FP16 = AdaptiveAvgPoolTestBwd; +using GPU_AdaptiveAvgPool_bwd_BFP16 = AdaptiveAvgPoolTestBwd; -TEST_P(GPU_AdaptiveAvgpool_bwd_FP32, AdaptiveAvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgPool_bwd_FP32, AdaptiveAvgPoolTestBwd) { RunTest(); Verify(); }; -TEST_P(GPU_AdaptiveAvgpool_bwd_FP16, AdaptiveAvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgPool_bwd_FP16, AdaptiveAvgPoolTestBwd) { RunTest(); Verify(); }; -TEST_P(GPU_AdaptiveAvgpool_bwd_BFP16, AdaptiveAvgPoolTestBwd) +TEST_P(GPU_AdaptiveAvgPool_bwd_BFP16, AdaptiveAvgPoolTestBwd) { RunTest(); Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_bwd_FP32, + GPU_AdaptiveAvgPool_bwd_FP32, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp32())); INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_bwd_FP16, + GPU_AdaptiveAvgPool_bwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp16())); INSTANTIATE_TEST_SUITE_P(Full, - GPU_AdaptiveAvgpool_bwd_BFP16, + 
GPU_AdaptiveAvgPool_bwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdBfp16())); From f58fb6a15f4dc1b099f25f558917dff7c1a36036 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 5 Nov 2024 15:50:27 +0700 Subject: [PATCH 30/38] rerun CI --- driver/adaptiveavgpool_driver.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index f179ff5b5b..cecb5be58b 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -464,7 +464,6 @@ int AdaptiveAvgPoolDriver::VerifyForward() std::cout << "Forward AdaptiveAvgPool Verifies on CPU and GPU (err=" << error << ")" << std::endl; } - return miopenStatusSuccess; } From 16c5eb279cc14ed1d65d28c54dcff44db2b5b3e4 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 6 Nov 2024 11:20:41 +0700 Subject: [PATCH 31/38] rerun CI --- src/adaptiveavgpool/problem_description.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/adaptiveavgpool/problem_description.cpp b/src/adaptiveavgpool/problem_description.cpp index 148a67e299..61a07272ce 100644 --- a/src/adaptiveavgpool/problem_description.cpp +++ b/src/adaptiveavgpool/problem_description.cpp @@ -49,8 +49,7 @@ NetworkConfig FwdProblemDescription::MakeNetworkConfig() const { auto input_size = inputDesc.GetLengths(); auto output_size = outputDesc.GetLengths(); - - auto input_dtype = inputDesc.GetType(); + auto input_dtype = inputDesc.GetType(); std::ostringstream ss; @@ -69,8 +68,7 @@ NetworkConfig BwdProblemDescription::MakeNetworkConfig() const auto output_grad_size = outputGradDesc.GetLengths(); auto input_grad_stride = inputGradDesc.GetStrides(); auto output_grad_stride = outputGradDesc.GetStrides(); - - auto input_dtype = inputGradDesc.GetType(); + auto input_dtype = inputGradDesc.GetType(); std::ostringstream ss; From 297e759f626b9544792912d0eb0aa745bcb3f8f1 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 22 Nov 2024 10:40:59 +0700 Subject: [PATCH 32/38] 
fix CICD --- driver/adaptiveavgpool_driver.hpp | 10 ++-- driver/mloAdaptiveAvgPoolHost.hpp | 66 ++++++++++----------- src/adaptiveavgpool/problem_description.cpp | 4 +- test/cpu_adaptiveavgpool.hpp | 17 ++---- test/gtest/adaptiveavgpool.hpp | 13 ++-- 5 files changed, 46 insertions(+), 64 deletions(-) diff --git a/driver/adaptiveavgpool_driver.hpp b/driver/adaptiveavgpool_driver.hpp index cecb5be58b..062c56fdce 100644 --- a/driver/adaptiveavgpool_driver.hpp +++ b/driver/adaptiveavgpool_driver.hpp @@ -323,21 +323,21 @@ int AdaptiveAvgPoolDriver::RunForwardCPU() if(in_dim.size() == 3) { status = mloAdaptiveAvgPoolForward1dRunHost( - inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, OH); + inputDesc, outputDesc, input.data(), output_host.data(), C, H, OH); MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAdaptiveAvgPoolForward1dRunHost"); } else if(in_dim.size() == 4) { status = mloAdaptiveAvgPoolForward2dRunHost( - inputDesc, outputDesc, input.data(), output_host.data(), N, C, H, W, OH, OW); + inputDesc, outputDesc, input.data(), output_host.data(), C, H, W, OH, OW); MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAdaptiveAvgPoolForward2dRunHost"); } else if(in_dim.size() == 5) { status = mloAdaptiveAvgPoolForward3dRunHost( - inputDesc, outputDesc, input.data(), output_host.data(), N, C, D, H, W, OD, OH, OW); + inputDesc, outputDesc, input.data(), output_host.data(), C, D, H, W, OD, OH, OW); MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAdaptiveAvgPoolForward3dRunHost"); } @@ -401,7 +401,7 @@ int AdaptiveAvgPoolDriver::RunBackwardCPU() if(in_dim.size() == 3) { status = mloAdaptiveAvgPoolBackward1dRunHost( - outputGradDesc, inputGradDesc, output_grad.data(), input_grad_host.data(), N, C, H, OH); + outputGradDesc, inputGradDesc, output_grad.data(), input_grad_host.data(), C, H, OH); MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAdaptiveAvgPoolBackward1dRunHost"); } @@ -411,7 +411,6 @@ int 
AdaptiveAvgPoolDriver::RunBackwardCPU() inputGradDesc, output_grad.data(), input_grad_host.data(), - N, C, H, W, @@ -426,7 +425,6 @@ int AdaptiveAvgPoolDriver::RunBackwardCPU() inputGradDesc, output_grad.data(), input_grad_host.data(), - N, C, D, H, diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 38088cf09e..7274408148 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -35,10 +35,9 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD const miopenTensorDescriptor_t outputDesc, const Tgpu* input, Tcheck* output, - size_t N, - size_t C, - size_t H, - size_t OH) + const size_t C, + const size_t H, + const size_t OH) { auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); @@ -69,12 +68,11 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD const miopenTensorDescriptor_t outputDesc, const Tgpu* input, Tcheck* output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW) + const size_t C, + const size_t H, + const size_t W, + const size_t OH, + const size_t OW) { auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); @@ -113,14 +111,13 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD const miopenTensorDescriptor_t outputDesc, const Tgpu* input, Tcheck* output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW) + const size_t C, + const size_t D, + const size_t H, + const size_t W, + const size_t OD, + const size_t OH, + const size_t OW) { auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); @@ -167,10 +164,9 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu const miopenTensorDescriptor_t inputGradDesc, const Tgpu* output_grad, Tcheck* 
input_grad, - size_t N, - size_t C, - size_t H, - size_t OH) + const size_t C, + const size_t H, + const size_t OH) { auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); @@ -204,12 +200,11 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu const miopenTensorDescriptor_t inputGradDesc, const Tgpu* output_grad, Tcheck* input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW) + const size_t C, + const size_t H, + const size_t W, + const size_t OH, + const size_t OW) { auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); @@ -253,14 +248,13 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu const miopenTensorDescriptor_t inputGradDesc, const Tgpu* output_grad, Tcheck* input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW) + const size_t C, + const size_t D, + const size_t H, + const size_t W, + const size_t OD, + const size_t OH, + const size_t OW) { auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); diff --git a/src/adaptiveavgpool/problem_description.cpp b/src/adaptiveavgpool/problem_description.cpp index 61a07272ce..f4ba38231d 100644 --- a/src/adaptiveavgpool/problem_description.cpp +++ b/src/adaptiveavgpool/problem_description.cpp @@ -49,7 +49,7 @@ NetworkConfig FwdProblemDescription::MakeNetworkConfig() const { auto input_size = inputDesc.GetLengths(); auto output_size = outputDesc.GetLengths(); - auto input_dtype = inputDesc.GetType(); + auto input_dtype = inputDesc.GetType(); std::ostringstream ss; @@ -68,7 +68,7 @@ NetworkConfig BwdProblemDescription::MakeNetworkConfig() const auto output_grad_size = outputGradDesc.GetLengths(); auto input_grad_stride = inputGradDesc.GetStrides(); auto output_grad_stride = 
outputGradDesc.GetStrides(); - auto input_dtype = inputGradDesc.GetType(); + auto input_dtype = inputGradDesc.GetType(); std::ostringstream ss; diff --git a/test/cpu_adaptiveavgpool.hpp b/test/cpu_adaptiveavgpool.hpp index 955cdbb3b7..ec3e457bba 100644 --- a/test/cpu_adaptiveavgpool.hpp +++ b/test/cpu_adaptiveavgpool.hpp @@ -31,7 +31,7 @@ template void cpu_adaptiveavgpool_forward_1d( - tensor input, tensor& output, size_t N, size_t C, size_t H, size_t OH) + tensor input, tensor& output, size_t C, size_t H, size_t OH) { auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); @@ -57,14 +57,8 @@ void cpu_adaptiveavgpool_forward_1d( } template -void cpu_adaptiveavgpool_forward_2d(tensor input, - tensor& output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW) +void cpu_adaptiveavgpool_forward_2d( + tensor input, tensor& output, size_t C, size_t H, size_t W, size_t OH, size_t OW) { auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); @@ -100,7 +94,6 @@ void cpu_adaptiveavgpool_forward_2d(tensor input, template void cpu_adaptiveavgpool_forward_3d(tensor input, tensor& output, - size_t N, size_t C, size_t D, size_t H, @@ -150,7 +143,7 @@ void cpu_adaptiveavgpool_forward_3d(tensor input, template void cpu_adaptiveavgpool_backward_1d( - tensor output_grad, tensor& input_grad, size_t N, size_t C, size_t H, size_t OH) + tensor output_grad, tensor& input_grad, size_t C, size_t H, size_t OH) { auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); @@ -182,7 +175,6 @@ void cpu_adaptiveavgpool_backward_1d( template void cpu_adaptiveavgpool_backward_2d(tensor output_grad, tensor& input_grad, - size_t N, size_t C, size_t H, size_t W, @@ -228,7 +220,6 @@ void cpu_adaptiveavgpool_backward_2d(tensor output_grad, template void cpu_adaptiveavgpool_backward_3d(tensor output_grad, tensor& input_grad, - size_t N, size_t C, size_t D, size_t H, diff --git 
a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index 58b82cdf4e..ad2ef3e2d1 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -220,15 +220,15 @@ struct AdaptiveAvgPoolTestFwd : public ::testing::TestWithParam(input, ref_output, N, C, H, OH); + cpu_adaptiveavgpool_forward_1d(input, ref_output, C, H, OH); } else if(dims == 4) { - cpu_adaptiveavgpool_forward_2d(input, ref_output, N, C, H, W, OH, OW); + cpu_adaptiveavgpool_forward_2d(input, ref_output, C, H, W, OH, OW); } else if(dims == 5) { - cpu_adaptiveavgpool_forward_3d(input, ref_output, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_forward_3d(input, ref_output, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolForward( handle, input.desc, input_dev.get(), output.desc, output_dev.get()); @@ -338,16 +338,15 @@ struct AdaptiveAvgPoolTestBwd : public ::testing::TestWithParam(output_grad, ref_input_grad, N, C, H, OH); + cpu_adaptiveavgpool_backward_1d(output_grad, ref_input_grad, C, H, OH); } else if(dims == 4) { - cpu_adaptiveavgpool_backward_2d(output_grad, ref_input_grad, N, C, H, W, OH, OW); + cpu_adaptiveavgpool_backward_2d(output_grad, ref_input_grad, C, H, W, OH, OW); } else if(dims == 5) { - cpu_adaptiveavgpool_backward_3d( - output_grad, ref_input_grad, N, C, D, H, W, OD, OH, OW); + cpu_adaptiveavgpool_backward_3d(output_grad, ref_input_grad, C, D, H, W, OD, OH, OW); } status = miopen::adaptiveavgpool::AdaptiveAvgPoolBackward( handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get()); From 322047c3f160680887fc90e064778f89c6ac2e0c Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 25 Nov 2024 10:44:14 +0700 Subject: [PATCH 33/38] rerun CICD --- test/gtest/adaptiveavgpool.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/gtest/adaptiveavgpool.cpp b/test/gtest/adaptiveavgpool.cpp index b36051f40a..b09c286b15 100644 --- a/test/gtest/adaptiveavgpool.cpp +++ 
b/test/gtest/adaptiveavgpool.cpp @@ -50,13 +50,13 @@ TEST_P(GPU_AdaptiveAvgPool_fwd_BFP16, AdaptiveAvgPoolTestFwd) Verify(); }; -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_fwd_FP32, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp32())); -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_fwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdFp16())); -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_fwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsFwdBfp16())); @@ -83,12 +83,12 @@ TEST_P(GPU_AdaptiveAvgPool_bwd_BFP16, AdaptiveAvgPoolTestBwd) Verify(); }; -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_bwd_FP32, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp32())); -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_bwd_FP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdFp16())); -INSTANTIATE_TEST_SUITE_P(Full, +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_AdaptiveAvgPool_bwd_BFP16, testing::ValuesIn(AdaptiveAvgPoolTestConfigsBwdBfp16())); From be5c06eab1d80f0e7b0c8a5db176ab7bc79447a6 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 16 Jan 2025 11:36:42 +0700 Subject: [PATCH 34/38] small fix CICD --- .gitignore | 5 +++++ src/adaptiveavgpool/problem_description.cpp | 2 +- src/adaptiveavgpool_api.cpp | 2 +- test/gtest/adaptiveavgpool.hpp | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 380c163c3f..a341211e0b 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,8 @@ build*/ # Python cache __pycache__/ + +install_dir/ +.cache/ +.devcontainer/ +.gitignore diff --git a/src/adaptiveavgpool/problem_description.cpp b/src/adaptiveavgpool/problem_description.cpp index f4ba38231d..21800d4cd0 100644 --- a/src/adaptiveavgpool/problem_description.cpp +++ b/src/adaptiveavgpool/problem_description.cpp @@ -35,7 +35,7 @@ namespace adaptiveavgpool { 
inline std::ostream& operator<<(std::ostream& os, const std::vector& v) { os << '{'; - for(int i = 0; i < v.size(); ++i) + for(size_t i = 0; i < v.size(); ++i) { if(i != 0) os << ','; diff --git a/src/adaptiveavgpool_api.cpp b/src/adaptiveavgpool_api.cpp index c183386a6a..0f27507f46 100644 --- a/src/adaptiveavgpool_api.cpp +++ b/src/adaptiveavgpool_api.cpp @@ -33,7 +33,7 @@ inline std::ostream& operator<<(std::ostream& os, const std::vector& v) { os << '{'; - for(int i = 0; i < v.size(); ++i) + for(size_t i = 0; i < v.size(); ++i) { if(i != 0) os << ','; diff --git a/test/gtest/adaptiveavgpool.hpp b/test/gtest/adaptiveavgpool.hpp index ad2ef3e2d1..cf2e1fa5dd 100644 --- a/test/gtest/adaptiveavgpool.hpp +++ b/test/gtest/adaptiveavgpool.hpp @@ -37,7 +37,7 @@ template inline std::ostream& operator<<(std::ostream& os, const std::vector& v) { os << '{'; - for(int i = 0; i < v.size(); ++i) + for(size_t i = 0; i < v.size(); ++i) { if(i != 0) os << ','; From fe4eefc529d9d91b01ef68b1a32d6523e82dc5b5 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 17 Feb 2025 11:15:53 +0700 Subject: [PATCH 35/38] rm floor,ceil --- driver/mloAdaptiveAvgPoolHost.hpp | 67 ++++++++++++++----------------- test/cpu_adaptiveavgpool.hpp | 67 ++++++++++++++----------------- 2 files changed, 60 insertions(+), 74 deletions(-) diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 7274408148..5441b9a0a7 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -49,8 +49,8 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD size_t nc = gid / OH, oh = gid % OH; size_t n = nc / C, c = nc % C; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = oh * H / OH; + size_t kh = (((oh + 1) * H + OH - 1) / OH) - h; float sum = 0; for(size_t ih = h; ih < (h + kh); ++ih) @@ -85,11 +85,11 @@ int32_t 
mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD size_t nc = ncoh / OH, oh = ncoh % OH; size_t n = nc / C, c = nc % C; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = (oh * H) / OH; + size_t kh = (((oh + 1) * H + OH - 1) / OH) - h; - size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); - size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + size_t w = (ow * W) / OW; + size_t kw = (((ow + 1) * W + OW - 1) / OW) - w; float divider = static_cast(kh * kw); float sum = 0; @@ -131,14 +131,14 @@ int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD size_t nc = ncod / OD, od = ncod % OD; size_t n = nc / C, c = nc % C; - size_t d = static_cast(std::floor(static_cast(od * D) / OD)); - size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; + size_t d = (od * D) / OD; + size_t kd = ((od + 1) * D + OD - 1) / OD - d; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = (oh * H) / OH; + size_t kh = ((oh + 1) * H + OH - 1) / OH - h; - size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); - size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + size_t w = (ow * W) / OW; + size_t kw = ((ow + 1) * W + OW - 1) / OW - w; float sum = 0; for(size_t id = d; id < (d + kd); ++id) @@ -178,14 +178,13 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu size_t nc = gid / H, h = gid % H; size_t n = nc / C, c = nc % C; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh = (h * OH) / H; + size_t koh = (((h + 1) * OH + H - 1) / H) - oh; float grad = 0; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 
1) * H) / OH)) - - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / kh; @@ -217,21 +216,19 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu size_t nc = nch / H, h = nch % H; size_t n = nc / C, c = nc % C; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh = (h * OH) / H; + size_t koh = ((h + 1) * OH + H - 1) / H - oh; - size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); - size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + size_t ow = (w * OW) / W; + size_t kow = ((w + 1) * OW + W - 1) / W - ow; float grad = 0; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(size_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(std::floor(static_cast(iw * W) / OW)); + size_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += static_cast( output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / (kh * kw); @@ -268,29 +265,25 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu size_t nc = ncd / D, d = ncd % D; size_t n = nc / C, c = nc % C; - size_t od = static_cast(std::floor(static_cast(d * OD) / D)); - size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; + size_t od = (d * OD) / D; + size_t kod = ((d + 1) * OD + D - 1) / D - od; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh = (h * OH) / H; + size_t koh = ((h + 1) * OH + H - 1) / H - oh; - size_t ow = 
static_cast(std::floor(static_cast(w * OW) / W)); - size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + size_t ow = (w * OW) / W; + size_t kow = ((w + 1) * OW + W - 1) / W - ow; float grad = 0; for(size_t id = od; id < (od + kod); ++id) { - size_t kd = static_cast(std::ceil(static_cast((id + 1) * D) / OD)) - - static_cast(std::floor(static_cast(id * D) / OD)); + size_t kd = ((id + 1) * D + OD - 1) / OD - (id * D) / OD; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(size_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = - static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(std::floor(static_cast(iw * W) / OW)); + size_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += static_cast( output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / diff --git a/test/cpu_adaptiveavgpool.hpp b/test/cpu_adaptiveavgpool.hpp index ec3e457bba..a9d3ed0376 100644 --- a/test/cpu_adaptiveavgpool.hpp +++ b/test/cpu_adaptiveavgpool.hpp @@ -43,8 +43,8 @@ void cpu_adaptiveavgpool_forward_1d( size_t nc = gid / OH, oh = gid % OH; size_t n = nc / C, c = nc % C; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = oh * H / OH; + size_t kh = (((oh + 1) * H + OH - 1) / OH) - h; float sum = 0; for(size_t ih = h; ih < (h + kh); ++ih) @@ -71,11 +71,11 @@ void cpu_adaptiveavgpool_forward_2d( size_t nc = ncoh / OH, oh = ncoh % OH; size_t n = nc / C, c = nc % C; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = (oh * H) / OH; + size_t kh = (((oh + 1) * H + OH - 1) / OH) - h; - size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); - size_t kw = 
static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + size_t w = (ow * W) / OW; + size_t kw = (((ow + 1) * W + OW - 1) / OW) - w; float divider = static_cast(kh * kw); float sum = 0; @@ -114,14 +114,14 @@ void cpu_adaptiveavgpool_forward_3d(tensor input, size_t nc = ncod / OD, od = ncod % OD; size_t n = nc / C, c = nc % C; - size_t d = static_cast(std::floor(static_cast(od * D) / OD)); - size_t kd = static_cast(std::ceil(static_cast((od + 1) * D) / OD)) - d; + size_t d = (od * D) / OD; + size_t kd = ((od + 1) * D + OD - 1) / OD - d; - size_t h = static_cast(std::floor(static_cast(oh * H) / OH)); - size_t kh = static_cast(std::ceil(static_cast((oh + 1) * H) / OH)) - h; + size_t h = (oh * H) / OH; + size_t kh = ((oh + 1) * H + OH - 1) / OH - h; - size_t w = static_cast(std::floor(static_cast(ow * W) / OW)); - size_t kw = static_cast(std::ceil(static_cast((ow + 1) * W) / OW)) - w; + size_t w = (ow * W) / OW; + size_t kw = ((ow + 1) * W + OW - 1) / OW - w; float sum = 0; for(size_t id = d; id < (d + kd); ++id) @@ -155,14 +155,13 @@ void cpu_adaptiveavgpool_backward_1d( size_t nc = gid / H, h = gid % H; size_t n = nc / C, c = nc % C; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh = (h * OH) / H; + size_t koh = (((h + 1) * OH + H - 1) / H) - oh; float grad = 0; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih})]) / kh; @@ -192,21 +191,19 @@ void cpu_adaptiveavgpool_backward_2d(tensor output_grad, size_t nc = nch / H, h = nch % H; size_t n = nc / C, c = nc % C; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh 
= (h * OH) / H; + size_t koh = ((h + 1) * OH + H - 1) / H - oh; - size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); - size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + size_t ow = (w * OW) / W; + size_t kow = ((w + 1) * OW + W - 1) / W - ow; float grad = 0; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) - - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(size_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(std::floor(static_cast(iw * W) / OW)); + size_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += static_cast( output_grad[output_grad_tv.get_tensor_view_idx({n, c, ih, iw})]) / (kh * kw); @@ -240,29 +237,25 @@ void cpu_adaptiveavgpool_backward_3d(tensor output_grad, size_t nc = ncd / D, d = ncd % D; size_t n = nc / C, c = nc % C; - size_t od = static_cast(std::floor(static_cast(d * OD) / D)); - size_t kod = static_cast(std::ceil(static_cast((d + 1) * OD) / D)) - od; + size_t od = (d * OD) / D; + size_t kod = ((d + 1) * OD + D - 1) / D - od; - size_t oh = static_cast(std::floor(static_cast(h * OH) / H)); - size_t koh = static_cast(std::ceil(static_cast((h + 1) * OH) / H)) - oh; + size_t oh = (h * OH) / H; + size_t koh = ((h + 1) * OH + H - 1) / H - oh; - size_t ow = static_cast(std::floor(static_cast(w * OW) / W)); - size_t kow = static_cast(std::ceil(static_cast((w + 1) * OW) / W)) - ow; + size_t ow = (w * OW) / W; + size_t kow = ((w + 1) * OW + W - 1) / W - ow; float grad = 0; for(size_t id = od; id < (od + kod); ++id) { - size_t kd = static_cast(std::ceil(static_cast((id + 1) * D) / OD)) - - static_cast(std::floor(static_cast(id * D) / OD)); + size_t kd = ((id + 1) * D + OD - 1) / OD - (id * D) / OD; for(size_t ih = oh; ih < (oh + koh); ++ih) { - size_t kh = static_cast(std::ceil(static_cast((ih + 1) * H) / OH)) 
- - static_cast(std::floor(static_cast(ih * H) / OH)); + size_t kh = ((ih + 1) * H + OH - 1) / OH - (ih * H) / OH; for(size_t iw = ow; iw < (ow + kow); ++iw) { - size_t kw = - static_cast(std::ceil(static_cast((iw + 1) * W) / OW)) - - static_cast(std::floor(static_cast(iw * W) / OW)); + size_t kw = ((iw + 1) * W + OW - 1) / OW - (iw * W) / OW; grad += static_cast( output_grad[output_grad_tv.get_tensor_view_idx({n, c, id, ih, iw})]) / From 949988b6b57e9cfa1088b12215fb2afc5a61e1a9 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 17 Feb 2025 17:14:40 +0700 Subject: [PATCH 36/38] rm unused --- driver/mloAdaptiveAvgPoolHost.hpp | 6 ------ src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp | 4 ++++ src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp | 4 ++++ src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp | 4 ++++ src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp | 4 ++++ src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp | 4 ++++ src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp | 4 ++++ test/cpu_adaptiveavgpool.hpp | 6 ------ 8 files changed, 24 insertions(+), 12 deletions(-) diff --git a/driver/mloAdaptiveAvgPoolHost.hpp b/driver/mloAdaptiveAvgPoolHost.hpp index 5441b9a0a7..73848ca38f 100644 --- a/driver/mloAdaptiveAvgPoolHost.hpp +++ b/driver/mloAdaptiveAvgPoolHost.hpp @@ -39,7 +39,6 @@ int32_t mloAdaptiveAvgPoolForward1dRunHost(const miopenTensorDescriptor_t inputD const size_t H, const size_t OH) { - auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(inputDesc)); @@ -74,7 +73,6 @@ int32_t mloAdaptiveAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputD const size_t OH, const size_t OW) { - auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); @@ -119,7 +117,6 @@ 
int32_t mloAdaptiveAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputD const size_t OH, const size_t OW) { - auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); @@ -168,7 +165,6 @@ int32_t mloAdaptiveAvgPoolBackward1dRunHost(const miopenTensorDescriptor_t outpu const size_t H, const size_t OH) { - auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(outputGradDesc)); @@ -205,7 +201,6 @@ int32_t mloAdaptiveAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outpu const size_t OH, const size_t OW) { - auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); @@ -253,7 +248,6 @@ int32_t mloAdaptiveAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outpu const size_t OH, const size_t OW) { - auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp index 700029db10..1552ac8385 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_1d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { +namespace { + bool IsOverRocmBwd1d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { if(!problem.IsAllContiguous()) @@ -61,6 +63,8 @@ bool IsOverRocmBwd1d(const miopen::adaptiveavgpool::BwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolBackward1d::IsApplicable( 
const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp index 8d3e78eb27..46dcef3e88 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_2d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { +namespace { + bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { if(problem.IsAllContiguous()) @@ -77,6 +79,8 @@ bool IsOverRocmBwd2d(const miopen::adaptiveavgpool::BwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolBackward2d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { diff --git a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp index 4918f2c970..c16603a530 100644 --- a/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/backward_adaptiveavgpool_3d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { +namespace { + bool IsOverRocmBwd3d(const miopen::adaptiveavgpool::BwdProblemDescription& problem) { if(!problem.IsAllContiguous()) @@ -58,6 +60,8 @@ bool IsOverRocmBwd3d(const miopen::adaptiveavgpool::BwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolBackward3d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::BwdProblemDescription& problem) const { diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp index f50bd5a56f..f31d80c8be 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_1d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { 
+namespace { + bool IsOverRocmFwd1d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { auto in_nelems = problem.GetInputDesc().GetLengths()[-1]; @@ -56,6 +58,8 @@ bool IsOverRocmFwd1d(const miopen::adaptiveavgpool::FwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolForward1d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp index ff62625dcd..344071a3a4 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_2d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { +namespace { + bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { auto in_nelems = problem.GetInputDesc().GetElementSize(); @@ -62,6 +64,8 @@ bool IsOverRocmFwd2d(const miopen::adaptiveavgpool::FwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolForward2d::IsApplicable( const ExecutionContext&, const miopen::adaptiveavgpool::FwdProblemDescription& problem) const { diff --git a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp index 2c31e96f24..3c4fcf552f 100644 --- a/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp +++ b/src/solver/adaptiveavgpool/forward_adaptiveavgpool_3d.cpp @@ -43,6 +43,8 @@ namespace solver { namespace adaptiveavgpool { +namespace { + bool IsOverRocmFwd3d(const miopen::adaptiveavgpool::FwdProblemDescription& problem) { auto in_nelems = problem.GetInputDesc().GetElementSize(); @@ -62,6 +64,8 @@ bool IsOverRocmFwd3d(const miopen::adaptiveavgpool::FwdProblemDescription& probl return false; } +} // namespace + bool AdaptiveAvgPoolForward3d::IsApplicable( const ExecutionContext&, const 
miopen::adaptiveavgpool::FwdProblemDescription& problem) const { diff --git a/test/cpu_adaptiveavgpool.hpp b/test/cpu_adaptiveavgpool.hpp index a9d3ed0376..462cbda67c 100644 --- a/test/cpu_adaptiveavgpool.hpp +++ b/test/cpu_adaptiveavgpool.hpp @@ -33,7 +33,6 @@ template void cpu_adaptiveavgpool_forward_1d( tensor input, tensor& output, size_t C, size_t H, size_t OH) { - auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<3>(input.desc); @@ -60,7 +59,6 @@ template void cpu_adaptiveavgpool_forward_2d( tensor input, tensor& output, size_t C, size_t H, size_t W, size_t OH, size_t OW) { - auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); @@ -102,7 +100,6 @@ void cpu_adaptiveavgpool_forward_3d(tensor input, size_t OH, size_t OW) { - auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); @@ -145,7 +142,6 @@ template void cpu_adaptiveavgpool_backward_1d( tensor output_grad, tensor& input_grad, size_t C, size_t H, size_t OH) { - auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<3>(output_grad.desc); @@ -180,7 +176,6 @@ void cpu_adaptiveavgpool_backward_2d(tensor output_grad, size_t OH, size_t OW) { - auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); @@ -225,7 +220,6 @@ void cpu_adaptiveavgpool_backward_3d(tensor output_grad, size_t OH, size_t OW) { - auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); From ade653f67d3f5cb1ae7e83360e841fbea8f26272 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 12 Mar 2025 16:38:02 
+0700 Subject: [PATCH 37/38] small fix --- docs/reference/index.rst | 1 - include/miopen/miopen.h | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 835acf89bd..c2b74eabee 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -39,4 +39,3 @@ The MIOpen API library is structured as follows: * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental) * :doc:`Kthvalue <../doxygen/html/group__kthvalue>` (experimental) * :doc:`GLU <../doxygen/html/group__glu>` (experimental) - * :doc:`AdaptiveAvgPool <../doxygen/html/group__adaptiveavgpool>` (experimental) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 54ec3677bc..f67e61d87c 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -72,7 +72,6 @@ * @defgroup ReduceCalculation * @defgroup RotaryPositionalEmbeddings * @defgroup ReLU - * @defgroup adaptiveavgpool * */ @@ -8229,7 +8228,7 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle, #ifdef MIOPEN_BETA_API // adaptiveavgpool APIs -/** @addtogroup adaptiveavgpool +/** @addtogroup pooling * * @{ */ @@ -8265,7 +8264,7 @@ miopenAdaptiveAvgPoolBackward(miopenHandle_t handle, const miopenTensorDescriptor_t inputGradDesc, void* input_grad); /** @} */ -// CLOSEOUT adaptiveavgpool DOXYGEN GROUP +// CLOSEOUT pooling DOXYGEN GROUP #endif // MIOPEN_BETA_API #ifdef __cplusplus From d3bdd4c4c00e555d802b0e25a76fca8ae120d3b8 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 12 Mar 2025 16:38:52 +0700 Subject: [PATCH 38/38] fix gitignore --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index a341211e0b..380c163c3f 100644 --- a/.gitignore +++ b/.gitignore @@ -65,8 +65,3 @@ build*/ # Python cache __pycache__/ - -install_dir/ -.cache/ -.devcontainer/ -.gitignore