diff --git a/docs/reference/index.rst b/docs/reference/index.rst index c2b74eabee..406df8db56 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -39,3 +39,4 @@ The MIOpen API library is structured as follows: * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental) * :doc:`Kthvalue <../doxygen/html/group__kthvalue>` (experimental) * :doc:`GLU <../doxygen/html/group__glu>` (experimental) + * :doc:`Trace <../doxygen/html/group__trace>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 60d6fe6ce6..ef5e162646 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -61,6 +61,7 @@ add_executable(MIOpenDriver dm_softmax.cpp dm_t5layernorm.cpp dm_tensorop.cpp + dm_trace.cpp dm_transformers_adam_w.cpp main.cpp registry_driver_maker.cpp diff --git a/driver/dm_trace.cpp b/driver/dm_trace.cpp new file mode 100644 index 0000000000..306117c8e7 --- /dev/null +++ b/driver/dm_trace.cpp @@ -0,0 +1,41 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "trace_driver.hpp" +#include "registry_driver_maker.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "trace") + return new TraceDriver(); + if(base_arg == "tracefp16") + return new TraceDriver(); + if(base_arg == "tracebfp16") + return new TraceDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index d77d5d02d2..fe8c668892 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -314,7 +314,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], " "prelu[bfp16|fp16], kthvalue[bfp16|fp16], glu[bfp16|fp16], softmarginloss[bfp16|fp16], " - "multimarginloss[bfp16|fp16]\n"); + "multimarginloss[bfp16|fp16], trace[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -352,7 +352,7 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "kthvaluebfp16" && arg != "glu" && arg != "glufp16" && arg != "glubfp16" && arg != "softmarginloss" && arg != "softmarginlossfp16" && arg != "softmarginlossbfp16" && arg != "multimarginloss" && arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" && - arg != "--version") + arg != "trace" && arg != "tracefp16" && arg != "tracebfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/trace_driver.hpp b/driver/trace_driver.hpp new file mode 100644 index 0000000000..e72ee2ccdd --- /dev/null +++ b/driver/trace_driver.hpp @@ -0,0 +1,455 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "tensor_driver.hpp" +#include "tensor_view.hpp" +#include "timer.hpp" + +#include <../test/ford.hpp> +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#include +#include +#include + +#include +#include +#include + +template +int mloTraceForwardRunHost(const miopenTensorDescriptor_t inputDesc, + const Tgpu* input, + Tcheck* outputHost) +{ + tensor_view_t<2> input_tv = miopen::get_inner_expanded_tv<2>(miopen::deref(inputDesc)); + auto input_len = miopen::deref(inputDesc).GetLengths(); + size_t N = std::min(input_len[0], input_len[1]); + double res = 0; + for(size_t i = 0; i < N; i++) + { + tensor_layout_t<2> input_layout = {i, i}; + size_t input_idx = input_tv.get_tensor_view_idx(input_layout); + res += static_cast(input[input_idx]); + } + outputHost[0] = static_cast(res); + + return 0; +} + +template +int mloTraceBackwardRunHost(const Tgpu* outputGrad, + const miopenTensorDescriptor_t inputGradDesc, + Tcheck* inputGradHost) +{ + tensor_view_t<2> input_grad_tv = miopen::get_inner_expanded_tv<2>(miopen::deref(inputGradDesc)); + auto input_grad_len = miopen::deref(inputGradDesc).GetLengths(); + size_t N = input_grad_len[0]; + + par_ford(N)([&](size_t i) { + size_t idx = i % (input_grad_tv.size[1] + 1); + + if(idx != input_grad_tv.size[1]) + { + Tgpu val = outputGrad[0]; + tensor_layout_t<2> ingrad_layout = {i, idx}; + inputGradHost[input_grad_tv.get_tensor_view_idx(ingrad_layout)] = + static_cast(val); + } + }); + + return 0; +} + +template +class TraceDriver : public Driver +{ +public: + TraceDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&inputGradDesc); + miopenCreateTensorDescriptor(&outputGradDesc); + + data_type = miopen_type{}; + } + + std::vector ComputeStrides(std::vector inputDim); + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~TraceDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(inputGradDesc); + miopenDestroyTensorDescriptor(outputGradDesc); + } + +private: + InputFlags inflags; + + int forw; + bool isContiguous; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t inputGradDesc; + miopenTensorDescriptor_t outputGradDesc; + + std::unique_ptr in_dev; + std::unique_ptr out_dev; + std::unique_ptr in_grad_dev; + std::unique_ptr out_grad_dev; + std::unique_ptr workspace_dev; + + std::vector in; + std::vector out; + std::vector in_grad; + std::vector out_grad; + std::vector workspace; + + std::vector outHost; + std::vector inGradHost; + + size_t ws_sizeInBytes; +}; + +// Equivalent tensor.transpose(0, -1).contiguous().transpose(0, -1) +template +std::vector TraceDriver::ComputeStrides(std::vector inputDim) +{ + if(!isContiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!isContiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +int TraceDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + isContiguous = inflags.GetValueInt("contiguous") > 0 ? true : false; + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + + forw = inflags.GetValueInt("forw"); + + if(forw != 0 && forw != 1) + { + MIOPEN_THROW("Invalid Forward|Backward Mode"); + } + + return miopenStatusSuccess; +} + +template +int TraceDriver::GetandSetData() +{ + auto in_len = inflags.GetValueTensor("dim_lengths").lengths; + auto in_strides = ComputeStrides(in_len); + SetTensorNd(inputDesc, in_len, in_strides, data_type); + SetTensorNd(inputGradDesc, in_len, in_strides, data_type); + + std::vector out_lens = {1}; + SetTensorNd(outputDesc, out_lens, data_type); + SetTensorNd(outputGradDesc, out_lens, data_type); + + return miopenStatusSuccess; +} + +template +int TraceDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag( + "forw", + 'F', + "0", + "Run only Trace Forward (1) or Run both Forward and Backward (0) (Default = 0)", + "int"); + inflags.AddTensorFlag( + "dim_lengths", 'D', "9x9", "The dimensional lengths of the input tensor (Default=9x9)"); + inflags.AddInputFlag("contiguous", + 'C', + "1", + "Tensor is contiguous (1) or not contiguous (0) (Default=1)", + "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int TraceDriver::AllocateBuffersAndCopy() +{ + size_t in_sz = GetTensorSize(inputDesc); + size_t out_sz = GetTensorSize(outputDesc); + + miopenGetTraceForwardWorkspaceSize(GetHandle(), inputDesc, outputDesc, &ws_sizeInBytes); + + if(ws_sizeInBytes == static_cast(-1)) + return miopenStatusAllocFailed; + + size_t ws_sz = ws_sizeInBytes / sizeof(Tgpu); + + uint32_t ctx = 0; + + in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + + in_grad_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + out_grad_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + + in = std::vector(in_sz, static_cast(0)); + out = std::vector(out_sz, static_cast(0)); + workspace = std::vector(ws_sz, static_cast(0)); + + in_grad = std::vector(in_sz, static_cast(0)); + out_grad = std::vector(out_sz, static_cast(0)); + + outHost = std::vector(out_sz, static_cast(0)); + inGradHost = std::vector(in_sz, static_cast(0)); + + for(int i = 0; i < in_sz; i++) + { + in[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(0.2)); + } + + for(int i = 0; i < out_sz; i++) + { + out_grad[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(0.3)); + } + + if(in_dev->ToGPU(GetStream(), in.data()) != 0) + { + std::cerr << "Error copying (in) to GPU, size: " << in_dev->GetSize() << std::endl; + return miopenStatusInternalError; + } + + if(out_grad_dev->ToGPU(GetStream(), out_grad.data()) != 0) + { + std::cerr << "Error copying (out_grad) to GPU, size: " << out_grad_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int TraceDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenStatus_t status = miopenTraceForward(GetHandle(), + workspace_dev->GetMem(), + ws_sizeInBytes, + inputDesc, + in_dev->GetMem(), + outputDesc, + out_dev->GetMem()); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenTraceForward"); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Forward Trace Elapsed: " << t.gettime_ms() / iter + << " ms\n"; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Forward Trace Elapsed: " << kernel_average_time << " ms\n"; + } + + if(out_dev->FromGPU(GetStream(), out.data()) != 0) + { + std::cerr << "Error copying (out_dev) from GPU, size: " << out_dev->GetSize() << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int TraceDriver::RunForwardCPU() +{ + auto status = mloTraceForwardRunHost(inputDesc, in.data(), outHost.data()); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloTraceForwardRunHost"); + + return status; +} + +template +int TraceDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenStatus_t status = miopenTraceBackward(GetHandle(), + outputGradDesc, + out_grad_dev->GetMem(), + inputGradDesc, + in_grad_dev->GetMem()); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenTraceBackward"); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Backward Trace Elapsed: " << t.gettime_ms() / iter + << " ms\n"; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Backward Trace Elapsed: " << kernel_average_time << " ms\n"; + } + + if(in_grad_dev->FromGPU(GetStream(), in_grad.data()) != 0) + { + std::cerr << "Error copying (in_grad_dev) from GPU, size: " << in_grad_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int TraceDriver::RunBackwardCPU() +{ + auto status = mloTraceBackwardRunHost(out_grad.data(), inputGradDesc, inGradHost.data()); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloTraceBackwardRunHost"); + + return status; +} + +template +Tref TraceDriver::GetTolerance() +{ + Tref tolerance = std::numeric_limits::epsilon() * 10; + return tolerance; +} + +template +int TraceDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(outHost, out); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward Trace FAILED: " << error << " > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward Trace Verifies OK on CPU reference (" << error << " < " << tolerance + << ')' << std::endl; + } + + return miopenStatusSuccess; +} + +template +int TraceDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(inGradHost, in_grad); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Backward Trace FAILED: " << error << " > " << tolerance << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward Trace Verifies OK on CPU reference (" << error << " < " << tolerance + << ')' << std::endl; + } + + return miopenStatusSuccess; +} diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 6d1830d7d6..2936505dae 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -72,6 +72,7 @@ * @defgroup ReduceCalculation * @defgroup RotaryPositionalEmbeddings * @defgroup ReLU + * @defgroup trace * */ @@ -8231,6 +8232,69 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle, // CLOSEOUT LossFunction DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +/** @addtogroup trace + * + * @{ + */ + +/*! @brief Helper function to query the minimum workspace size required by the Trace call + * + * @param handle MIOpen Handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetTraceForwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + size_t* sizeInBytes); + +/** @addtogroup trace + * + * @{ + */ + +/*! @brief Execute Trace forward layer + * + * @param handle MIOpen handle (input) + * @param workspace Address of the allocated workspace data (input) + * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Input tensor (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Output tensor (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenTraceForward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output); + +/*! @brief Execute Trace backward layer + * + * @param handle MIOpen handle (input) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param outputGrad Output grad tensor (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param inputGrad Input grad tensor (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenTraceBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* outputGrad, + const miopenTensorDescriptor_t inputGradDesc, + void* inputGrad); + +/** @} */ +// CLOSEOUT TRACE DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9661ccb28f..5840c0ba1f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -343,6 +343,8 @@ set( MIOpen_Source solver/tensorOp/Op5dTensorGeneric.cpp solver/tensorOp/OpTensorFwdBias.cpp solver/tensorOp/OpTensorLeadingOnes.cpp + solver/trace/backward_trace.cpp + solver/trace/forward_trace.cpp subbuffers.cpp t5layernorm_api.cpp target_properties.cpp @@ -350,6 +352,8 @@ set( MIOpen_Source tensor.cpp tensorOp/problem_description.cpp tensor_api.cpp + trace/problem_description.cpp + trace_api.cpp transformers_adam_w_api.cpp seq_tensor.cpp ) @@ -537,6 +541,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenConvDirUni.cl kernels/MIOpenConvDirBatchNormActiv.cl kernels/MIOpenConvDirGenFwd.cl + kernels/MIOpenFill.cpp kernels/MIOpenGLU.cpp kernels/MIOpenGroupNorm.cpp kernels/MIOpenGetitem.cpp @@ -562,6 +567,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenSoftMarginLoss.cpp kernels/MIOpenSoftmax.cl kernels/MIOpenSoftmaxAttn.cpp + kernels/MIOpenTrace.cpp kernels/MIOpenUtilKernels3.cl kernels/MIOpenUtilKernels4.cl kernels/MIOpenUtilKernels5.cl @@ -709,6 +715,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN reduceextreme.cpp rope.cpp softmarginloss.cpp + trace.cpp transformers_adam_w.cpp ${PROJECT_BINARY_DIR}/db_path.cpp ) diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 76a13b051c..44db139657 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -64,7 +64,8 @@ enum class Primitive ReLU, Kthvalue, SoftMarginLoss, - MultiMarginLoss + MultiMarginLoss, + Trace }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/include/miopen/trace.hpp b/src/include/miopen/trace.hpp new file mode 100644 index 0000000000..2bee1fa2d1 --- /dev/null +++ b/src/include/miopen/trace.hpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +namespace trace { + +MIOPEN_INTERNALS_EXPORT size_t GetTraceForwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& outputDesc); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t TraceForward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t TraceBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t outputGrad, + const TensorDescriptor& inputGradDesc, + Data_t inputGrad); + +} // namespace trace + +} // namespace miopen diff --git a/src/include/miopen/trace/invoke_params.hpp b/src/include/miopen/trace/invoke_params.hpp new file mode 100644 index 0000000000..c15aa00029 --- /dev/null +++ b/src/include/miopen/trace/invoke_params.hpp @@ -0,0 +1,72 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include +#include + +namespace miopen { + +namespace trace { + +struct FwdInvokeParams : public miopen::InvokeParams +{ + FwdInvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + + Data_t workspace = nullptr; + std::size_t workspace_size = 0; + + std::size_t GetWorkspaceSize() const { return workspace_size; } + Data_t GetWorkspace() const { return workspace; } +}; + +struct BwdInvokeParams : public miopen::InvokeParams +{ + BwdInvokeParams() = default; + + const TensorDescriptor* inputGradDesc = nullptr; + const TensorDescriptor* outputGradDesc = nullptr; + + ConstData_t outputGrad = nullptr; + Data_t inputGrad = nullptr; + + Data_t workspace = nullptr; + std::size_t workspace_size = 0; + + std::size_t GetWorkspaceSize() const { return workspace_size; } + Data_t GetWorkspace() const { return workspace; } +}; + +} // namespace trace + +} // namespace miopen diff --git a/src/include/miopen/trace/problem_description.hpp b/src/include/miopen/trace/problem_description.hpp new file mode 100644 index 0000000000..43b1077789 --- /dev/null +++ b/src/include/miopen/trace/problem_description.hpp @@ -0,0 +1,123 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace trace { + +struct FwdProblemDescription : ProblemDescriptionBase +{ + FwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_) + : inputDesc(inputDesc_), outputDesc(outputDesc_) + { + if(inputDesc.GetNumDims() != 2) + { + MIOPEN_THROW(miopenStatusBadParm, "Input tensor must be 2D."); + } + + if(outputDesc.GetNumDims() != 1) + { + MIOPEN_THROW(miopenStatusBadParm, "Output tensor must be 1D."); + } + + if(!IsSameType()) + { + MIOPEN_THROW(miopenStatusBadParm, "Input and output tensor must have same type."); + } + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + + bool IsSameType() const + { + if(inputDesc.GetType() != outputDesc.GetType()) + { + return false; + } + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +protected: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; +}; + +struct BwdProblemDescription : ProblemDescriptionBase +{ + BwdProblemDescription(const TensorDescriptor& outputGradDesc_, + const TensorDescriptor& inputGradDesc_) + : outputGradDesc(outputGradDesc_), inputGradDesc(inputGradDesc_) + { + if(inputGradDesc.GetNumDims() != 2) + { + MIOPEN_THROW(miopenStatusBadParm, "Input grad tensor must be 2D."); + } + + if(outputGradDesc.GetNumDims() != 1) + { + MIOPEN_THROW(miopenStatusBadParm, "Output grad tensor must be 1D."); + } + + if(!IsSameType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "Input grad and output grad tensor must have same type."); + } + } + + const TensorDescriptor& GetInputGradDesc() const { return inputGradDesc; } + const TensorDescriptor& GetOutputGradDesc() const { return outputGradDesc; } + + bool IsSameType() const + { + if(inputGradDesc.GetType() != outputGradDesc.GetType()) + { + return false; + } + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +protected: + TensorDescriptor outputGradDesc; + TensorDescriptor inputGradDesc; +}; + +} // namespace trace + +} // namespace miopen diff --git a/src/include/miopen/trace/solvers.hpp b/src/include/miopen/trace/solvers.hpp new file mode 100644 index 0000000000..7a8784d203 --- /dev/null +++ b/src/include/miopen/trace/solvers.hpp @@ -0,0 +1,77 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace solver { + +namespace trace { + +using TraceForwardSolverBase = + NonTunableSolverBase; + +struct TraceForward final : TraceForwardSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::trace::FwdProblemDescription& problem) const override; + + bool IsImprovementOverROCm(const ExecutionContext& context, + const miopen::trace::FwdProblemDescription& problem) const; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::trace::FwdProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::trace::FwdProblemDescription& problem) const override; + bool MayNeedWorkspace() const override { return true; } +}; + +using TraceBackwardSolverBase = + NonTunableSolverBase; + +struct TraceBackward final : TraceBackwardSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::trace::BwdProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::trace::BwdProblemDescription& problem) const override; +}; + +} // namespace trace + +} // namespace solver + +} // namespace miopen diff --git a/src/kernels/MIOpenFill.cpp b/src/kernels/MIOpenFill.cpp new file mode 100644 index 0000000000..4333d76173 --- /dev/null +++ b/src/kernels/MIOpenFill.cpp @@ -0,0 +1,44 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +template +__device__ void FillZeroKernel(TIO* output, uint64_t size) +{ + size_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= size) + return; + + output[gid] = static_cast(0); +} + +extern "C" __global__ void FillZero(IO_TYPE* output, uint64_t size) +{ + FillZeroKernel(output, size); +} diff --git a/src/kernels/MIOpenTrace.cpp b/src/kernels/MIOpenTrace.cpp new file mode 100644 index 0000000000..bc21eac706 --- /dev/null +++ b/src/kernels/MIOpenTrace.cpp @@ -0,0 +1,106 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "block_reduce.hpp" +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +#define LOCAL_SIZE 256 + +template +__device__ void TraceForward_kernel( + const TI* input, TO* output, size_t N, tensor_view_t<2> input_tv, bool isReduced) +{ + const size_t gid = blockIdx.x * blockDim.x + threadIdx.x; + const size_t lid = threadIdx.x; + + __shared__ FLOAT_ACCUM local_mem[LOCAL_SIZE]; + + if(gid < N) + { + tensor_layout_t<2> diag_layout = {gid, gid}; + auto input_idx = input_tv.get_tensor_view_idx(diag_layout); + local_mem[lid] = CVT_FLOAT2ACCUM(input[input_idx]); + } + else + { + local_mem[lid] = 0; + } + + __syncthreads(); + + for(size_t i = blockDim.x / 2; i > 0; i >>= 1) + { + if(lid < i) + { + local_mem[lid] += local_mem[lid + i]; + } + __syncthreads(); + } + + if(lid == 0) + { + output[blockIdx.x] = isReduced ? local_mem[0] : CVT_ACCUM2FLOAT(local_mem[0]); + } +} + +extern "C" __global__ void TraceForward( + const IO_TYPE* input, O_TYPE* output, size_t N, tensor_view_t<2> input_tv, bool isReduced) +{ + TraceForward_kernel(input, output, N, input_tv, isReduced); +} + +template +__device__ void TraceBackward_kernel(const TIO* output_grad, + TIO* input_grad, + size_t N, + tensor_view_t<2> input_grad_tv) +{ + size_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= N) + return; + + size_t idx = gid % (input_grad_tv.size[1] + 1); + + if(idx != input_grad_tv.size[1]) + { + TIO val = output_grad[0]; + tensor_layout_t<2> ingrad_layout = {gid, idx}; + input_grad[input_grad_tv.get_tensor_view_idx(ingrad_layout)] = val; + } +} + +extern "C" __global__ void TraceBackward(const IO_TYPE* output_grad, + IO_TYPE* input_grad, + size_t N, + tensor_view_t<2> input_grad_tv) +{ + TraceBackward_kernel(output_grad, input_grad, N, input_grad_tv); +} diff --git a/src/solver.cpp b/src/solver.cpp index c3b077d435..eb439d9d83 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -707,6 +708,9 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) multimarginloss::MultiMarginLossForward{}.SolverDbId()); Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId()); + + Register(registry, ++id, Primitive::Trace, trace::TraceBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::Trace, trace::TraceForward{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function, and don't leave a white // space between this comment and the newly registered solver(s)! } diff --git a/src/solver/trace/backward_trace.cpp b/src/solver/trace/backward_trace.cpp new file mode 100644 index 0000000000..3431f9d78d --- /dev/null +++ b/src/solver/trace/backward_trace.cpp @@ -0,0 +1,181 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace trace { + +bool TraceBackward::IsApplicable(const ExecutionContext& /*context*/, + const miopen::trace::BwdProblemDescription& problem) const +{ + if(!(problem.GetInputGradDesc().GetType() == miopenFloat || + problem.GetInputGradDesc().GetType() == miopenHalf || + problem.GetInputGradDesc().GetType() == miopenBFloat16)) + { + return false; + } + + return true; +} + +ConvSolution TraceBackward::GetSolution(const ExecutionContext& /*context*/, + const miopen::trace::BwdProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetOutputGradDesc().GetType(); + auto io_dtype = miopen::GetDataType(dtype); + auto input_grad_len = problem.GetInputGradDesc().GetLengths(); + size_t N = input_grad_len[0]; + auto input_grad_numel = problem.GetInputGradDesc().GetElementSize(); + + const auto build_params = + KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"IO_TYPE", io_dtype == "bfloat16" ? "ushort" : io_dtype}, + {"O_TYPE", io_dtype == "bfloat16" ? "ushort" : io_dtype}}; + + /* Phase 1: Fill input grad with zeros */ + { + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(input_grad_numel, xlocalsize); + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenFill.cpp"; + kernel.kernel_name = "FillZero"; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + + result.construction_params.push_back(kernel); + } + + /* Phase 2: Trace backward */ + { + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N, xlocalsize); + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenTrace.cpp"; + kernel.kernel_name = "TraceBackward"; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [input_grad_numel, N](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + float elapsed = 0.f; + HipEventPtr start, stop; + + bool reset_profiling_state = false; + if(handle_.IsProfilingEnabled()) + { + reset_profiling_state = true; + handle_.EnableProfiling(false); + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + /* Phase 1: Fill input grad with zeros. */ + { + decltype(auto) kernel = handle_.Run(kernels.front()); + kernel(params.inputGrad, input_grad_numel); + } + + /* Phase 2: Trace backward. */ + { + auto input_grad_tv = get_inner_expanded_tv<2>(deref(params.inputGradDesc)); + + decltype(auto) kernel = handle_.Run(kernels.back()); + kernel(params.outputGrad, params.inputGrad, N, input_grad_tv); + } + + if(reset_profiling_state) + { + handle_.EnableProfiling(true); + } + + if(handle_.IsProfilingEnabled()) + { + hipEventRecord(stop.get(), handle_.GetStream()); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + } + }; + }; + + return result; +} + +} // namespace trace + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/trace/forward_trace.cpp b/src/solver/trace/forward_trace.cpp new file mode 100644 index 0000000000..b814220879 --- /dev/null +++ b/src/solver/trace/forward_trace.cpp @@ -0,0 +1,267 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define LOCAL_SIZE 256 +#define LOCAL_SIZE_REDUCE 1024 + +namespace miopen { + +namespace solver { + +namespace trace { + +bool TraceForward::IsApplicable(const ExecutionContext& /*context*/, + const miopen::trace::FwdProblemDescription& problem) const +{ + if(!(problem.GetInputDesc().GetType() == miopenFloat || + problem.GetInputDesc().GetType() == miopenHalf || + problem.GetInputDesc().GetType() == miopenBFloat16)) + { + return false; + } + + return true; +} + +ConvSolution TraceForward::GetSolution(const ExecutionContext& /*context*/, + const miopen::trace::FwdProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetOutputDesc().GetType(); + auto i_dtype = miopen::GetDataType(dtype); + auto input_len = problem.GetInputDesc().GetLengths(); + size_t N = std::min(input_len[0], input_len[1]); + + // 1 kernel + if(N <= LOCAL_SIZE) + { + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N, xlocalsize); + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenTrace.cpp"; + kernel.kernel_name = "TraceForward"; + + const auto build_params = + KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"IO_TYPE", i_dtype == "bfloat16" ? "ushort" : i_dtype}, + {"O_TYPE", i_dtype == "bfloat16" ? "ushort" : i_dtype}}; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + + result.construction_params.push_back(kernel); + } + else + { + const auto build_params = + KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"IO_TYPE", i_dtype == "bfloat16" ? "ushort" : i_dtype}, + {"O_TYPE", "FLOAT_ACCUM"}, + {"REDUCE_SIZE", LOCAL_SIZE_REDUCE}}; + + /* Phase 1: Get diagonal to workspace */ + { + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N, xlocalsize); + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenTrace.cpp"; + kernel.kernel_name = "TraceForward"; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + + result.construction_params.push_back(kernel); + } + + /* Phase 2: Reduce sum (FLOAT_ACCUM to FLOAT_ACCUM) */ + { + auto _size = (N + LOCAL_SIZE - 1) / LOCAL_SIZE; + + while(_size > LOCAL_SIZE_REDUCE) + { + size_t xlocalsize = LOCAL_SIZE_REDUCE; + size_t xgridsize = AlignUp(_size, xlocalsize); + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenReduceSum.cpp"; + kernel.kernel_name = "ReduceSumFLOATACCUM"; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + + result.construction_params.push_back(kernel); + _size = (_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE; + } + + /* Reduce sum (FLOAT_ACCUM to TIO) */ + size_t xlocalsize = LOCAL_SIZE_REDUCE; + size_t xgridsize = AlignUp(_size, xlocalsize); + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenReduceSum.cpp"; + kernel.kernel_name = "ReduceSum"; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + + result.construction_params.push_back(kernel); + } + } + + result.invoker_factory = [dtype, N](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + auto input_tv = get_inner_expanded_tv<2>(deref(params.inputDesc)); + + if(N <= LOCAL_SIZE) + { + decltype(auto) kernel = handle_.Run(kernels.front()); + kernel(params.input, params.output, N, input_tv, false); + } + else + { + auto elapsed = 0.f; + HipEventPtr start, stop; + + const bool profiling = handle_.IsProfilingEnabled(); + if(profiling) + { + handle_.EnableProfiling(false); + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + /* Phase 1: Get diagonal. */ + { + decltype(auto) kernel = handle_.Run(kernels.front()); + kernel(params.input, params.workspace, N, input_tv, true); + } + + /* Phase 2: Reduce. */ + if(kernels.size() > 1) + { + auto _size = (N + LOCAL_SIZE - 1) / LOCAL_SIZE; + auto reduce_in = params.workspace; + auto reduce_out = static_cast(static_cast(params.workspace) + + N * get_data_size(dtype)); + + for(size_t i = 1; i < kernels.size() - 1; ++i) + { + decltype(auto) kernel = handle_.Run(kernels[i]); + + kernel(reduce_in, reduce_out, _size); + std::swap(reduce_in, reduce_out); + + _size = (_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE; + } + + decltype(auto) kernel = handle_.Run(kernels.back()); + kernel(reduce_in, params.output, _size); + + if(profiling) + { + hipEventRecord(stop.get(), handle_.GetStream()); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + + handle_.EnableProfiling(true); + } + } + } + }; + }; + + return result; +} + +std::size_t +TraceForward::GetWorkspaceSize(const ExecutionContext& /*context*/, + const miopen::trace::FwdProblemDescription& problem) const +{ + auto input_len = problem.GetInputDesc().GetLengths(); + size_t N = std::min(input_len[0], input_len[1]); + size_t trace_forward_size = (N + LOCAL_SIZE - 1) / LOCAL_SIZE; + return (trace_forward_size + (trace_forward_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE) * + get_data_size(miopenFloat); +} + +} // namespace trace + +} // namespace solver + +} // namespace miopen diff --git a/src/trace.cpp b/src/trace.cpp new file mode 100644 index 0000000000..d64917f83a --- /dev/null +++ b/src/trace.cpp @@ -0,0 +1,112 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace trace { + +size_t GetTraceForwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& outputDesc) +{ + auto ctx = ExecutionContext{&handle}; + const auto problem = trace::FwdProblemDescription{inputDesc, outputDesc}; + + const auto algo = AlgorithmName{"TraceForward"}; + const auto solvers = solver::SolverContainer{}; + + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + + return pair_size_vector.empty() ? static_cast(-1) : pair_size_vector.front().second; +} + +miopenStatus_t TraceForward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output) +{ + const auto problem = trace::FwdProblemDescription{inputDesc, outputDesc}; + + const auto invoke_params = [&]() { + auto tmp = trace::FwdInvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.input = input; + tmp.output = output; + tmp.workspace = workspace; + tmp.workspace_size = workspaceSizeInBytes; + return tmp; + }(); + + const auto algo = AlgorithmName{"TraceForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t TraceBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t outputGrad, + const TensorDescriptor& inputGradDesc, + Data_t inputGrad) +{ + const auto problem = miopen::trace::BwdProblemDescription{outputGradDesc, inputGradDesc}; + + const auto invoke_params = [&]() { + auto tmp = trace::BwdInvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputGradDesc = &inputGradDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGrad = inputGrad; + tmp.outputGrad = outputGrad; + return tmp; + }(); + + const auto algo = AlgorithmName{"TraceBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace trace + +} // namespace miopen diff --git a/src/trace/problem_description.cpp b/src/trace/problem_description.cpp new file mode 100644 index 0000000000..c6e4c85e5c --- /dev/null +++ b/src/trace/problem_description.cpp @@ -0,0 +1,69 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace trace { + +NetworkConfig FwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto input_len = inputDesc.GetLengths(); + auto N = std::min(input_len[0], input_len[1]); + + std::ostringstream ss; + + ss << "trace_fwd"; + ss << "i_dtype" << input_dtype; + ss << "N" << N; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig BwdProblemDescription::MakeNetworkConfig() const +{ + auto input_grad_dtype = inputGradDesc.GetType(); + auto input_grad_numel = inputGradDesc.GetElementSize(); + auto N = inputGradDesc.GetLengths()[0]; + + std::ostringstream ss; + + ss << "trace_bwd"; + ss << "i_dtype" << input_grad_dtype; + ss << "size" << input_grad_numel; + ss << "N" << N; + + return NetworkConfig{ss.str()}; +} + +} // namespace trace + +} // namespace miopen diff --git a/src/trace_api.cpp b/src/trace_api.cpp new file mode 100644 index 0000000000..f586c88346 --- /dev/null +++ b/src/trace_api.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +extern "C" miopenStatus_t +miopenGetTraceForwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + size_t* sizeInBytes) +{ + MIOPEN_LOG_FUNCTION(handle, inputDesc, sizeInBytes); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = miopen::trace::GetTraceForwardWorkspaceSize( + miopen::deref(handle), miopen::deref(inputDesc), miopen::deref(outputDesc)); + }); +} + +extern "C" miopenStatus_t miopenTraceForward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output) +{ + MIOPEN_LOG_FUNCTION( + handle, workspace, workspaceSizeInBytes, inputDesc, input, outputDesc, output); + + return miopen::try_([&] { + miopen::trace::TraceForward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output)); + }); +} + +extern "C" miopenStatus_t miopenTraceBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* outputGrad, + const miopenTensorDescriptor_t inputGradDesc, + void* inputGrad) +{ + MIOPEN_LOG_FUNCTION(handle, outputGradDesc, outputGrad, inputGradDesc, inputGrad); + + return miopen::try_([&] { + miopen::trace::TraceBackward(miopen::deref(handle), + miopen::deref(outputGradDesc), + DataCast(outputGrad), + miopen::deref(inputGradDesc), + DataCast(inputGrad)); + }); +} diff --git a/test/cpu_trace.hpp b/test/cpu_trace.hpp new file mode 100644 index 0000000000..2ae533268b --- /dev/null +++ b/test/cpu_trace.hpp @@ -0,0 +1,72 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +#include "ford.hpp" +#include "tensor_holder.hpp" +#include "tensor_view.hpp" + +#include + +template +void cpu_trace_forward(const tensor& input, tensor& ref_output) +{ + tensor_view_t<2> input_tv = miopen::get_inner_expanded_tv<2>(input.desc); + auto input_len = input.desc.GetLengths(); + size_t N = std::min(input_len[0], input_len[1]); + double res = 0; + + for(size_t i = 0; i < N; i++) + { + tensor_layout_t<2> input_layout = {i, i}; + size_t input_idx = input_tv.get_tensor_view_idx(input_layout); + T val = input[input_idx]; + res += static_cast(input[input_idx]); + } + ref_output[0] = static_cast(res); +} + +template +void cpu_trace_backward(const tensor& output_grad, tensor& ref_input_grad) +{ + tensor_view_t<2> input_grad_tv = miopen::get_inner_expanded_tv<2>(ref_input_grad.desc); + auto input_grad_len = ref_input_grad.desc.GetLengths(); + size_t N = input_grad_len[0]; + + par_ford(N)([&](size_t i) { + size_t idx = i % (input_grad_tv.size[1] + 1); + + if(idx != input_grad_tv.size[1]) + { + T val = output_grad[0]; + tensor_layout_t<2> ingrad_layout = {i, idx}; + ref_input_grad[input_grad_tv.get_tensor_view_idx(ingrad_layout)] = static_cast(val); + } + }); +} diff --git a/test/gtest/trace.cpp b/test/gtest/trace.cpp new file mode 100644 index 0000000000..b07d31d5e1 --- /dev/null +++ b/test/gtest/trace.cpp @@ -0,0 +1,84 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "trace.hpp" +using float16 = half_float::half; + +namespace trace { + +using GPU_Trace_fwd_FP32 = TraceFwdTest; +using GPU_Trace_fwd_FP16 = TraceFwdTest; +using GPU_Trace_fwd_BFP16 = TraceFwdTest; + +using GPU_Trace_bwd_FP32 = TraceBwdTest; +using GPU_Trace_bwd_FP16 = TraceBwdTest; +using GPU_Trace_bwd_BFP16 = TraceBwdTest; + +} // namespace trace +using namespace trace; + +TEST_P(GPU_Trace_fwd_FP32, Test) +{ + RunTest(); + Verify(); +}; + +TEST_P(GPU_Trace_fwd_FP16, Test) +{ + RunTest(); + Verify(); +}; + +TEST_P(GPU_Trace_fwd_BFP16, Test) +{ + RunTest(); + Verify(); +}; + +TEST_P(GPU_Trace_bwd_FP32, Test) +{ + RunTest(); + Verify(); +}; + +TEST_P(GPU_Trace_bwd_FP16, Test) +{ + RunTest(); + Verify(); +}; + +TEST_P(GPU_Trace_bwd_BFP16, Test) +{ + RunTest(); + Verify(); +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_fwd_FP32, testing::ValuesIn(GenFullTestCases())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_fwd_FP16, testing::ValuesIn(GenFullTestCases())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_fwd_BFP16, testing::ValuesIn(GenFullTestCases())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_bwd_FP32, testing::ValuesIn(GenFullTestCases())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_bwd_FP16, testing::ValuesIn(GenFullTestCases())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Trace_bwd_BFP16, testing::ValuesIn(GenFullTestCases())); diff --git a/test/gtest/trace.hpp b/test/gtest/trace.hpp new file mode 100644 index 0000000000..453ab5840a --- /dev/null +++ b/test/gtest/trace.hpp @@ -0,0 +1,254 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "cpu_trace.hpp" +#include "get_handle.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include "random.hpp" + +#include +#include +#include + +#include +#include + +struct TraceTestCase +{ + std::vector input_dim; + bool isContiguous; + + friend std::ostream& operator<<(std::ostream& os, const TraceTestCase& tc) + { + os << "Input dims: "; + for(auto dim_sz : tc.input_dim) + { + os << dim_sz << " "; + } + return os << " contiguous: " << tc.isContiguous; + } + + TraceTestCase() {} + + TraceTestCase(std::vector input_dims_, bool cont_) + : input_dim(input_dims_), isContiguous(cont_) + { + } + + std::vector ComputeStrides() const + { + std::vector inputDim = input_dim; + if(!isContiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!isContiguous) + std::swap(strides.front(), strides.back()); + return strides; + } +}; + +inline std::vector GenFullTestCases() +{ // n c d h w dim + // clang-format off + return { + {{1, 8}, false}, // non-cont small case + {{8, 4}, false}, // non-cont small case + {{512, 512}, false}, // non-cont large case + {{384, 640}, false}, // non-cont large case + {{512, 768}, true}, // cont large case + {{1024, 1024}, true}, // cont large case + {{1, 10}, true}, // cont small case + {{34, 20}, true}, // cont small case + }; + // clang-format on +} + +template +struct TraceFwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + trace_config = GetParam(); + auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 99); }; + + auto in_dims = trace_config.input_dim; + auto in_strides = trace_config.ComputeStrides(); + input = tensor{in_dims, in_strides}.generate(gen_value1); + + auto out_lengths = std::vector{1}; + + output = tensor{out_lengths}; + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + ref_output = tensor{out_lengths}; + std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); + + ws_sizeInBytes = + miopen::trace::GetTraceForwardWorkspaceSize(handle, input.desc, output.desc); + if(ws_sizeInBytes == static_cast(-1)) + GTEST_SKIP(); + + if(ws_sizeInBytes != 0) + { + std::vector workspace_dims; + workspace_dims.push_back(ws_sizeInBytes / sizeof(float)); + + workspace = tensor{workspace_dims}; + std::fill(workspace.begin(), workspace.end(), static_cast(0)); + + workspace_dev = handle.Write(workspace.data); + } + + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + cpu_trace_forward(input, ref_output); + status = miopen::trace::TraceForward(handle, + workspace_dev.get(), + ws_sizeInBytes, + input.desc, + input_dev.get(), + output.desc, + output_dev.get()); + + ASSERT_EQ(status, miopenStatusSuccess); + + output.data = handle.Read(output_dev, output.data.size()); + } + + double GetTolerance() + { + double tolerance = std::numeric_limits::epsilon() * 10; + return tolerance; + } + + void Verify() + { + double threshold = GetTolerance(); + EXPECT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); + auto error = miopen::rms_range(ref_output, output); + + EXPECT_TRUE(error < threshold * 10) << "Error output beyond tolerance Error: " << error + << ", Tolerance: " << threshold * 10; + } + + TraceTestCase trace_config; + + tensor input; + tensor output; + tensor workspace; + + tensor ref_output; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + + size_t ws_sizeInBytes; +}; + +template +struct TraceBwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + trace_config = GetParam(); + auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 99); }; + + auto in_grad_dims = trace_config.input_dim; + auto in_grad_strides = trace_config.ComputeStrides(); + input_grad = tensor{in_grad_dims, in_grad_strides}; + + auto out_grad_dims = std::vector{1}; + output_grad = tensor{out_grad_dims}.generate(gen_value1); + + std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits::quiet_NaN()); + + ref_input_grad = tensor{in_grad_dims, in_grad_strides}; + std::fill(ref_input_grad.begin(), ref_input_grad.end(), static_cast(0)); + + input_grad_dev = handle.Write(input_grad.data); + output_grad_dev = handle.Write(output_grad.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + cpu_trace_backward(output_grad, ref_input_grad); + status = miopen::trace::TraceBackward( + handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get()); + + ASSERT_EQ(status, miopenStatusSuccess); + + input_grad.data = handle.Read(input_grad_dev, input_grad.data.size()); + } + + double GetTolerance() + { + double tolerance = std::numeric_limits::epsilon() * 10; + return tolerance; + } + + void Verify() + { + double threshold = GetTolerance(); + ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); + auto error = miopen::rms_range(ref_input_grad, input_grad); + + EXPECT_LT(error, threshold * 10) << "Error output beyond tolerance Error: " << error + << ", Tolerance: " << threshold * 10; + } + + TraceTestCase trace_config; + + tensor input_grad; + tensor output_grad; + + tensor ref_input_grad; + + miopen::Allocator::ManageDataPtr input_grad_dev; + miopen::Allocator::ManageDataPtr output_grad_dev; +};