From e7e373713ee7ec20965ad236d48172c9b818c0a7 Mon Sep 17 00:00:00 2001 From: vraspar Date: Mon, 20 Jan 2025 17:26:56 -0800 Subject: [PATCH 1/4] Add Softmax kernel and transpose utility function for WebGPU execution provider --- .../core/providers/webgpu/math/softmax.cc | 241 ++++++++++++++++++ .../core/providers/webgpu/math/softmax.h | 52 ++++ .../core/providers/webgpu/tensor/transpose.cc | 50 ++++ .../core/providers/webgpu/tensor/transpose.h | 2 + .../webgpu/webgpu_execution_provider.cc | 6 +- 5 files changed, 348 insertions(+), 3 deletions(-) create mode 100644 onnxruntime/core/providers/webgpu/math/softmax.cc create mode 100644 onnxruntime/core/providers/webgpu/math/softmax.h diff --git a/onnxruntime/core/providers/webgpu/math/softmax.cc b/onnxruntime/core/providers/webgpu/math/softmax.cc new file mode 100644 index 0000000000000..796c56f67c73d --- /dev/null +++ b/onnxruntime/core/providers/webgpu/math/softmax.cc @@ -0,0 +1,241 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/common/inlined_containers.h" +#include "core/providers/webgpu/tensor/softmax.h" +#include "core/providers/webgpu/tensor/transpose.h" +#include "core/providers/cpu/tensor/utils.h" +#include "core/providers/webgpu/shader_variable.h" +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" + +namespace onnxruntime { +namespace webgpu { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Softmax, + kOnnxDomain, + 1, 10, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedNumberTypes()), + Softmax); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Softmax, + kOnnxDomain, + 11, 12, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedNumberTypes()), + Softmax); + +ONNX_OPERATOR_KERNEL_EX( + Softmax, + kOnnxDomain, + 13, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedNumberTypes()), + Softmax); + +static std::string MaxVector(std::string name, int components) { + switch (components) { + case 1: + return name; + case 2: + return "max(" + name + ".x, " + name + ".y)"; + case 4: + return "max(max(" + name + ".x, " + name + ".y), max(" + name + ".z, " + name + ".w))"; + default: + ORT_THROW("Unsupported number of components: ", components); + } +} + +static std::string SumVector(std::string x, int components) { + switch (components) { + case 1: + return x; + case 2: + return "(" + x + ".x + " + x + ".y" + ")"; + case 4: + return "(" + x + ".x + " + x + ".y + " + x + ".w + " + x + ".z" + ")"; + default: + ORT_THROW("Unsupported number of components: ", components); + } +} + +static int GetMaxComponents(int64_t size) { + if (size % 4 == 0) { + return 4; + } else if (size % 2 == 0) { + return 2; + } + return 1; +} + +Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const { + // Add input and output variables + const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | 
ShaderUsage::UseIndicesTypeAlias); + const auto& output = shader.AddOutput("result", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); + int components = input.NumComponents(); + + std::string threadMaxDecl = input.StorageType() == "f32" ? + "val threadMax = x_value_t(-3.402823e+38f);\n" : + "val threadMax = x_value_t(-65504.0h));\n"; + + + // Define shared memory for row max and row sum + shader.AdditionalImplementation() + << "var rowMaxShared : x_value_t;\n" + << "var rowSumShared : x_value_t;\n" + << "var threadShared : array;\n"; + + // Define helper functions to get and set values + shader.AdditionalImplementation() + << "fn getValue(row: i32, col: i32, row_stride: i32) -> x_value_t {\n" + << " let index = row * row_stride + col;\n" + << " return x[index];\n" + << "}\n" + << "fn setValue(row: i32, col: i32, row_stride: i32, value: x_value_t) {\n" + << " let index = row * row_stride + col;\n" + << " result[index] = value;\n" + << "}\n"; + + // Main function body + shader.MainFunctionBody() + << " let gindex = i32(global_idx);\n" + << " let lindex = i32(local_idx);\n" + << " const wg = " << WG << ";\n" + << " let row = gindex / wg;\n" + << " let cols = uniforms.packedCols;\n" + << " let row_stride : i32 = uniforms.packedCols;\n" + + // Find the row's max value + << threadMaxDecl + << " for (var col = lindex; col < cols; col += wg) {\n" + << " let value = getValue(row, col, row_stride);\n" + << " threadMax = max(threadMax, value);\n" + << " }\n" + << " if (lindex < cols) {\n" + << " threadShared[lindex] = threadMax;\n" + << " }\n" + << " workgroupBarrier();\n" + + // Reduce to find the max value + << " var reduceSize = min(cols, wg);\n" + << " for (var currSize = reduceSize >> 1; currSize > 0; currSize = reduceSize >> 1) {\n" + << " reduceSize = currSize + (reduceSize & 1);\n" + << " if (lindex < currSize) {\n" + << " threadShared[lindex] = max(threadShared[lindex], threadShared[lindex + reduceSize]);\n" + << " }\n" + << " workgroupBarrier();\n" + << 
" }\n" + << " if (lindex == 0) {\n" + << " rowMaxShared = x_value_t(" << MaxVector('threadShared[0]', components) << ");\n" + << " }\n" + << " workgroupBarrier();\n" + + // Find the row's sum of exponentials + << " var threadSum = x_value_t(0.0);\n" + << " for (var col = lindex; col < cols; col += wg) {\n" + << " let subExp = exp(getValue(row, col, row_stride) - rowMaxShared);\n" + << " threadSum += subExp;\n" + << " }\n" + << " threadShared[lindex] = threadSum;\n" + << " workgroupBarrier();\n" + + // Reduce to find the sum of exponentials + << " for (var currSize = wg >> 1; currSize > 0; currSize = currSize >> 1) {\n" + << " if (lindex < currSize) {\n" + << " threadShared[lindex] = threadShared[lindex] + threadShared[lindex + currSize];\n" + << " }\n" + << " workgroupBarrier();\n" + << " }\n" + << " if (lindex == 0) {\n" + << " rowSumShared = x_value_t(" << SumVector("threadShared[0]", components) << ");\n" + << " }\n" + << " workgroupBarrier();\n" + + // Calculate the final value for each element in the row + << " for (var col = lindex; col < cols; col += wg) {\n" + << " let value = exp(getValue(row, col, row_stride) - rowMaxShared) / rowSumShared;\n" + << " setValue(row, col, row_stride, value);\n" + << " }\n"; + + return Status::OK(); +} + +Status Softmax::ComputeInternal(ComputeContext& context) const { + const auto* input_tensor = context.Input(0); + const TensorShape& input_shape = input_tensor->Shape(); + size_t input_rank = input_shape.NumDimensions(); + + auto* output_tensor = context.Output(0, input_shape); + + // normalize axis + int64_t axis = axis < 0 ? 
axis_ + input_rank : axis_; + + bool is_transpose_required = axis < input_rank - 1; + TensorShape transposed_input_shape = input_shape; + Tensor transposed_input_tensor; + Tensor intermediate_output; + InlinedVector perm; + + if (is_transpose_required) { + AllocatorPtr alloc; + perm.reserve(input_rank); + for (size_t i = 0; i < input_rank; ++i) { + perm[i] = i; + } + perm[axis] = input_rank - 1; + perm[input_rank - 1] = axis; + + // allocate a temporary tensor to hold transposed input + Tensor temp_input(input_tensor->DataType(), TensorShape(transposed_input_shape), alloc); + + ORT_RETURN_IF_ERROR(Transpose::DoTranspose( perm, *input_tensor, temp_input)); + transposed_input_tensor = std::move(temp_input); + transposed_input_shape = transposed_input_tensor.Shape(); + + // Allocate memory for the intermediate output + Tensor temp_output(output_tensor->DataType(), TensorShape(transposed_input_shape), alloc); + intermediate_output = std::move(temp_output); + } else { + transposed_input_tensor = *input_tensor; + } + + + const size_t cols = transposed_input_shape[input_rank - 1]; + const size_t rows = input_shape.Size() / cols; + const size_t components = GetMaxComponents(cols); + const auto packedCols = cols / components; + + size_t WG = rows == 1 ? 256: 64; + + SoftmaxProgram program{WG}; + + + program + .CacheHint(std::to_string(components), std::to_string(WG)) + .AddInputs({*transposed_input_tensor, ProgramTensorMetadataDependency::TypeAndRank}}) + .AddOutputs({ is_transpose_required ? 
*intermediate_output : output_tensor}) + .SetWorkgroupSize(WG) + .SetDispatchGroupSize(rows) + .AddUniformVariables({ + {static_cast(packedCols)} + }); + + + ORT_RETURN_IF_ERROR(context.RunProgram(program)); + + // If transpose was required, transpose the result back + if (is_transpose_required) { + Tensor transposed_output_tensor; + ORT_RETURN_IF_ERROR(Transpose::DoTranspose(perm, intermediate_output, *output_tensor)); + } + + return Status::OK(); +} +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/math/softmax.h b/onnxruntime/core/providers/webgpu/math/softmax.h new file mode 100644 index 0000000000000..b8bc37a0c03b6 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/math/softmax.h @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/providers/cpu/math/softmax.h" +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/webgpu/program.h" + +namespace onnxruntime { +namespace webgpu { + +class Softmax final : public WebGpuKernel { + public: + Softmax(const OpKernelInfo& info) : WebGpuKernel{info} { + int opset_ = info.node().SinceVersion(); + size_t axis; + Status status = info.GetAttr("axis", &axis); + + if (status.IsOK()) { + axis_ = axis; + } else { + if (opset_ < 13) { + axis_ = 1; // opset-12 and below, the default axis value is 1 + } else { + axis_ = -1; // opset-13, the default axis value is -1 + } + } + } + + Status ComputeInternal(ComputeContext& context) const override; + + private: + size_t axis_; +}; + +class SoftmaxProgram final : public Program { + public: + SoftmaxProgram(size_t axis, int wg) : Program{"Softmax"}, axis_{axis}, WG_{wg} { + } + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"packedCols", ProgramUniformVariableDataType::Int32}); + + private: + int WG; +}; 
+ +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index c40ec43dd0009..062500055eeaf 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -97,6 +97,56 @@ Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const { return Status::OK(); } +Status Transpose::DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output) { + const auto& input_shape = input.Shape(); + int32_t rank = gsl::narrow_cast(input_shape.NumDimensions()); + + + TensorShapeVector output_dims(rank); + InlinedVector default_perm(rank); + const InlinedVector* p_perm = nullptr; + ORT_RETURN_IF_ERROR(ComputeOutputShape(input, output_dims, default_perm, p_perm)); + TensorShape output_shape(output_dims); + + InlinedVector new_shape{}; + InlinedVector new_perm{}; + SqueezeShape(input_shape.GetDims(), *p_perm, new_shape, new_perm); + const bool channels_last = new_perm == InlinedVector({2, 3, 1}); + const bool channels_first = new_perm == InlinedVector({3, 1, 2}); + const bool use_shared = (new_shape.size() == 2 && new_perm[0] > new_perm[1]) || channels_last || channels_first; + auto new_input_shape = input_shape; + + if (use_shared) { + new_input_shape = channels_last + ? TensorShape({new_shape[0], new_shape[1] * new_shape[2]}) + : channels_first + ? 
TensorShape({new_shape[0] * new_shape[1], new_shape[2]}) + : new_shape; + new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]}); + } + + uint32_t output_size = gsl::narrow_cast(input.Shape().Size()); + TransposeProgram program{*p_perm, use_shared}; + if (use_shared) { + program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1); + } + + program + .CacheHint(absl::StrJoin(*p_perm, "-")) + .AddInputs({{*input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}}) + .AddOutputs({{*output, ProgramTensorMetadataDependency::None, new_output_shape, 1}}) + .SetDispatchGroupSize(static_cast((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE), + static_cast(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE))) + .AddUniformVariables({ + {static_cast(output_size)}, + }); + + use_shared ? program.SetDispatchGroupSize(static_cast((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE), + static_cast(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE))) + : program.SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE); + return context.RunProgram(program); +} + Status Transpose::ComputeInternal(ComputeContext& context) const { const auto* input_tensor = context.Input(0); const TensorShape& input_shape = input_tensor->Shape(); diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.h b/onnxruntime/core/providers/webgpu/tensor/transpose.h index 7cf5c1fe0865d..3eb672d1c6e31 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.h +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.h @@ -16,6 +16,8 @@ class Transpose final : public WebGpuKernel, public TransposeBase { Transpose(const OpKernelInfo& info) : WebGpuKernel{info}, TransposeBase{info} { } Status ComputeInternal(ComputeContext& context) const override; + static Status DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output); + constexpr static uint32_t TILE_SIZE = 16; }; diff --git 
a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index dec7e48786bf5..04b5965177d1e 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -625,9 +625,9 @@ std::unique_ptr RegisterKernels() { // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, From f9b61dbc6f9ae29a22ccfcd33228ffa9cea38ae2 Mon Sep 17 00:00:00 2001 From: vraspar Date: Fri, 24 Jan 2025 19:22:05 -0800 Subject: [PATCH 2/4] Refactor Softmax implementation for WebGPU --- .../core/providers/webgpu/math/softmax.cc | 64 ++++++++++++++----- .../core/providers/webgpu/math/softmax.h | 12 ++-- .../core/providers/webgpu/shader_variable.h | 8 ++- 3 files changed, 60 insertions(+), 24 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/math/softmax.cc b/onnxruntime/core/providers/webgpu/math/softmax.cc index 796c56f67c73d..4abefa704c689 100644 --- a/onnxruntime/core/providers/webgpu/math/softmax.cc +++ b/onnxruntime/core/providers/webgpu/math/softmax.cc @@ -2,13 +2,15 @@ // Licensed under the MIT License. 
#include "core/common/inlined_containers.h" -#include "core/providers/webgpu/tensor/softmax.h" +#include "core/providers/webgpu/math/softmax.h" #include "core/providers/webgpu/tensor/transpose.h" #include "core/providers/cpu/tensor/utils.h" #include "core/providers/webgpu/shader_variable.h" #include "core/providers/webgpu/shader_helper.h" #include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/common/logging/logging.h" + namespace onnxruntime { namespace webgpu { @@ -45,6 +47,8 @@ static std::string MaxVector(std::string name, int components) { return name; case 2: return "max(" + name + ".x, " + name + ".y)"; + case 3: + return "max(max(" + name + ".x, " + name + ".y), " + name + ".z)"; case 4: return "max(max(" + name + ".x, " + name + ".y), max(" + name + ".z, " + name + ".w))"; default: @@ -76,13 +80,19 @@ static int GetMaxComponents(int64_t size) { Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const { // Add input and output variables - const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); - const auto& output = shader.AddOutput("result", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); + const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + shader.AddOutput("result", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); int components = input.NumComponents(); - std::string threadMaxDecl = input.StorageType() == "f32" ? - "val threadMax = x_value_t(-3.402823e+38f);\n" : - "val threadMax = x_value_t(-65504.0h));\n"; + LOGS_DEFAULT(VERBOSE) << "Input StorageType: " << input.StorageType() << "\n"; + LOGS_DEFAULT(VERBOSE) << "Input ElementType: " << input.ElementType() << "\n"; + LOGS_DEFAULT(VERBOSE) << "Input ValueType: " << input.ValueType() << "\n"; + + + + std::string threadMaxDecl = input.ElementType() == "f32" ? 
+ "var threadMax = x_value_t(-3.402823e+38f);\n" : + "var threadMax = x_value_t(-65504.0h);\n"; // Define shared memory for row max and row sum @@ -132,7 +142,7 @@ Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const { << " workgroupBarrier();\n" << " }\n" << " if (lindex == 0) {\n" - << " rowMaxShared = x_value_t(" << MaxVector('threadShared[0]', components) << ");\n" + << " rowMaxShared = x_value_t(" << MaxVector("threadShared[0]", components) << ");\n" << " }\n" << " workgroupBarrier();\n" @@ -174,9 +184,15 @@ Status Softmax::ComputeInternal(ComputeContext& context) const { auto* output_tensor = context.Output(0, input_shape); // normalize axis - int64_t axis = axis < 0 ? axis_ + input_rank : axis_; + int64_t axis = axis_ < 0 ? axis_ + input_rank : axis_; bool is_transpose_required = axis < input_rank - 1; + LOGS_DEFAULT(VERBOSE) <<"axis_: " << axis_ << " axis: " << axis << "\n"; + LOGS_DEFAULT(VERBOSE) << "Transpose required: " << (is_transpose_required ? "true" : "false") << "\n"; + LOGS_DEFAULT(VERBOSE) << "Input shape: " << input_shape.ToString() << "\n"; + LOGS_DEFAULT(VERBOSE) << "Output shape: " << output_tensor->Shape().ToString() << "\n"; + LOGS_DEFAULT(VERBOSE) << "Input rank: " << input_rank << "\n"; + TensorShape transposed_input_shape = input_shape; Tensor transposed_input_tensor; Tensor intermediate_output; @@ -184,25 +200,34 @@ Status Softmax::ComputeInternal(ComputeContext& context) const { if (is_transpose_required) { AllocatorPtr alloc; - perm.reserve(input_rank); - for (size_t i = 0; i < input_rank; ++i) { + perm.resize(input_rank); + for (size_t i = 0; i < perm.size(); ++i) { perm[i] = i; } perm[axis] = input_rank - 1; perm[input_rank - 1] = axis; + LOGS_DEFAULT(VERBOSE) << "Allocating temporary tensors for transpose\n"; + // allocate a temporary tensor to hold transposed input Tensor temp_input(input_tensor->DataType(), TensorShape(transposed_input_shape), alloc); - ORT_RETURN_IF_ERROR(Transpose::DoTranspose( perm, 
*input_tensor, temp_input)); + LOGS_DEFAULT(VERBOSE) << "Performing transpose\n"; + + ORT_RETURN_IF_ERROR(Transpose::DoTranspose(perm, *input_tensor, temp_input)); + + LOGS_DEFAULT(VERBOSE) << "Transpose done\n"; + + LOGS_DEFAULT(VERBOSE) << "Allocating memory for intermediate output\n"; transposed_input_tensor = std::move(temp_input); transposed_input_shape = transposed_input_tensor.Shape(); + LOGS_DEFAULT(VERBOSE) << "Transposed input shape: " << transposed_input_shape.ToString() << "\n"; + // Allocate memory for the intermediate output + LOGS_DEFAULT(VERBOSE) << "Allocating memory for intermediate output\n"; Tensor temp_output(output_tensor->DataType(), TensorShape(transposed_input_shape), alloc); intermediate_output = std::move(temp_output); - } else { - transposed_input_tensor = *input_tensor; } @@ -211,15 +236,24 @@ Status Softmax::ComputeInternal(ComputeContext& context) const { const size_t components = GetMaxComponents(cols); const auto packedCols = cols / components; + LOGS_DEFAULT(VERBOSE) << "Cols: " << cols << " Rows: " << rows << " Components: " << components << " PackedCols: " << packedCols << "\n"; + size_t WG = rows == 1 ? 256: 64; SoftmaxProgram program{WG}; + if (is_transpose_required) { + program + .AddInputs({{&transposed_input_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}) + .AddOutputs({{&intermediate_output, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}); + } else { + program + .AddInputs({{input_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}) + .AddOutputs({{output_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}); + } program .CacheHint(std::to_string(components), std::to_string(WG)) - .AddInputs({*transposed_input_tensor, ProgramTensorMetadataDependency::TypeAndRank}}) - .AddOutputs({ is_transpose_required ? 
*intermediate_output : output_tensor}) .SetWorkgroupSize(WG) .SetDispatchGroupSize(rows) .AddUniformVariables({ diff --git a/onnxruntime/core/providers/webgpu/math/softmax.h b/onnxruntime/core/providers/webgpu/math/softmax.h index b8bc37a0c03b6..b67425471da9a 100644 --- a/onnxruntime/core/providers/webgpu/math/softmax.h +++ b/onnxruntime/core/providers/webgpu/math/softmax.h @@ -4,9 +4,9 @@ #pragma once #include "core/providers/webgpu/webgpu_supported_types.h" -#include "core/providers/cpu/math/softmax.h" #include "core/providers/webgpu/webgpu_kernel.h" #include "core/providers/webgpu/program.h" +#include "core/framework/op_kernel.h" namespace onnxruntime { namespace webgpu { @@ -15,8 +15,8 @@ class Softmax final : public WebGpuKernel { public: Softmax(const OpKernelInfo& info) : WebGpuKernel{info} { int opset_ = info.node().SinceVersion(); - size_t axis; - Status status = info.GetAttr("axis", &axis); + int64_t axis; + Status status = info.GetAttr("axis", &axis); if (status.IsOK()) { axis_ = axis; @@ -32,12 +32,12 @@ class Softmax final : public WebGpuKernel { Status ComputeInternal(ComputeContext& context) const override; private: - size_t axis_; + int64_t axis_; }; class SoftmaxProgram final : public Program { public: - SoftmaxProgram(size_t axis, int wg) : Program{"Softmax"}, axis_{axis}, WG_{wg} { + SoftmaxProgram(size_t wg) : Program{"Softmax"}, WG{wg} { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -45,7 +45,7 @@ class SoftmaxProgram final : public Program { WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"packedCols", ProgramUniformVariableDataType::Int32}); private: - int WG; + size_t WG; }; } // namespace webgpu diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h index 4c87bc9158890..3b8ed7bf42b55 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.h +++ b/onnxruntime/core/providers/webgpu/shader_variable.h @@ -176,6 +176,10 @@ class ShaderVariableHelper : public 
ShaderIndicesHelper { template inline std::string GetByOffset(TOffset&& offset) const; + std::string_view StorageType() const; + std::string_view ValueType() const; + std::string_view ElementType() const; + private: ORT_DISALLOW_COPY_AND_ASSIGNMENT(ShaderVariableHelper); @@ -183,9 +187,7 @@ class ShaderVariableHelper : public ShaderIndicesHelper { std::string GetByOffsetImpl(std::string_view offset) const; std::string SetByOffsetImpl(std::string_view offset, std::string_view value) const; - std::string_view StorageType() const; - std::string_view ValueType() const; - std::string_view ElementType() const; + friend class ShaderHelper; }; From 87de60730a1fca9aa3ec021b20d1de0262e64d57 Mon Sep 17 00:00:00 2001 From: vraspar Date: Wed, 29 Jan 2025 16:33:20 -0800 Subject: [PATCH 3/4] Refactor Softmax and remove debug logs --- .../core/providers/webgpu/math/softmax.cc | 92 +++++-------------- .../core/providers/webgpu/math/softmax.h | 6 +- .../core/providers/webgpu/tensor/transpose.cc | 27 +++--- .../core/providers/webgpu/tensor/transpose.h | 2 +- .../test/providers/cpu/math/softmax_test.cc | 13 +-- 5 files changed, 50 insertions(+), 90 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/math/softmax.cc b/onnxruntime/core/providers/webgpu/math/softmax.cc index 4abefa704c689..1760acae95f39 100644 --- a/onnxruntime/core/providers/webgpu/math/softmax.cc +++ b/onnxruntime/core/providers/webgpu/math/softmax.cc @@ -8,9 +8,6 @@ #include "core/providers/webgpu/shader_variable.h" #include "core/providers/webgpu/shader_helper.h" #include "core/providers/webgpu/webgpu_supported_types.h" - -#include "core/common/logging/logging.h" - namespace onnxruntime { namespace webgpu { @@ -84,16 +81,7 @@ Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddOutput("result", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); int components = input.NumComponents(); - LOGS_DEFAULT(VERBOSE) << "Input StorageType: " << input.StorageType() << "\n"; - 
LOGS_DEFAULT(VERBOSE) << "Input ElementType: " << input.ElementType() << "\n"; - LOGS_DEFAULT(VERBOSE) << "Input ValueType: " << input.ValueType() << "\n"; - - - - std::string threadMaxDecl = input.ElementType() == "f32" ? - "var threadMax = x_value_t(-3.402823e+38f);\n" : - "var threadMax = x_value_t(-65504.0h);\n"; - + std::string threadMaxDecl = input.ElementType() == "f32" ? "var threadMax = x_value_t(-3.402823e+38f);\n" : "var threadMax = x_value_t(-65504.0h);\n"; // Define shared memory for row max and row sum shader.AdditionalImplementation() @@ -142,7 +130,7 @@ Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const { << " workgroupBarrier();\n" << " }\n" << " if (lindex == 0) {\n" - << " rowMaxShared = x_value_t(" << MaxVector("threadShared[0]", components) << ");\n" + << " rowMaxShared = x_value_t(" << MaxVector("threadShared[0]", components) << ");\n" << " }\n" << " workgroupBarrier();\n" @@ -163,7 +151,7 @@ Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const { << " workgroupBarrier();\n" << " }\n" << " if (lindex == 0) {\n" - << " rowSumShared = x_value_t(" << SumVector("threadShared[0]", components) << ");\n" + << " rowSumShared = x_value_t(" << SumVector("threadShared[0]", components) << ");\n" << " }\n" << " workgroupBarrier();\n" @@ -179,71 +167,44 @@ Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const { Status Softmax::ComputeInternal(ComputeContext& context) const { const auto* input_tensor = context.Input(0); const TensorShape& input_shape = input_tensor->Shape(); - size_t input_rank = input_shape.NumDimensions(); - + int64_t input_rank = input_shape.NumDimensions(); auto* output_tensor = context.Output(0, input_shape); // normalize axis - int64_t axis = axis_ < 0 ? axis_ + input_rank : axis_; - + int64_t axis = axis_ < 0 ? 
axis_ + input_rank : axis_; bool is_transpose_required = axis < input_rank - 1; - LOGS_DEFAULT(VERBOSE) <<"axis_: " << axis_ << " axis: " << axis << "\n"; - LOGS_DEFAULT(VERBOSE) << "Transpose required: " << (is_transpose_required ? "true" : "false") << "\n"; - LOGS_DEFAULT(VERBOSE) << "Input shape: " << input_shape.ToString() << "\n"; - LOGS_DEFAULT(VERBOSE) << "Output shape: " << output_tensor->Shape().ToString() << "\n"; - LOGS_DEFAULT(VERBOSE) << "Input rank: " << input_rank << "\n"; - TensorShape transposed_input_shape = input_shape; + TensorShape transposed_input_shape; Tensor transposed_input_tensor; Tensor intermediate_output; - InlinedVector perm; + InlinedVector perm(input_rank); if (is_transpose_required) { - AllocatorPtr alloc; - perm.resize(input_rank); - for (size_t i = 0; i < perm.size(); ++i) { - perm[i] = i; - } + std::iota(std::begin(perm), std::end(perm), 0); perm[axis] = input_rank - 1; perm[input_rank - 1] = axis; - LOGS_DEFAULT(VERBOSE) << "Allocating temporary tensors for transpose\n"; - - // allocate a temporary tensor to hold transposed input - Tensor temp_input(input_tensor->DataType(), TensorShape(transposed_input_shape), alloc); - - LOGS_DEFAULT(VERBOSE) << "Performing transpose\n"; - - ORT_RETURN_IF_ERROR(Transpose::DoTranspose(perm, *input_tensor, temp_input)); - - LOGS_DEFAULT(VERBOSE) << "Transpose done\n"; - - LOGS_DEFAULT(VERBOSE) << "Allocating memory for intermediate output\n"; - transposed_input_tensor = std::move(temp_input); - transposed_input_shape = transposed_input_tensor.Shape(); - - LOGS_DEFAULT(VERBOSE) << "Transposed input shape: " << transposed_input_shape.ToString() << "\n"; + std::vector transposed_input_dims; + for (auto e : perm) { + transposed_input_dims.push_back(input_shape[e]); + } - // Allocate memory for the intermediate output - LOGS_DEFAULT(VERBOSE) << "Allocating memory for intermediate output\n"; - Tensor temp_output(output_tensor->DataType(), TensorShape(transposed_input_shape), alloc); - 
intermediate_output = std::move(temp_output); + transposed_input_shape = TensorShape(transposed_input_dims); + transposed_input_tensor = context.CreateGPUTensor(input_tensor->DataType(), transposed_input_shape); + ORT_RETURN_IF_ERROR(Transpose::DoTranspose(context, perm, *input_tensor, transposed_input_tensor)); + intermediate_output = context.CreateGPUTensor(output_tensor->DataType(), transposed_input_shape); } - - const size_t cols = transposed_input_shape[input_rank - 1]; - const size_t rows = input_shape.Size() / cols; - const size_t components = GetMaxComponents(cols); + const int64_t cols = is_transpose_required ? transposed_input_shape[input_rank - 1] : input_shape[input_rank - 1]; + const int64_t rows = input_shape.Size() / cols; + const int64_t components = GetMaxComponents(cols); const auto packedCols = cols / components; - - LOGS_DEFAULT(VERBOSE) << "Cols: " << cols << " Rows: " << rows << " Components: " << components << " PackedCols: " << packedCols << "\n"; - - size_t WG = rows == 1 ? 256: 64; + uint32_t WG = rows == 1 ? 
256 : 64; SoftmaxProgram program{WG}; - if (is_transpose_required) { + if (is_transpose_required) { program - .AddInputs({{&transposed_input_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}) + .AddInputs({{&transposed_input_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}) .AddOutputs({{&intermediate_output, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}); } else { program @@ -251,22 +212,17 @@ Status Softmax::ComputeInternal(ComputeContext& context) const { .AddOutputs({{output_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}); } - program .CacheHint(std::to_string(components), std::to_string(WG)) .SetWorkgroupSize(WG) .SetDispatchGroupSize(rows) - .AddUniformVariables({ - {static_cast(packedCols)} - }); - + .AddUniformVariables({{static_cast(packedCols)}}); ORT_RETURN_IF_ERROR(context.RunProgram(program)); // If transpose was required, transpose the result back if (is_transpose_required) { - Tensor transposed_output_tensor; - ORT_RETURN_IF_ERROR(Transpose::DoTranspose(perm, intermediate_output, *output_tensor)); + ORT_RETURN_IF_ERROR(Transpose::DoTranspose(context, perm, intermediate_output, *output_tensor)); } return Status::OK(); diff --git a/onnxruntime/core/providers/webgpu/math/softmax.h b/onnxruntime/core/providers/webgpu/math/softmax.h index b67425471da9a..5eb6bd0ccdb15 100644 --- a/onnxruntime/core/providers/webgpu/math/softmax.h +++ b/onnxruntime/core/providers/webgpu/math/softmax.h @@ -37,15 +37,15 @@ class Softmax final : public WebGpuKernel { class SoftmaxProgram final : public Program { public: - SoftmaxProgram(size_t wg) : Program{"Softmax"}, WG{wg} { - } + SoftmaxProgram(uint32_t wg) : Program{"Softmax"}, WG{wg} { + } Status GenerateShaderCode(ShaderHelper& sh) const override; WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"packedCols", ProgramUniformVariableDataType::Int32}); private: - size_t WG; + uint32_t WG; }; } // namespace webgpu 
diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index 062500055eeaf..c06a742239cdc 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -97,24 +97,27 @@ Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const { return Status::OK(); } -Status Transpose::DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output) { +Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context, const gsl::span& permutations, const Tensor& input, Tensor& output) { const auto& input_shape = input.Shape(); + const auto& input_dims = input_shape.GetDims(); int32_t rank = gsl::narrow_cast(input_shape.NumDimensions()); - TensorShapeVector output_dims(rank); - InlinedVector default_perm(rank); - const InlinedVector* p_perm = nullptr; - ORT_RETURN_IF_ERROR(ComputeOutputShape(input, output_dims, default_perm, p_perm)); + + for (int32_t i = 0; i < rank; i++) { + output_dims[i] = input_dims[permutations[i]]; + } + TensorShape output_shape(output_dims); InlinedVector new_shape{}; InlinedVector new_perm{}; - SqueezeShape(input_shape.GetDims(), *p_perm, new_shape, new_perm); + SqueezeShape(input_shape.GetDims(), permutations, new_shape, new_perm); const bool channels_last = new_perm == InlinedVector({2, 3, 1}); const bool channels_first = new_perm == InlinedVector({3, 1, 2}); const bool use_shared = (new_shape.size() == 2 && new_perm[0] > new_perm[1]) || channels_last || channels_first; auto new_input_shape = input_shape; + TensorShape new_output_shape(output_dims); if (use_shared) { new_input_shape = channels_last @@ -125,16 +128,16 @@ Status Transpose::DoTranspose(const gsl::span& permutations, const new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]}); } - uint32_t output_size = gsl::narrow_cast(input.Shape().Size()); - TransposeProgram program{*p_perm, use_shared}; + uint32_t 
output_size = gsl::narrow_cast(input_shape.Size()); + TransposeProgram program{permutations, use_shared}; + if (use_shared) { program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1); } - program - .CacheHint(absl::StrJoin(*p_perm, "-")) - .AddInputs({{*input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}}) - .AddOutputs({{*output, ProgramTensorMetadataDependency::None, new_output_shape, 1}}) + .CacheHint(absl::StrJoin(permutations, "-")) + .AddInputs({{&input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}}) + .AddOutputs({{&output, ProgramTensorMetadataDependency::None, new_output_shape, 1}}) .SetDispatchGroupSize(static_cast((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE), static_cast(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE))) .AddUniformVariables({ diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.h b/onnxruntime/core/providers/webgpu/tensor/transpose.h index 3eb672d1c6e31..81706dde33cc6 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.h +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.h @@ -16,7 +16,7 @@ class Transpose final : public WebGpuKernel, public TransposeBase { Transpose(const OpKernelInfo& info) : WebGpuKernel{info}, TransposeBase{info} { } Status ComputeInternal(ComputeContext& context) const override; - static Status DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output); + static Status DoTranspose(onnxruntime::webgpu::ComputeContext& context, const gsl::span& permutations, const Tensor& input, Tensor& output); constexpr static uint32_t TILE_SIZE = 16; }; diff --git a/onnxruntime/test/providers/cpu/math/softmax_test.cc b/onnxruntime/test/providers/cpu/math/softmax_test.cc index 6f7930f722564..3808d62a10e56 100644 --- a/onnxruntime/test/providers/cpu/math/softmax_test.cc +++ b/onnxruntime/test/providers/cpu/math/softmax_test.cc @@ -170,11 +170,11 @@ TEST(SoftmaxOperator, ThreeAndFourDimsAxis0) { RunTest(input_vals_60, expected_vals, 
three_dimensions, /*opset*/ 7, /*axis*/ 0, // axis=0 is not supported by TensorRT - {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider}); RunTest(input_vals_60, expected_vals, four_dimensions, /*opset*/ 7, /*axis*/ 0, // axis=0 is not supported by TensorRT - {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider}); } TEST(SoftmaxOperator, ThreeAndFourDimsSecondLastAxis) { @@ -201,10 +201,10 @@ TEST(SoftmaxOperator, ThreeAndFourDimsSecondLastAxis) { 0.040478885f, 0.033857856f, 0.080346674f, 0.06199841f, 0.040481992f}; RunTest(input_vals_60, expected_vals, three_dimensions, /*opset*/ 7, /*axis*/ 1, - {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider}); RunTest(input_vals_60, expected_vals, four_dimensions, /*opset*/ 7, /*axis*/ 2, - {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider}); } TEST(SoftmaxOperator, ThreeAndFourDimsSecondLastAxis_opset13) { @@ -376,8 +376,9 @@ TEST(SoftmaxOperator, DimWithZero) { RunTest(x_vals, expected_vals, dimensions, /*opset*/ -1, /*axis*/ 0, {kTensorrtExecutionProvider, - kNnapiExecutionProvider, // NNAPI softmax does not support empty input - kQnnExecutionProvider} // QNN doesn't support dim 0 + kNnapiExecutionProvider, // NNAPI softmax does not support empty input + kWebGpuExecutionProvider, // WebGPU does not support dim 0 + kQnnExecutionProvider} // QNN doesn't support dim 0 ); } From 2d8b47de2729b30442b7659524fddc532a7a3a99 Mon Sep 17 00:00:00 2001 From: vraspar Date: Thu, 30 Jan 2025
11:49:31 -0800 Subject: [PATCH 4/4] fix linting error Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- onnxruntime/core/providers/webgpu/shader_variable.h | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h index 3b8ed7bf42b55..12ded754de55c 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.h +++ b/onnxruntime/core/providers/webgpu/shader_variable.h @@ -188,7 +188,6 @@ class ShaderVariableHelper : public ShaderIndicesHelper { std::string GetByOffsetImpl(std::string_view offset) const; std::string SetByOffsetImpl(std::string_view offset, std::string_view value) const; - friend class ShaderHelper; };