From e7e373713ee7ec20965ad236d48172c9b818c0a7 Mon Sep 17 00:00:00 2001
From: vraspar <vrajang@outlook.com>
Date: Mon, 20 Jan 2025 17:26:56 -0800
Subject: [PATCH 1/4] Add Softmax kernel and transpose utility function for
 WebGPU execution provider

---
 .../core/providers/webgpu/math/softmax.cc     | 241 ++++++++++++++++++
 .../core/providers/webgpu/math/softmax.h      |  52 ++++
 .../core/providers/webgpu/tensor/transpose.cc |  50 ++++
 .../core/providers/webgpu/tensor/transpose.h  |   2 +
 .../webgpu/webgpu_execution_provider.cc       |   6 +-
 5 files changed, 348 insertions(+), 3 deletions(-)
 create mode 100644 onnxruntime/core/providers/webgpu/math/softmax.cc
 create mode 100644 onnxruntime/core/providers/webgpu/math/softmax.h
diff --git a/onnxruntime/core/providers/webgpu/math/softmax.cc b/onnxruntime/core/providers/webgpu/math/softmax.cc
new file mode 100644
index 0000000000000..796c56f67c73d
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/math/softmax.cc
@@ -0,0 +1,241 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/common/inlined_containers.h"
+#include "core/providers/webgpu/tensor/softmax.h"
+#include "core/providers/webgpu/tensor/transpose.h"
+#include "core/providers/cpu/tensor/utils.h"
+#include "core/providers/webgpu/shader_variable.h"
+#include "core/providers/webgpu/shader_helper.h"
+#include "core/providers/webgpu/webgpu_supported_types.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+    Softmax,
+    kOnnxDomain,
+    1, 10,
+    kWebGpuExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", WebGpuSupportedNumberTypes()),
+    Softmax);
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+    Softmax,
+    kOnnxDomain,
+    11, 12,
+    kWebGpuExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", WebGpuSupportedNumberTypes()),
+    Softmax);
+
+ONNX_OPERATOR_KERNEL_EX(
+    Softmax,
+    kOnnxDomain,
+    13,
+    kWebGpuExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", WebGpuSupportedNumberTypes()),
+    Softmax);
+
+static std::string MaxVector(std::string name, int components) {
+  switch (components) {
+    case 1:
+      return name;
+    case 2:
+      return "max(" + name + ".x, " + name + ".y)";
+    case 4:
+      return "max(max(" + name + ".x, " + name + ".y), max(" + name + ".z, " + name + ".w))";
+    default:
+      ORT_THROW("Unsupported number of components: ", components);
+  }
+}
+
+static std::string SumVector(std::string x, int components) {
+  switch (components) {
+    case 1:
+      return x;
+    case 2:
+      return "(" + x + ".x + " + x + ".y" + ")";
+    case 4:
+      return "(" + x + ".x + " + x + ".y + " + x + ".w + " + x + ".z" + ")";
+    default:
+      ORT_THROW("Unsupported number of components: ", components);
+  }
+}
+
+static int GetMaxComponents(int64_t size) {
+  if (size % 4 == 0) {
+    return 4;
+  } else if (size % 2 == 0) {
+    return 2;
+  }
+  return 1;
+}
+
+Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
+  // Add input and output variables
+  const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias);
+  const auto& output = shader.AddOutput("result", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias);
+  int components = input.NumComponents();
+
+  std::string threadMaxDecl =  input.StorageType() == "f32" ?
+                                "val threadMax = x_value_t(-3.402823e+38f);\n" :
+                                "val threadMax = x_value_t(-65504.0h));\n";
+
+
+  // Define shared memory for row max and row sum
+  shader.AdditionalImplementation()
+      << "var<workgroup> rowMaxShared : x_value_t;\n"
+      << "var<workgroup> rowSumShared : x_value_t;\n"
+      << "var<workgroup> threadShared : array<x_value_t, " << WG << ">;\n";
+
+  // Define helper functions to get and set values
+  shader.AdditionalImplementation()
+      << "fn getValue(row: i32, col: i32, row_stride: i32) -> x_value_t {\n"
+      << "  let index = row * row_stride + col;\n"
+      << "  return x[index];\n"
+      << "}\n"
+      << "fn setValue(row: i32, col: i32, row_stride: i32, value: x_value_t) {\n"
+      << "  let index = row * row_stride + col;\n"
+      << "  result[index] = value;\n"
+      << "}\n";
+
+  // Main function body
+  shader.MainFunctionBody()
+      << "  let gindex = i32(global_idx);\n"
+      << "  let lindex = i32(local_idx);\n"
+      << "  const wg = " << WG << ";\n"
+      << "  let row = gindex / wg;\n"
+      << "  let cols = uniforms.packedCols;\n"
+      << "  let row_stride : i32 = uniforms.packedCols;\n"
+
+      // Find the row's max value
+      << threadMaxDecl
+      << "  for (var col = lindex; col < cols; col += wg) {\n"
+      << "    let value = getValue(row, col, row_stride);\n"
+      << "    threadMax = max(threadMax, value);\n"
+      << "  }\n"
+      << "  if (lindex < cols) {\n"
+      << "    threadShared[lindex] = threadMax;\n"
+      << "  }\n"
+      << "  workgroupBarrier();\n"
+
+      // Reduce to find the max value
+      << "  var reduceSize = min(cols, wg);\n"
+      << "  for (var currSize = reduceSize >> 1; currSize > 0; currSize = reduceSize >> 1) {\n"
+      << "    reduceSize = currSize + (reduceSize & 1);\n"
+      << "    if (lindex < currSize) {\n"
+      << "      threadShared[lindex] = max(threadShared[lindex], threadShared[lindex + reduceSize]);\n"
+      << "    }\n"
+      << "    workgroupBarrier();\n"
+      << "  }\n"
+      << "  if (lindex == 0) {\n"
+      << "    rowMaxShared = x_value_t(" <<  MaxVector('threadShared[0]', components)   << ");\n"
+      << "  }\n"
+      << "  workgroupBarrier();\n"
+
+      // Find the row's sum of exponentials
+      << "  var threadSum = x_value_t(0.0);\n"
+      << "  for (var col = lindex; col < cols; col += wg) {\n"
+      << "    let subExp = exp(getValue(row, col, row_stride) - rowMaxShared);\n"
+      << "    threadSum += subExp;\n"
+      << "  }\n"
+      << "  threadShared[lindex] = threadSum;\n"
+      << "  workgroupBarrier();\n"
+
+      // Reduce to find the sum of exponentials
+      << "  for (var currSize = wg >> 1; currSize > 0; currSize = currSize >> 1) {\n"
+      << "    if (lindex < currSize) {\n"
+      << "      threadShared[lindex] = threadShared[lindex] + threadShared[lindex + currSize];\n"
+      << "    }\n"
+      << "    workgroupBarrier();\n"
+      << "  }\n"
+      << "  if (lindex == 0) {\n"
+      << "    rowSumShared = x_value_t(" << SumVector("threadShared[0]", components)  << ");\n"
+      << "  }\n"
+      << "  workgroupBarrier();\n"
+
+      // Calculate the final value for each element in the row
+      << "  for (var col = lindex; col < cols; col += wg) {\n"
+      << "    let value = exp(getValue(row, col, row_stride) - rowMaxShared) / rowSumShared;\n"
+      << "    setValue(row, col, row_stride, value);\n"
+      << "  }\n";
+
+  return Status::OK();
+}
+
+Status Softmax::ComputeInternal(ComputeContext& context) const {
+  const auto* input_tensor = context.Input(0);
+  const TensorShape& input_shape = input_tensor->Shape();
+  size_t input_rank = input_shape.NumDimensions();
+
+  auto* output_tensor = context.Output(0, input_shape);
+
+  // normalize axis
+  int64_t axis = axis < 0 ? axis_ + input_rank : axis_;
+
+  bool is_transpose_required = axis < input_rank - 1;
+  TensorShape transposed_input_shape = input_shape;
+  Tensor transposed_input_tensor;
+  Tensor intermediate_output;
+  InlinedVector<size_t> perm;
+
+  if (is_transpose_required) {
+    AllocatorPtr alloc;
+    perm.reserve(input_rank);
+    for (size_t i = 0; i < input_rank; ++i) {
+      perm[i] = i;
+    }
+    perm[axis] = input_rank - 1;
+    perm[input_rank - 1] = axis;
+
+    // allocate a temporary tensor to hold transposed input
+    Tensor temp_input(input_tensor->DataType(), TensorShape(transposed_input_shape), alloc);
+
+    ORT_RETURN_IF_ERROR(Transpose::DoTranspose( perm, *input_tensor, temp_input));
+    transposed_input_tensor = std::move(temp_input);
+    transposed_input_shape = transposed_input_tensor.Shape();
+
+    // Allocate memory for the intermediate output
+    Tensor temp_output(output_tensor->DataType(), TensorShape(transposed_input_shape), alloc);
+    intermediate_output = std::move(temp_output);
+  } else {
+    transposed_input_tensor = *input_tensor;
+  }
+
+
+  const size_t cols = transposed_input_shape[input_rank - 1];
+  const size_t rows = input_shape.Size() / cols;
+  const size_t components = GetMaxComponents(cols);
+  const auto packedCols = cols / components;
+
+  size_t WG = rows == 1 ? 256: 64;
+
+  SoftmaxProgram program{WG};
+
+
+  program
+      .CacheHint(std::to_string(components), std::to_string(WG))
+      .AddInputs({*transposed_input_tensor, ProgramTensorMetadataDependency::TypeAndRank}})
+      .AddOutputs({ is_transpose_required ? *intermediate_output : output_tensor})
+      .SetWorkgroupSize(WG)
+      .SetDispatchGroupSize(rows)
+      .AddUniformVariables({
+        {static_cast<int32_t>(packedCols)}
+      });
+
+
+  ORT_RETURN_IF_ERROR(context.RunProgram(program));
+
+  // If transpose was required, transpose the result back
+  if (is_transpose_required) {
+    Tensor transposed_output_tensor;
+    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(perm, intermediate_output, *output_tensor));
+  }
+
+  return Status::OK();
+}
+}  // namespace webgpu
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/math/softmax.h b/onnxruntime/core/providers/webgpu/math/softmax.h
new file mode 100644
index 0000000000000..b8bc37a0c03b6
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/math/softmax.h
@@ -0,0 +1,52 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/providers/webgpu/webgpu_supported_types.h"
+#include "core/providers/cpu/math/softmax.h"
+#include "core/providers/webgpu/webgpu_kernel.h"
+#include "core/providers/webgpu/program.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+class Softmax final : public WebGpuKernel {
+ public:
+  Softmax(const OpKernelInfo& info) : WebGpuKernel{info} {
+    int opset_ = info.node().SinceVersion();
+    size_t axis;
+    Status status = info.GetAttr<size_t>("axis", &axis);
+
+    if (status.IsOK()) {
+      axis_ = axis;
+    } else {
+      if (opset_ < 13) {
+        axis_ = 1;  // opset-12 and below, the default axis value is 1
+      } else {
+        axis_ = -1;  // opset-13, the default axis value is -1
+      }
+    }
+  }
+
+  Status ComputeInternal(ComputeContext& context) const override;
+
+ private:
+  size_t axis_;
+};
+
+class SoftmaxProgram final : public Program<SoftmaxProgram> {
+ public:
+  SoftmaxProgram(size_t axis, int wg) : Program{"Softmax"}, axis_{axis}, WG_{wg} {
+ }
+
+  Status GenerateShaderCode(ShaderHelper& sh) const override;
+
+  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"packedCols", ProgramUniformVariableDataType::Int32});
+
+ private:
+    int WG;
+};
+
+}  // namespace webgpu
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc
index c40ec43dd0009..062500055eeaf 100644
--- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc
@@ -97,6 +97,56 @@ Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const {
   return Status::OK();
 }
 
+Status Transpose::DoTranspose(const gsl::span<const size_t>& permutations, const Tensor& input, Tensor& output) {
+  const auto& input_shape = input.Shape();
+  int32_t rank = gsl::narrow_cast<int32_t>(input_shape.NumDimensions());
+
+
+  TensorShapeVector output_dims(rank);
+  InlinedVector<size_t> default_perm(rank);
+  const InlinedVector<size_t>* p_perm = nullptr;
+  ORT_RETURN_IF_ERROR(ComputeOutputShape(input, output_dims, default_perm, p_perm));
+  TensorShape output_shape(output_dims);
+
+  InlinedVector<int64_t> new_shape{};
+  InlinedVector<int64_t> new_perm{};
+  SqueezeShape(input_shape.GetDims(), *p_perm, new_shape, new_perm);
+  const bool channels_last = new_perm == InlinedVector<int64_t>({2, 3, 1});
+  const bool channels_first = new_perm == InlinedVector<int64_t>({3, 1, 2});
+  const bool use_shared = (new_shape.size() == 2 && new_perm[0] > new_perm[1]) || channels_last || channels_first;
+  auto new_input_shape = input_shape;
+
+  if (use_shared) {
+    new_input_shape = channels_last
+                          ? TensorShape({new_shape[0], new_shape[1] * new_shape[2]})
+                      : channels_first
+                          ? TensorShape({new_shape[0] * new_shape[1], new_shape[2]})
+                          : new_shape;
+    new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]});
+  }
+
+  uint32_t output_size = gsl::narrow_cast<int32_t>(input.Shape().Size());
+  TransposeProgram program{*p_perm, use_shared};
+  if (use_shared) {
+    program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1);
+  }
+
+  program
+      .CacheHint(absl::StrJoin(*p_perm, "-"))
+      .AddInputs({{*input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}})
+      .AddOutputs({{*output, ProgramTensorMetadataDependency::None, new_output_shape, 1}})
+      .SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
+                            static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)))
+      .AddUniformVariables({
+          {static_cast<uint32_t>(output_size)},
+      });
+
+  use_shared ? program.SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
+                                            static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)))
+             : program.SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE);
+  return context.RunProgram(program);
+}
+
 Status Transpose::ComputeInternal(ComputeContext& context) const {
   const auto* input_tensor = context.Input(0);
   const TensorShape& input_shape = input_tensor->Shape();
diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.h b/onnxruntime/core/providers/webgpu/tensor/transpose.h
index 7cf5c1fe0865d..3eb672d1c6e31 100644
--- a/onnxruntime/core/providers/webgpu/tensor/transpose.h
+++ b/onnxruntime/core/providers/webgpu/tensor/transpose.h
@@ -16,6 +16,8 @@ class Transpose final : public WebGpuKernel, public TransposeBase {
   Transpose(const OpKernelInfo& info) : WebGpuKernel{info}, TransposeBase{info} {
   }
   Status ComputeInternal(ComputeContext& context) const override;
+  static Status DoTranspose(const gsl::span<const size_t>& permutations, const Tensor& input, Tensor& output);
+
   constexpr static uint32_t TILE_SIZE = 16;
 };
 
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index dec7e48786bf5..04b5965177d1e 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -625,9 +625,9 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
       // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, float, ArgMin)>,
       // BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, float, ArgMin)>,
 
-      // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Softmax)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Softmax)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, Softmax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Softmax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Softmax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, Softmax)>,
 
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 3, Concat)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 4, 10, Concat)>,

From f9b61dbc6f9ae29a22ccfcd33228ffa9cea38ae2 Mon Sep 17 00:00:00 2001
From: vraspar <v2parikh@uwaterloo.ca>
Date: Fri, 24 Jan 2025 19:22:05 -0800
Subject: [PATCH 2/4] Refactor Softmax implementation for WebGPU

---
 .../core/providers/webgpu/math/softmax.cc     | 64 ++++++++++++++-----
 .../core/providers/webgpu/math/softmax.h      | 12 ++--
 .../core/providers/webgpu/shader_variable.h   |  8 ++-
 3 files changed, 60 insertions(+), 24 deletions(-)

diff --git a/onnxruntime/core/providers/webgpu/math/softmax.cc b/onnxruntime/core/providers/webgpu/math/softmax.cc
index 796c56f67c73d..4abefa704c689 100644
--- a/onnxruntime/core/providers/webgpu/math/softmax.cc
+++ b/onnxruntime/core/providers/webgpu/math/softmax.cc
@@ -2,13 +2,15 @@
 // Licensed under the MIT License.
 
 #include "core/common/inlined_containers.h"
-#include "core/providers/webgpu/tensor/softmax.h"
+#include "core/providers/webgpu/math/softmax.h"
 #include "core/providers/webgpu/tensor/transpose.h"
 #include "core/providers/cpu/tensor/utils.h"
 #include "core/providers/webgpu/shader_variable.h"
 #include "core/providers/webgpu/shader_helper.h"
 #include "core/providers/webgpu/webgpu_supported_types.h"
 
+#include "core/common/logging/logging.h"
+
 namespace onnxruntime {
 namespace webgpu {
 
@@ -45,6 +47,8 @@ static std::string MaxVector(std::string name, int components) {
       return name;
     case 2:
       return "max(" + name + ".x, " + name + ".y)";
+    case 3:
+      return "max(max(" + name + ".x, " + name + ".y), " + name + ".z)";
     case 4:
       return "max(max(" + name + ".x, " + name + ".y), max(" + name + ".z, " + name + ".w))";
     default:
@@ -76,13 +80,19 @@ static int GetMaxComponents(int64_t size) {
 
 Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
   // Add input and output variables
-  const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias);
-  const auto& output = shader.AddOutput("result", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias);
+  const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
+  shader.AddOutput("result", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias);
   int components = input.NumComponents();
 
-  std::string threadMaxDecl =  input.StorageType() == "f32" ?
-                                "val threadMax = x_value_t(-3.402823e+38f);\n" :
-                                "val threadMax = x_value_t(-65504.0h));\n";
+  LOGS_DEFAULT(VERBOSE) << "Input StorageType: " << input.StorageType() << "\n";
+  LOGS_DEFAULT(VERBOSE) << "Input ElementType: " << input.ElementType() << "\n";
+  LOGS_DEFAULT(VERBOSE) << "Input ValueType: " << input.ValueType() << "\n";
+
+
+
+  std::string threadMaxDecl =  input.ElementType() == "f32" ?
+                                "var threadMax = x_value_t(-3.402823e+38f);\n" :
+                                "var threadMax = x_value_t(-65504.0h);\n";
 
 
   // Define shared memory for row max and row sum
@@ -132,7 +142,7 @@ Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
       << "    workgroupBarrier();\n"
       << "  }\n"
       << "  if (lindex == 0) {\n"
-      << "    rowMaxShared = x_value_t(" <<  MaxVector('threadShared[0]', components)   << ");\n"
+      << "    rowMaxShared = x_value_t(" <<  MaxVector("threadShared[0]", components)   << ");\n"
       << "  }\n"
       << "  workgroupBarrier();\n"
 
@@ -174,9 +184,15 @@ Status Softmax::ComputeInternal(ComputeContext& context) const {
   auto* output_tensor = context.Output(0, input_shape);
 
   // normalize axis
-  int64_t axis = axis < 0 ? axis_ + input_rank : axis_;
+  int64_t  axis = axis_ < 0 ? axis_ + input_rank : axis_;
 
   bool is_transpose_required = axis < input_rank - 1;
+  LOGS_DEFAULT(VERBOSE) <<"axis_: " << axis_ << " axis: " << axis << "\n";
+  LOGS_DEFAULT(VERBOSE) << "Transpose required: " << (is_transpose_required ? "true" : "false") << "\n";
+  LOGS_DEFAULT(VERBOSE) << "Input shape: " << input_shape.ToString() << "\n";
+  LOGS_DEFAULT(VERBOSE) << "Output shape: " << output_tensor->Shape().ToString() << "\n";
+  LOGS_DEFAULT(VERBOSE) << "Input rank: " << input_rank << "\n";
+
   TensorShape transposed_input_shape = input_shape;
   Tensor transposed_input_tensor;
   Tensor intermediate_output;
@@ -184,25 +200,34 @@ Status Softmax::ComputeInternal(ComputeContext& context) const {
 
   if (is_transpose_required) {
     AllocatorPtr alloc;
-    perm.reserve(input_rank);
-    for (size_t i = 0; i < input_rank; ++i) {
+    perm.resize(input_rank);
+    for (size_t i = 0; i < perm.size(); ++i) {
       perm[i] = i;
     }
     perm[axis] = input_rank - 1;
     perm[input_rank - 1] = axis;
 
+    LOGS_DEFAULT(VERBOSE) << "Allocating temporary tensors for transpose\n";
+
     // allocate a temporary tensor to hold transposed input
     Tensor temp_input(input_tensor->DataType(), TensorShape(transposed_input_shape), alloc);
 
-    ORT_RETURN_IF_ERROR(Transpose::DoTranspose( perm, *input_tensor, temp_input));
+    LOGS_DEFAULT(VERBOSE) << "Performing transpose\n";
+
+    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(perm, *input_tensor, temp_input));
+
+    LOGS_DEFAULT(VERBOSE) << "Transpose done\n";
+
+    LOGS_DEFAULT(VERBOSE) << "Allocating memory for intermediate output\n";
     transposed_input_tensor = std::move(temp_input);
     transposed_input_shape = transposed_input_tensor.Shape();
 
+    LOGS_DEFAULT(VERBOSE) << "Transposed input shape: " << transposed_input_shape.ToString() << "\n";
+
     // Allocate memory for the intermediate output
+    LOGS_DEFAULT(VERBOSE) << "Allocating memory for intermediate output\n";
     Tensor temp_output(output_tensor->DataType(), TensorShape(transposed_input_shape), alloc);
     intermediate_output = std::move(temp_output);
-  } else {
-    transposed_input_tensor = *input_tensor;
   }
 
 
@@ -211,15 +236,24 @@ Status Softmax::ComputeInternal(ComputeContext& context) const {
   const size_t components = GetMaxComponents(cols);
   const auto packedCols = cols / components;
 
+  LOGS_DEFAULT(VERBOSE) << "Cols: " << cols << " Rows: " << rows << " Components: " << components << " PackedCols: " << packedCols << "\n";
+
   size_t WG = rows == 1 ? 256: 64;
 
   SoftmaxProgram program{WG};
+  if  (is_transpose_required) {
+    program
+        .AddInputs({{&transposed_input_tensor, ProgramTensorMetadataDependency::TypeAndRank,  static_cast<int>(components)}})
+        .AddOutputs({{&intermediate_output, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(components)}});
+  } else {
+    program
+        .AddInputs({{input_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(components)}})
+        .AddOutputs({{output_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(components)}});
+  }
 
 
   program
       .CacheHint(std::to_string(components), std::to_string(WG))
-      .AddInputs({*transposed_input_tensor, ProgramTensorMetadataDependency::TypeAndRank}})
-      .AddOutputs({ is_transpose_required ? *intermediate_output : output_tensor})
       .SetWorkgroupSize(WG)
       .SetDispatchGroupSize(rows)
       .AddUniformVariables({
diff --git a/onnxruntime/core/providers/webgpu/math/softmax.h b/onnxruntime/core/providers/webgpu/math/softmax.h
index b8bc37a0c03b6..b67425471da9a 100644
--- a/onnxruntime/core/providers/webgpu/math/softmax.h
+++ b/onnxruntime/core/providers/webgpu/math/softmax.h
@@ -4,9 +4,9 @@
 #pragma once
 
 #include "core/providers/webgpu/webgpu_supported_types.h"
-#include "core/providers/cpu/math/softmax.h"
 #include "core/providers/webgpu/webgpu_kernel.h"
 #include "core/providers/webgpu/program.h"
+#include "core/framework/op_kernel.h"
 
 namespace onnxruntime {
 namespace webgpu {
@@ -15,8 +15,8 @@ class Softmax final : public WebGpuKernel {
  public:
   Softmax(const OpKernelInfo& info) : WebGpuKernel{info} {
     int opset_ = info.node().SinceVersion();
-    size_t axis;
-    Status status = info.GetAttr<size_t>("axis", &axis);
+    int64_t axis;
+    Status status = info.GetAttr<int64_t>("axis", &axis);
 
     if (status.IsOK()) {
       axis_ = axis;
@@ -32,12 +32,12 @@ class Softmax final : public WebGpuKernel {
   Status ComputeInternal(ComputeContext& context) const override;
 
  private:
-  size_t axis_;
+  int64_t axis_;
 };
 
 class SoftmaxProgram final : public Program<SoftmaxProgram> {
  public:
-  SoftmaxProgram(size_t axis, int wg) : Program{"Softmax"}, axis_{axis}, WG_{wg} {
+  SoftmaxProgram(size_t wg) : Program{"Softmax"}, WG{wg} {
  }
 
   Status GenerateShaderCode(ShaderHelper& sh) const override;
@@ -45,7 +45,7 @@ class SoftmaxProgram final : public Program<SoftmaxProgram> {
   WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"packedCols", ProgramUniformVariableDataType::Int32});
 
  private:
-    int WG;
+    size_t WG;
 };
 
 }  // namespace webgpu
diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h
index 4c87bc9158890..3b8ed7bf42b55 100644
--- a/onnxruntime/core/providers/webgpu/shader_variable.h
+++ b/onnxruntime/core/providers/webgpu/shader_variable.h
@@ -176,6 +176,10 @@ class ShaderVariableHelper : public ShaderIndicesHelper {
   template <typename TOffset>
   inline std::string GetByOffset(TOffset&& offset) const;
 
+  std::string_view StorageType() const;
+  std::string_view ValueType() const;
+  std::string_view ElementType() const;
+
  private:
   ORT_DISALLOW_COPY_AND_ASSIGNMENT(ShaderVariableHelper);
 
@@ -183,9 +187,7 @@ class ShaderVariableHelper : public ShaderIndicesHelper {
 
   std::string GetByOffsetImpl(std::string_view offset) const;
   std::string SetByOffsetImpl(std::string_view offset, std::string_view value) const;
-  std::string_view StorageType() const;
-  std::string_view ValueType() const;
-  std::string_view ElementType() const;
+
 
   friend class ShaderHelper;
 };

From 87de60730a1fca9aa3ec021b20d1de0262e64d57 Mon Sep 17 00:00:00 2001
From: vraspar <v2parikh@uwaterloo.ca>
Date: Wed, 29 Jan 2025 16:33:20 -0800
Subject: [PATCH 3/4] Refactor Softmax and remove debug logs

---
 .../core/providers/webgpu/math/softmax.cc     | 92 +++++--------------
 .../core/providers/webgpu/math/softmax.h      |  6 +-
 .../core/providers/webgpu/tensor/transpose.cc | 27 +++---
 .../core/providers/webgpu/tensor/transpose.h  |  2 +-
 .../test/providers/cpu/math/softmax_test.cc   | 13 +--
 5 files changed, 50 insertions(+), 90 deletions(-)

diff --git a/onnxruntime/core/providers/webgpu/math/softmax.cc b/onnxruntime/core/providers/webgpu/math/softmax.cc
index 4abefa704c689..1760acae95f39 100644
--- a/onnxruntime/core/providers/webgpu/math/softmax.cc
+++ b/onnxruntime/core/providers/webgpu/math/softmax.cc
@@ -8,9 +8,6 @@
 #include "core/providers/webgpu/shader_variable.h"
 #include "core/providers/webgpu/shader_helper.h"
 #include "core/providers/webgpu/webgpu_supported_types.h"
-
-#include "core/common/logging/logging.h"
-
 namespace onnxruntime {
 namespace webgpu {
 
@@ -84,16 +81,7 @@ Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
   shader.AddOutput("result", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias);
   int components = input.NumComponents();
 
-  LOGS_DEFAULT(VERBOSE) << "Input StorageType: " << input.StorageType() << "\n";
-  LOGS_DEFAULT(VERBOSE) << "Input ElementType: " << input.ElementType() << "\n";
-  LOGS_DEFAULT(VERBOSE) << "Input ValueType: " << input.ValueType() << "\n";
-
-
-
-  std::string threadMaxDecl =  input.ElementType() == "f32" ?
-                                "var threadMax = x_value_t(-3.402823e+38f);\n" :
-                                "var threadMax = x_value_t(-65504.0h);\n";
-
+  std::string threadMaxDecl = input.ElementType() == "f32" ? "var threadMax = x_value_t(-3.402823e+38f);\n" : "var threadMax = x_value_t(-65504.0h);\n";
 
   // Define shared memory for row max and row sum
   shader.AdditionalImplementation()
@@ -142,7 +130,7 @@ Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
       << "    workgroupBarrier();\n"
       << "  }\n"
       << "  if (lindex == 0) {\n"
-      << "    rowMaxShared = x_value_t(" <<  MaxVector("threadShared[0]", components)   << ");\n"
+      << "    rowMaxShared = x_value_t(" << MaxVector("threadShared[0]", components) << ");\n"
       << "  }\n"
       << "  workgroupBarrier();\n"
 
@@ -163,7 +151,7 @@ Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
       << "    workgroupBarrier();\n"
       << "  }\n"
       << "  if (lindex == 0) {\n"
-      << "    rowSumShared = x_value_t(" << SumVector("threadShared[0]", components)  << ");\n"
+      << "    rowSumShared = x_value_t(" << SumVector("threadShared[0]", components) << ");\n"
       << "  }\n"
       << "  workgroupBarrier();\n"
 
@@ -179,71 +167,44 @@ Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
 Status Softmax::ComputeInternal(ComputeContext& context) const {
   const auto* input_tensor = context.Input(0);
   const TensorShape& input_shape = input_tensor->Shape();
-  size_t input_rank = input_shape.NumDimensions();
-
+  int64_t input_rank = input_shape.NumDimensions();
   auto* output_tensor = context.Output(0, input_shape);
 
   // normalize axis
-  int64_t  axis = axis_ < 0 ? axis_ + input_rank : axis_;
-
+  int64_t axis = axis_ < 0 ? axis_ + input_rank : axis_;
   bool is_transpose_required = axis < input_rank - 1;
-  LOGS_DEFAULT(VERBOSE) <<"axis_: " << axis_ << " axis: " << axis << "\n";
-  LOGS_DEFAULT(VERBOSE) << "Transpose required: " << (is_transpose_required ? "true" : "false") << "\n";
-  LOGS_DEFAULT(VERBOSE) << "Input shape: " << input_shape.ToString() << "\n";
-  LOGS_DEFAULT(VERBOSE) << "Output shape: " << output_tensor->Shape().ToString() << "\n";
-  LOGS_DEFAULT(VERBOSE) << "Input rank: " << input_rank << "\n";
 
-  TensorShape transposed_input_shape = input_shape;
+  TensorShape transposed_input_shape;
   Tensor transposed_input_tensor;
   Tensor intermediate_output;
-  InlinedVector<size_t> perm;
+  InlinedVector<size_t> perm(input_rank);
 
   if (is_transpose_required) {
-    AllocatorPtr alloc;
-    perm.resize(input_rank);
-    for (size_t i = 0; i < perm.size(); ++i) {
-      perm[i] = i;
-    }
+    std::iota(std::begin(perm), std::end(perm), 0);
     perm[axis] = input_rank - 1;
     perm[input_rank - 1] = axis;
 
-    LOGS_DEFAULT(VERBOSE) << "Allocating temporary tensors for transpose\n";
-
-    // allocate a temporary tensor to hold transposed input
-    Tensor temp_input(input_tensor->DataType(), TensorShape(transposed_input_shape), alloc);
-
-    LOGS_DEFAULT(VERBOSE) << "Performing transpose\n";
-
-    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(perm, *input_tensor, temp_input));
-
-    LOGS_DEFAULT(VERBOSE) << "Transpose done\n";
-
-    LOGS_DEFAULT(VERBOSE) << "Allocating memory for intermediate output\n";
-    transposed_input_tensor = std::move(temp_input);
-    transposed_input_shape = transposed_input_tensor.Shape();
-
-    LOGS_DEFAULT(VERBOSE) << "Transposed input shape: " << transposed_input_shape.ToString() << "\n";
+    std::vector<int64_t> transposed_input_dims;
+    for (auto e : perm) {
+      transposed_input_dims.push_back(input_shape[e]);
+    }
 
-    // Allocate memory for the intermediate output
-    LOGS_DEFAULT(VERBOSE) << "Allocating memory for intermediate output\n";
-    Tensor temp_output(output_tensor->DataType(), TensorShape(transposed_input_shape), alloc);
-    intermediate_output = std::move(temp_output);
+    transposed_input_shape = TensorShape(transposed_input_dims);
+    transposed_input_tensor = context.CreateGPUTensor(input_tensor->DataType(), transposed_input_shape);
+    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(context, perm, *input_tensor, transposed_input_tensor));
+    intermediate_output = context.CreateGPUTensor(output_tensor->DataType(), transposed_input_shape);
   }
 
-
-  const size_t cols = transposed_input_shape[input_rank - 1];
-  const size_t rows = input_shape.Size() / cols;
-  const size_t components = GetMaxComponents(cols);
+  const int64_t cols = is_transpose_required ? transposed_input_shape[input_rank - 1] : input_shape[input_rank - 1];
+  const int64_t rows = input_shape.Size() / cols;
+  const int64_t components = GetMaxComponents(cols);
   const auto packedCols = cols / components;
-
-  LOGS_DEFAULT(VERBOSE) << "Cols: " << cols << " Rows: " << rows << " Components: " << components << " PackedCols: " << packedCols << "\n";
-
-  size_t WG = rows == 1 ? 256: 64;
+  uint32_t WG = rows == 1 ? 256 : 64;
 
   SoftmaxProgram program{WG};
-  if  (is_transpose_required) {
+  if (is_transpose_required) {
     program
-        .AddInputs({{&transposed_input_tensor, ProgramTensorMetadataDependency::TypeAndRank,  static_cast<int>(components)}})
+        .AddInputs({{&transposed_input_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(components)}})
         .AddOutputs({{&intermediate_output, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(components)}});
   } else {
     program
@@ -251,22 +212,17 @@ Status Softmax::ComputeInternal(ComputeContext& context) const {
         .AddOutputs({{output_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(components)}});
   }
 
-
   program
       .CacheHint(std::to_string(components), std::to_string(WG))
       .SetWorkgroupSize(WG)
       .SetDispatchGroupSize(rows)
-      .AddUniformVariables({
-        {static_cast<int32_t>(packedCols)}
-      });
-
+      .AddUniformVariables({{static_cast<int32_t>(packedCols)}});
 
   ORT_RETURN_IF_ERROR(context.RunProgram(program));
 
   // If transpose was required, transpose the result back
   if (is_transpose_required) {
-    Tensor transposed_output_tensor;
-    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(perm, intermediate_output, *output_tensor));
+    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(context, perm, intermediate_output, *output_tensor));
   }
 
   return Status::OK();
diff --git a/onnxruntime/core/providers/webgpu/math/softmax.h b/onnxruntime/core/providers/webgpu/math/softmax.h
index b67425471da9a..5eb6bd0ccdb15 100644
--- a/onnxruntime/core/providers/webgpu/math/softmax.h
+++ b/onnxruntime/core/providers/webgpu/math/softmax.h
@@ -37,15 +37,15 @@ class Softmax final : public WebGpuKernel {
 
 class SoftmaxProgram final : public Program<SoftmaxProgram> {
  public:
-  SoftmaxProgram(size_t wg) : Program{"Softmax"}, WG{wg} {
- }
+  SoftmaxProgram(uint32_t wg) : Program{"Softmax"}, WG{wg} {
+  }
 
   Status GenerateShaderCode(ShaderHelper& sh) const override;
 
   WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"packedCols", ProgramUniformVariableDataType::Int32});
 
  private:
-    size_t WG;
+  uint32_t WG;
 };
 
 }  // namespace webgpu
diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc
index 062500055eeaf..c06a742239cdc 100644
--- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc
@@ -97,24 +97,27 @@ Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const {
   return Status::OK();
 }
 
-Status Transpose::DoTranspose(const gsl::span<const size_t>& permutations, const Tensor& input, Tensor& output) {
+Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context, const gsl::span<const size_t>& permutations, const Tensor& input, Tensor& output) {
   const auto& input_shape = input.Shape();
+  const auto& input_dims = input_shape.GetDims();
   int32_t rank = gsl::narrow_cast<int32_t>(input_shape.NumDimensions());
 
-
   TensorShapeVector output_dims(rank);
-  InlinedVector<size_t> default_perm(rank);
-  const InlinedVector<size_t>* p_perm = nullptr;
-  ORT_RETURN_IF_ERROR(ComputeOutputShape(input, output_dims, default_perm, p_perm));
+
+  for (int32_t i = 0; i < rank; i++) {
+    output_dims[i] = input_dims[permutations[i]];
+  }
+
   TensorShape output_shape(output_dims);
 
   InlinedVector<int64_t> new_shape{};
   InlinedVector<int64_t> new_perm{};
-  SqueezeShape(input_shape.GetDims(), *p_perm, new_shape, new_perm);
+  SqueezeShape(input_shape.GetDims(), permutations, new_shape, new_perm);
   const bool channels_last = new_perm == InlinedVector<int64_t>({2, 3, 1});
   const bool channels_first = new_perm == InlinedVector<int64_t>({3, 1, 2});
   const bool use_shared = (new_shape.size() == 2 && new_perm[0] > new_perm[1]) || channels_last || channels_first;
   auto new_input_shape = input_shape;
+  TensorShape new_output_shape(output_dims);
 
   if (use_shared) {
     new_input_shape = channels_last
@@ -125,16 +128,16 @@ Status Transpose::DoTranspose(const gsl::span<const size_t>& permutations, const
     new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]});
   }
 
-  uint32_t output_size = gsl::narrow_cast<int32_t>(input.Shape().Size());
-  TransposeProgram program{*p_perm, use_shared};
+  uint32_t output_size = gsl::narrow_cast<int32_t>(input_shape.Size());
+  TransposeProgram program{permutations, use_shared};
+
   if (use_shared) {
     program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1);
   }
-
   program
-      .CacheHint(absl::StrJoin(*p_perm, "-"))
-      .AddInputs({{*input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}})
-      .AddOutputs({{*output, ProgramTensorMetadataDependency::None, new_output_shape, 1}})
+      .CacheHint(absl::StrJoin(permutations, "-"))
+      .AddInputs({{&input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}})
+      .AddOutputs({{&output, ProgramTensorMetadataDependency::None, new_output_shape, 1}})
       .SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
                             static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)))
       .AddUniformVariables({
diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.h b/onnxruntime/core/providers/webgpu/tensor/transpose.h
index 3eb672d1c6e31..81706dde33cc6 100644
--- a/onnxruntime/core/providers/webgpu/tensor/transpose.h
+++ b/onnxruntime/core/providers/webgpu/tensor/transpose.h
@@ -16,7 +16,7 @@ class Transpose final : public WebGpuKernel, public TransposeBase {
   Transpose(const OpKernelInfo& info) : WebGpuKernel{info}, TransposeBase{info} {
   }
   Status ComputeInternal(ComputeContext& context) const override;
-  static Status DoTranspose(const gsl::span<const size_t>& permutations, const Tensor& input, Tensor& output);
+  static Status DoTranspose(onnxruntime::webgpu::ComputeContext& context, const gsl::span<const size_t>& permutations, const Tensor& input, Tensor& output);
 
   constexpr static uint32_t TILE_SIZE = 16;
 };
diff --git a/onnxruntime/test/providers/cpu/math/softmax_test.cc b/onnxruntime/test/providers/cpu/math/softmax_test.cc
index 6f7930f722564..3808d62a10e56 100644
--- a/onnxruntime/test/providers/cpu/math/softmax_test.cc
+++ b/onnxruntime/test/providers/cpu/math/softmax_test.cc
@@ -170,11 +170,11 @@ TEST(SoftmaxOperator, ThreeAndFourDimsAxis0) {
 
   RunTest(input_vals_60, expected_vals, three_dimensions, /*opset*/ 7, /*axis*/ 0,
           // axis=0 is not supported by TensorRT
-          {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider});
+          {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider});
 
   RunTest(input_vals_60, expected_vals, four_dimensions, /*opset*/ 7, /*axis*/ 0,
           // axis=0 is not supported by TensorRT
-          {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider});
+          {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider});
 }
 
 TEST(SoftmaxOperator, ThreeAndFourDimsSecondLastAxis) {
@@ -201,10 +201,10 @@ TEST(SoftmaxOperator, ThreeAndFourDimsSecondLastAxis) {
       0.040478885f, 0.033857856f, 0.080346674f, 0.06199841f, 0.040481992f};
 
   RunTest(input_vals_60, expected_vals, three_dimensions, /*opset*/ 7, /*axis*/ 1,
-          {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider});
+          {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider});
 
   RunTest(input_vals_60, expected_vals, four_dimensions, /*opset*/ 7, /*axis*/ 2,
-          {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider});
+          {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider});
 }
 
 TEST(SoftmaxOperator, ThreeAndFourDimsSecondLastAxis_opset13) {
@@ -376,8 +376,9 @@ TEST(SoftmaxOperator, DimWithZero) {
 
   RunTest(x_vals, expected_vals, dimensions, /*opset*/ -1, /*axis*/ 0,
           {kTensorrtExecutionProvider,
-           kNnapiExecutionProvider,  // NNAPI softmax does not support empty input
-           kQnnExecutionProvider}    // QNN doesn't support dim 0
+           kNnapiExecutionProvider,   // NNAPI softmax does not support empty input
+           kWebGpuExecutionProvider,  // WebGPU does not support dim 0
+           kQnnExecutionProvider}     // QNN doesn't support dim 0
   );
 }
 

From 2d8b47de2729b30442b7659524fddc532a7a3a99 Mon Sep 17 00:00:00 2001
From: vraspar <vrajang@outlook.com>
Date: Thu, 30 Jan 2025 11:49:31 -0800
Subject: [PATCH 4/4] fix linting error

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 onnxruntime/core/providers/webgpu/shader_variable.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h
index 3b8ed7bf42b55..12ded754de55c 100644
--- a/onnxruntime/core/providers/webgpu/shader_variable.h
+++ b/onnxruntime/core/providers/webgpu/shader_variable.h
@@ -188,7 +188,6 @@ class ShaderVariableHelper : public ShaderIndicesHelper {
   std::string GetByOffsetImpl(std::string_view offset) const;
   std::string SetByOffsetImpl(std::string_view offset, std::string_view value) const;
 
-
   friend class ShaderHelper;
 };