From 88d7f99c1f8f47eea2ccc1504928467e84ee4e8e Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Thu, 24 Oct 2024 11:35:19 +0000 Subject: [PATCH] 2024-10-24 nightly release (2553788a5dca2970c0394fa88223f51183a28279) --- 1.txt | 676 ------------------ backends/arm/operators/op_permute.py | 77 +- backends/arm/test/ops/test_hardtanh.py | 125 ++++ backends/arm/test/ops/test_permute.py | 152 ++++ .../runtime/graph/ops/glsl/q_8w_linear.glsl | 14 +- build/build_android_llm_demo.sh | 12 +- examples/models/llama/llama_transformer.py | 2 +- .../quantized_kv_cache.py | 4 +- .../test_quantized_kv_cache.py | 8 +- runtime/core/exec_aten/exec_aten.h | 4 +- 10 files changed, 375 insertions(+), 699 deletions(-) delete mode 100644 1.txt create mode 100644 backends/arm/test/ops/test_hardtanh.py create mode 100644 backends/arm/test/ops/test_permute.py diff --git a/1.txt b/1.txt deleted file mode 100644 index 96745dd0af..0000000000 --- a/1.txt +++ /dev/null @@ -1,676 +0,0 @@ -diff --git a/backends/cadence/cadence.cmake b/backends/cadence/cadence.cmake -index cb6a2531..0fa55c6a 100644 ---- a/backends/cadence/cadence.cmake -+++ b/backends/cadence/cadence.cmake -@@ -44,7 +44,7 @@ set(CMAKE_CXX_COMPILER ${TOOLCHAIN_HOME}/bin/${CROSS_COMPILE_TARGET}-clang++) - set(CMAKE_C_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls") - set(CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls") - #workaround for larger compilation time --SET(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fno-strict-aliasing") -+set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fno-strict-aliasing") - - set(CMAKE_SYSROOT ${TOOLCHAIN_HOME}/${SYSROOT_TARGET}) - set(CMAKE_LINKER ${TOOLCHAIN_HOME}/bin/xt-ld) -diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h -index 8faf0671..a08144d9 100644 ---- a/backends/cadence/hifi/kernels/kernels.h -+++ b/backends/cadence/hifi/kernels/kernels.h -@@ -16,21 +16,24 @@ - #include "xa_nnlib_kernels_api.h" - - /* Potential NNLIB function/APIs */ --extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, -+extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( -+ FLOAT32 * __restrict__ p_out, - const WORD32 *const p_out_shape, - const FLOAT32 * __restrict__ p_inp1, - const WORD32 *const p_inp1_shape, - const FLOAT32 * __restrict__ p_inp2, - const WORD32 *const p_inp2_shape); - --extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, -+extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32( -+ FLOAT32 * __restrict__ p_out, - const WORD32 *const p_out_shape, - const FLOAT32 * __restrict__ p_inp1, - const WORD32 *const p_inp1_shape, - const FLOAT32 * __restrict__ p_inp2, - const WORD32 *const p_inp2_shape); - --extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(FLOAT32 * __restrict__ p_out, -+extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32( -+ FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, -@@ -45,7 +48,8 @@ extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( - const WORD32 *const p_inp2_shape, - WORD32 mode); - --extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, -+extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( -+ FLOAT32 * __restrict__ p_out, - const WORD32 *const p_out_shape, - const FLOAT32 * __restrict__ p_inp1, - const WORD32 *const p_inp1_shape, -diff --git a/backends/cadence/hifi/operators/op_add.cpp 
b/backends/cadence/hifi/operators/op_add.cpp -index 883cc74d..56adab71 100644 ---- a/backends/cadence/hifi/operators/op_add.cpp -+++ b/backends/cadence/hifi/operators/op_add.cpp -@@ -6,13 +6,13 @@ - * LICENSE file in the root directory of this source tree. - */ - -+#include - #include - #include - #include - #include - #include - #include --#include - - using exec_aten::Scalar; - using exec_aten::ScalarType; -@@ -23,7 +23,7 @@ using executorch::runtime::KernelRuntimeContext; - using torch::executor::Error; - - namespace impl { --namespace HiFi { -+namespace HiFi { - namespace native { - - namespace { -@@ -97,14 +97,15 @@ Tensor& add_out( - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); -- ScalarType alpha_type = torch::executor::native::utils::get_scalar_dtype(alpha); -+ ScalarType alpha_type = -+ torch::executor::native::utils::get_scalar_dtype(alpha); - ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); - ScalarType out_type = out.scalar_type(); - - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - ET_KERNEL_CHECK( - ctx, check_alpha_type(alpha_type, common_type), InvalidArgument, out); -- -+ - float alpha_val; - torch::executor::native::utils::extract_scalar(alpha, &alpha_val); - -@@ -119,30 +120,28 @@ Tensor& add_out( - const bool broadcast = (a_is_broadcasted || b_is_broadcasted); - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? out.dim() : max_dim; -- -- if((out_type != ScalarType::Float) || (alpha_val != 1.0)) -+ -+ if ((out_type != ScalarType::Float) || (alpha_val != 1.0)) - optimized = 0; -- -- if((a_dim == 0) || (b_dim == 0) ) -+ -+ if ((a_dim == 0) || (b_dim == 0) ) - optimized = 0; - -- if((broadcast == 1) && (max_dim > kNnlibMaxDim)) -+ if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - - -- if(optimized) -- { -+ if (optimized) { - const float* const a_data = a.const_data_ptr(); - const float* const b_data = b.const_data_ptr(); - float* const out_data = out.mutable_data_ptr(); -- if(broadcast == 1) -- { -+ -+ if(broadcast == 1) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - -- for(int i = 0; i < kNnlibMaxDim; i++) -- { -+ for (int i = 0; i < kNnlibMaxDim; i++) { - out_shape[i] = 1; - inp1_shape[i] = 1; - inp2_shape[i] = 1; -@@ -152,15 +151,15 @@ Tensor& add_out( - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); - -- for(int i = 0; i < out.dim(); i++) -+ for (int i = 0; i < out.dim(); i++) - out_shape[i+off_o] = out.size(i); -- for(int i = 0; i < a.dim(); i++) -+ for (int i = 0; i < a.dim(); i++) - inp1_shape[i+off_a] = a.size(i); -- for(int i = 0; i < b.dim(); i++) -+ for (int i = 0; i < b.dim(); i++) - inp2_shape[i+off_b] = b.size(i); - -- xa_nn_elm_add_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, -- b_data, inp2_shape); -+ xa_nn_elm_add_broadcast_4D_f32xf32_f32( -+ out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); - } - else - { -@@ -193,6 +192,6 @@ Tensor& add_out( - } - - --} // namespace impl --} // namespace HiFi - } // namespace native -+} // namespace HiFi -+} // namespace impl -diff --git a/backends/cadence/hifi/operators/op_div.cpp b/backends/cadence/hifi/operators/op_div.cpp -index 41220e5d..e887e8b5 100644 ---- a/backends/cadence/hifi/operators/op_div.cpp -+++ b/backends/cadence/hifi/operators/op_div.cpp -@@ -6,6 +6,7 @@ - * LICENSE file in the root directory of this source tree. 
- */ - -+#include - #include - #include - #include -@@ -13,7 +14,6 @@ - #include - #include - #include --#include - - using exec_aten::Scalar; - using exec_aten::ScalarType; -@@ -22,7 +22,7 @@ using executorch::aten::RuntimeContext; - using torch::executor::Error; - - namespace impl { --namespace HiFi { -+namespace HiFi { - namespace native { - - namespace { -@@ -74,29 +74,27 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? out.dim() : max_dim; - -- if((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) -+ if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) - optimized = 0; - -- if((a_dim == 0) || (b_dim == 0) ) -+ if ((a_dim == 0) || (b_dim == 0) ) - optimized = 0; - -- if((broadcast == 1) && (max_dim > kNnlibMaxDim)) -+ if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - -- if(optimized) -- { -+ if (optimized) { - float* a_data = a.mutable_data_ptr(); - float* b_data = b.mutable_data_ptr(); - float* out_data = out.mutable_data_ptr(); - -- if(broadcast == 1) -- { -+ if (broadcast == 1) { - - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - -- for(int i = 0; i < kNnlibMaxDim; i++) -+ for (int i = 0; i < kNnlibMaxDim; i++) - { - out_shape[i] = 1; - inp1_shape[i] = 1; -@@ -106,34 +104,35 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - int off_o = kNnlibMaxDim - out.dim(); - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); -- for(int i = 0; i < out.dim(); i++) -+ for (int i = 0; i < out.dim(); i++) - out_shape[i+off_o] = out.size(i); -- for(int i = 0; i < a.dim(); i++) -+ for (int i = 0; i < a.dim(); i++) - inp1_shape[i+off_a] = a.size(i); -- for(int i = 0; i < b.dim(); i++) -+ for (int i = 0; i < b.dim(); i++) - inp2_shape[i+off_b] = b.size(i); - -- xa_nn_elm_div_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); -+ xa_nn_elm_div_broadcast_4D_f32xf32_f32( -+ out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); - } - else - { -- - xa_nn_elm_div_f32xf32_f32(out_data, a_data, b_data, out.numel()); - } -- -+ - return out; - } -- -+ - ScalarType common_type = get_compute_type(a_type, b_type); - ScalarType out_type = out.scalar_type(); -- -+ - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); -- -+ - ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() { - ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() { - ET_SWITCH_FLOAT_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() { -- torch::executor::apply_binary_elementwise_fn( -+ torch::executor:: -+ apply_binary_elementwise_fn( - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); -@@ -188,13 +187,13 @@ Tensor& div_out_mode( - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; - -- if((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) -+ if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) - optimized = 0; - -- if((a_dim == 0) || (b_dim == 0)) -+ if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; - -- if((broadcast == 1) && (max_dim > kNnlibMaxDim)) -+ if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - int mode_val = -1; - if (mode.has_value() && mode.value() == "trunc") -@@ -204,20 +203,17 @@ Tensor& div_out_mode( - else - optimized = 0; - -- if(optimized) -- { -+ if (optimized) { - float* a_data = a.mutable_data_ptr(); - float* b_data = b.mutable_data_ptr(); - float* out_data = out.mutable_data_ptr(); - -- if(broadcast) -- { -+ if (broadcast) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - -- for(int i = 0; i < kNnlibMaxDim; i++) -- { -+ for (int i = 0; i < kNnlibMaxDim; i++) { - inp1_shape[i] = 1; - inp2_shape[i] = 1; - out_shape[i] = 1; -@@ -227,18 +223,20 @@ Tensor& div_out_mode( - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); - -- for(int i = 0; i < out.dim(); i++) -+ for (int i = 0; i < out.dim(); i++) - out_shape[i+off_o] = out.size(i); -- for(int i = 0; i < a.dim(); i++) -+ for (int i = 0; i < a.dim(); i++) - inp1_shape[i+off_a] = a.size(i); -- for(int i = 0; i < b.dim(); i++) -+ for (int i = 0; i < b.dim(); i++) - inp2_shape[i+off_b] = b.size(i); - -- xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape, mode_val); -+ xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( -+ out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape, mode_val); - } - else - { -- xa_nn_elm_div_mode_f32xf32_f32(out_data, a_data, b_data, out.numel(), mode_val); -+ xa_nn_elm_div_mode_f32xf32_f32( -+ out_data, a_data, b_data, out.numel(), mode_val); - } - - return out; -@@ -248,7 +246,8 @@ Tensor& div_out_mode( - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out_mode", CTYPE_B, [&]() { - ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out_mode", CTYPE_IN, [&]() { - ET_SWITCH_REAL_TYPES(out_type, ctx, "div.out_mode", CTYPE_OUT, [&]() { -- torch::executor::apply_binary_elementwise_fn( -+ torch::executor:: -+ apply_binary_elementwise_fn( - [mode](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); -@@ -272,6 +271,6 @@ Tensor& div_out_mode( - } - - --} // namespace impl --} // namespace HiFi - } // namespace native -+} // namespace HiFi -+} // namespace impl -diff --git a/backends/cadence/hifi/operators/op_mul.cpp b/backends/cadence/hifi/operators/op_mul.cpp -index 9200d980..1b2e62cd 100644 ---- a/backends/cadence/hifi/operators/op_mul.cpp -+++ b/backends/cadence/hifi/operators/op_mul.cpp -@@ -6,12 +6,12 @@ - * LICENSE file in the root directory of this source tree. 
- */ - -+#include - #include - #include - #include - #include - #include --#include - - using exec_aten::Scalar; - using exec_aten::ScalarType; -@@ -86,7 +86,7 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); - ScalarType out_type = out.scalar_type(); - constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ -- -+ - int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); - bool optimized = 1; - /*find broadcast*/ -@@ -97,28 +97,25 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - max_dim = out.dim() > max_dim ? out.dim() : max_dim; - - -- if((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) -+ if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) - optimized = 0; - -- if( (a_dim == 0) || (b_dim == 0) ) -+ if ((a_dim == 0) || (b_dim == 0) ) - optimized = 0; - -- if((broadcast == 1) && (max_dim > kNnlibMaxDim)) -+ if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - -- if(optimized) -- { -+ if (optimized) { - float* a_data = a.mutable_data_ptr(); - float* b_data = b.mutable_data_ptr(); - float* out_data = out.mutable_data_ptr(); - -- if(broadcast == 1) -- { -+ if (broadcast == 1) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; -- for(int i = 0; i < kNnlibMaxDim; i++) -- { -+ for (int i = 0; i < kNnlibMaxDim; i++) { - out_shape[i] = 1; - inp1_shape[i] = 1; - inp2_shape[i] = 1; -@@ -126,14 +123,15 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - int off_o = kNnlibMaxDim - out.dim(); - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); -- for(int i = 0; i < out.dim(); i++){ -- out_shape[i+off_o] = out.size(i);} -- for(int i = 0; i < a.dim(); i++) -+ for (int i = 0; i < out.dim(); i++) -+ out_shape[i+off_o] = out.size(i); -+ for (int i = 0; i < a.dim(); i++) - inp1_shape[i+off_a] = a.size(i); -- for(int i = 0; i < b.dim(); i++) -+ for (int i = 0; i < b.dim(); i++) - inp2_shape[i+off_b] = b.size(i); - -- xa_nn_elm_mul_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); -+ xa_nn_elm_mul_broadcast_4D_f32xf32_f32( -+ out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); - } - else - { -@@ -154,7 +152,7 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - CTYPE_A, - CTYPE_B, - CTYPE_IN, -- CTYPE_OUT>::run(a, b, out); -+ CTYPE_OUT>::run(a, b, out); - }); - }); - }); -@@ -162,6 +160,6 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - return out; - } - --} // namespace impl --} // namespace HiFi - } // namespace native -+} // namespace HiFi -+} // namespace impl -diff --git a/backends/cadence/hifi/operators/op_sigmoid.cpp b/backends/cadence/hifi/operators/op_sigmoid.cpp -index fa408d4b..1ed89880 100644 ---- a/backends/cadence/hifi/operators/op_sigmoid.cpp -+++ b/backends/cadence/hifi/operators/op_sigmoid.cpp -@@ -8,9 +8,9 @@ - - #include - -+#include - #include - #include --#include - - using exec_aten::ScalarType; - using exec_aten::Tensor; -@@ -18,7 +18,7 @@ using executorch::aten::RuntimeContext; - using torch::executor::Error; - - namespace impl { --namespace HiFi { -+namespace HiFi { - namespace native { - - using Tensor = exec_aten::Tensor; -@@ -40,13 +40,12 @@ Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - - ScalarType in_type = in.scalar_type(); - 
ScalarType out_type = out.scalar_type(); -- -+ - bool optimized = 1; -- if((in_type != ScalarType::Float) || (out_type != ScalarType::Float)) -+ if ((in_type != ScalarType::Float) || (out_type != ScalarType::Float)) - optimized = 0; - -- if(optimized) -- { -+ if (optimized) { - float* data_in = in.mutable_data_ptr(); - float* data_out = out.mutable_data_ptr(); - xa_nn_vec_sigmoid_f32_f32(data_out, data_in, in.numel()); -diff --git a/backends/cadence/hifi/operators/op_sub.cpp b/backends/cadence/hifi/operators/op_sub.cpp -index b9f35caf..d9958bf8 100644 ---- a/backends/cadence/hifi/operators/op_sub.cpp -+++ b/backends/cadence/hifi/operators/op_sub.cpp -@@ -6,25 +6,25 @@ - * LICENSE file in the root directory of this source tree. - */ - -+#include - #include - #include - #include - #include - #include - #include --#include - - using exec_aten::Scalar; - using exec_aten::ScalarType; - using exec_aten::Tensor; -+using executorch::aten::RuntimeContext; - using executorch::runtime::can_cast; - using executorch::runtime::CppTypeToScalarType; --using executorch::aten::RuntimeContext; - using torch::executor::Error; - - - namespace impl { --namespace HiFi { -+namespace HiFi { - namespace native { - - namespace { -@@ -92,7 +92,8 @@ Tensor& sub_out( - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); -- ScalarType alpha_type = torch::executor::native::utils::get_scalar_dtype(alpha); -+ ScalarType alpha_type = -+ torch::executor::native::utils::get_scalar_dtype(alpha); - ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); - ScalarType out_type = out.scalar_type(); - -@@ -115,18 +116,17 @@ Tensor& sub_out( - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? out.dim() : max_dim; - -- if((out_type != ScalarType::Float) || (alpha_val != 1.0)) -+ if ((out_type != ScalarType::Float) || (alpha_val != 1.0)) - optimized = 0; - -- if((a_dim == 0) || (b_dim == 0)) -+ if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; - -- if((broadcast == 1) && (max_dim > kNnlibMaxDim)) -+ if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - - -- if(optimized) -- { -+ if (optimized) { - /*logic to find broadcast*/ - const int a_is_broadcasted = !out.sizes().equals(a.sizes()); - const int b_is_broadcasted = !out.sizes().equals(b.sizes()); -@@ -135,14 +135,12 @@ Tensor& sub_out( - const float* const a_data = a.const_data_ptr(); - const float* const b_data = b.const_data_ptr(); - float* const out_data = out.mutable_data_ptr(); -- if(broadcast == 1) -- { -+ if (broadcast == 1) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - -- for(int i = 0; i < kNnlibMaxDim; i++) -- { -+ for (int i = 0; i < kNnlibMaxDim; i++) { - out_shape[i] = 1; - inp1_shape[i] = 1; - inp2_shape[i] = 1; -@@ -151,14 +149,15 @@ Tensor& sub_out( - int off_o = kNnlibMaxDim - out_dim; - int off_a = kNnlibMaxDim - a_dim; - int off_b = kNnlibMaxDim - b_dim; -- for(int i = 0; i < out_dim; i++) -+ for (int i = 0; i < out_dim; i++) - out_shape[i+off_o] = out.size(i); -- for(int i = 0; i < a_dim; i++) -+ for (int i = 0; i < a_dim; i++) - inp1_shape[i+off_a] = a.size(i); -- for(int i = 0; i < b_dim; i++) -+ for (int i = 0; i < b_dim; i++) - inp2_shape[i+off_b] = b.size(i); - -- xa_nn_elm_sub_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape,b_data, inp2_shape); -+ xa_nn_elm_sub_broadcast_4D_f32xf32_f32( -+ out_data, out_shape, a_data, inp1_shape,b_data, inp2_shape); - } - else - { -@@ -190,6 +189,6 @@ 
Tensor& sub_out( - return out; - } - --} // namespace impl --} // namespace HiFi - } // namespace native -+} // namespace HiFi -+} // namespace impl -diff --git a/backends/cadence/hifi/operators/op_tanh.cpp b/backends/cadence/hifi/operators/op_tanh.cpp -index a80450b8..7989ac3b 100644 ---- a/backends/cadence/hifi/operators/op_tanh.cpp -+++ b/backends/cadence/hifi/operators/op_tanh.cpp -@@ -6,10 +6,10 @@ - * LICENSE file in the root directory of this source tree. - */ - -+#include - #include - #include - #include --#include - - using exec_aten::ScalarType; - using exec_aten::Tensor; -@@ -17,28 +17,29 @@ using executorch::aten::RuntimeContext; - using torch::executor::Error; - - namespace impl { --namespace HiFi { -+namespace HiFi { - namespace native { - - - Tensor& tanh_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - - bool optimized = 1; -- if((in.scalar_type() != ScalarType::Float) || (out.scalar_type() != ScalarType::Float)) -- optimized = 0; -+ if ((in.scalar_type() != ScalarType::Float) || -+ (out.scalar_type() != ScalarType::Float)) -+ optimized = 0; - -- if(optimized) -- { -+ if (optimized) { - float* data_in = in.mutable_data_ptr(); - float* data_out = out.mutable_data_ptr(); - xa_nn_vec_tanh_f32_f32(data_out, data_in, (int)in.numel()); - return out; - } - -- return torch::executor::native::internal::unary_ufunc_realhb_to_floath(std::tanh, ctx, in, out); -+ return torch::executor::native::internal::unary_ufunc_realhb_to_floath( -+ std::tanh, ctx, in, out); - - } - --} // namespace impl --} // namespace HiFi - } // namespace native -+} // namespace HiFi -+} // namespace impl diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py index 167a0c382f..69f6f6506c 100644 --- a/backends/arm/operators/op_permute.py +++ b/backends/arm/operators/op_permute.py @@ -1,4 +1,4 @@ -# Copyright 2023 Arm Limited and/or its affiliates. +# Copyright 2023-2024 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -18,6 +18,54 @@ from serializer.tosa_serializer import TosaOp +def permutation_vector_to_matrix(permutation_vector: list[int]) -> torch.Tensor: + """ + Converts a permutation vector of length N to a NxN matrix that describes the same permutation. + for example: + (1,0,2) + -> + [0 1 0] + |1 0 0| + [0 0 1] + """ + N = len(permutation_vector) + P = torch.zeros(N, N) + for row_index, col_index in enumerate(permutation_vector): + P[row_index][col_index] = 1 + return P + + +def permutation_matrix_to_vector(permutation_matrix: torch.Tensor) -> list[int]: + """ + Converts a NxN permutation matrix to a permutation vector of length N that describes the same permutation. + [0 1 0] + |1 0 0| + [0 0 1] + -> + (1,0,2) + """ + N = len(permutation_matrix) + assert N == len( + permutation_matrix[0] + ), f"A permutation matrix must be square, got shape {permutation_matrix.shape}" + + p = [0] * N + for row_index, row in enumerate(permutation_matrix): + saw_one = False + for col_index, value in enumerate(row): + if value == 1: + assert ( + not saw_one + ), f"A permutation matrix can only have one 1 per row, got row {row}." + p[row_index] = col_index + saw_one = True + else: + assert ( + value == 0 + ), f"A permutation matrix only contains 1's and 0's, got value {value}." 
+ return p + + @register_node_visitor class PermuteVisitor(NodeVisitor): target = "aten.permute_copy.default" @@ -40,8 +88,33 @@ def define_node( ) return + # The permutation vector describes a permutation P in default Pytorch dim_order. + # For rank 4, the default dim_order NCHW. + # E.g. (2,3,0,1) -> permute (n,c,h,w) to (w,c,n,h) + permutation_vector = inputs[1].special + + if output.dim_order != tuple(range(len(output.dim_order))): + # the permutation vector can't be used directly if we are not in NCHW dim_order. + # We need to first transform to NCHW, apply P, + # and then transform back to the original dim_order. + # This transformation, S, is also a permutation, with the dim_order as permutation vector. + + # To do this, represent P and S with permutation matrices. + # Matrices can handle chained transformations and inversion easily. + S = permutation_vector_to_matrix(output.dim_order) + # The inverse of a permutation matrix is its transpose. + S_inverse = S.transpose(1, 0) + P = permutation_vector_to_matrix(permutation_vector) + + # The complete transformation is S * P * S_inverse. + transformation_matrix = S.matmul(P.matmul(S_inverse)) + + # Luckily, since it is just a combination of permutations, the result is also a permutation + # that can again be described by a new permutation vector. + permutation_vector = permutation_matrix_to_vector(transformation_matrix) + attr = ts.TosaSerializerAttribute() - attr.TransposeAttribute(inputs[1].special) + attr.TransposeAttribute(permutation_vector) tosa_graph.addOperator( TosaOp.Op().TRANSPOSE, [inputs[0].name], [output.name], attr ) diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py new file mode 100644 index 0000000000..c7c3736e37 --- /dev/null +++ b/backends/arm/test/ops/test_hardtanh.py @@ -0,0 +1,125 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +from typing import Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +test_data_suite = [ + # (test_name, test_data) + ("zeros", torch.zeros(1, 10, 10, 10)), + ("ones", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) - 0.5), + ("randn_pos", torch.randn(10) + 10), + ("randn_neg", torch.randn(10) - 10), + ("ramp", torch.arange(-16, 16, 0.2)), +] + + +class TestHardTanh(unittest.TestCase): + """Tests HardTanh Operator.""" + + class HardTanh(torch.nn.Module): + + def __init__(self): + super().__init__() + + self.hardTanh = torch.nn.Hardtanh() + + def forward(self, x): + return self.hardTanh(x) + + def _test_hardtanh_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.hardtanh.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_hardtanh_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.hardtanh.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_hardtanh_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.hardtanh.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_hardtanh_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_hardtanh_tosa_MI_pipeline(self.HardTanh(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_hardtanh_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_hardtanh_tosa_BI_pipeline(self.HardTanh(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_hardtanh_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_hardtanh_tosa_u55_BI_pipeline(self.HardTanh(), (test_data,)) diff --git a/backends/arm/test/ops/test_permute.py 
b/backends/arm/test/ops/test_permute.py new file mode 100644 index 0000000000..6346e847c9 --- /dev/null +++ b/backends/arm/test/ops/test_permute.py @@ -0,0 +1,152 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized +from torchvision.ops import Permute + +test_data_suite = [ + # (test_name,test_data,dims) + ("rank_2", torch.rand(10, 10), [1, 0]), + ("rank_3", torch.rand(10, 10, 10), [2, 0, 1]), + ("rank_3", torch.rand(10, 10, 10), [1, 2, 0]), + ("rank_4", torch.rand(1, 5, 1, 10), [0, 2, 3, 1]), + ("rank_4", torch.rand(1, 2, 5, 10), [1, 0, 2, 3]), + ("rank_4", torch.rand(1, 10, 10, 5), [2, 0, 1, 3]), +] + + +class TestPermute(unittest.TestCase): + """Tests Permute Operator.""" + + class Permute(torch.nn.Module): + + def __init__(self, dims: list[int]): + super().__init__() + + self.permute = Permute(dims=dims) + + def forward(self, x): + return self.permute(x) + + def _test_permute_tosa_MI_pipeline( + self, + module: torch.nn.Module, + test_data: Tuple[torch.tensor], + permute_memory_to_nhwc: bool, + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec( + permute_memory_to_nhwc=permute_memory_to_nhwc + ), + ) + .export() + .check(["torch.ops.aten.permute.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_permute_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_permute_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.permute.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_permute_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_permute_ethos_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.permute.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_permute_default"]) + 
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .serialize() + ) + + @parameterized.expand(test_data_suite) + def test_permute_tosa_MI( + self, test_name: str, test_data: torch.Tensor, dims: list[int] + ): + self._test_permute_tosa_MI_pipeline(self.Permute(dims=dims), (test_data,), True) + self._test_permute_tosa_MI_pipeline( + self.Permute(dims=dims), (test_data,), False + ) + + @parameterized.expand(test_data_suite) + def test_permute_tosa_BI( + self, test_name: str, test_data: torch.Tensor, dims: list[int] + ): + self._test_permute_tosa_BI_pipeline(self.Permute(dims=dims), (test_data,)) + + # Expected to fail as TOSA.Transpose is not supported by Ethos-U55. + @parameterized.expand(test_data_suite[0:1]) + @unittest.expectedFailure + def test_permute_u55_BI( + self, test_name: str, test_data: torch.Tensor, dims: list[int] + ): + self._test_permute_ethos_BI_pipeline( + self.Permute(dims=dims), common.get_u55_compile_spec(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_permute_u85_BI( + self, test_name: str, test_data: torch.Tensor, dims: list[int] + ): + self._test_permute_ethos_BI_pipeline( + self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,) + ) diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl index 624878a17c..36b9c24317 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl @@ -102,15 +102,11 @@ VEC4_T q_8w_linear(const ivec3 out_pos, const int K) { for (int i = 0; i < K; i += 4) { const VEC4_T mat1_tex = load_texel(t_mat1, mat1_pos); - const VEC4_T sums = VEC4_T( - dot(mat1_tex, load_texel(t_qmat2, qmat2_pos) * scales.x), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_pos + u16vec3(0, 1, 0)) * scales.y), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_pos + u16vec3(0, 2, 0)) * scales.z), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_pos + u16vec3(0, 3, 0)) * scales.w)); + dot(mat1_tex, load_texel(t_qmat2, qmat2_pos)), + dot(mat1_tex, load_texel(t_qmat2, qmat2_pos + u16vec3(0, 1, 0))), + dot(mat1_tex, load_texel(t_qmat2, qmat2_pos + u16vec3(0, 2, 0))), + dot(mat1_tex, load_texel(t_qmat2, qmat2_pos + u16vec3(0, 3, 0)))); outtex += sums; @@ -118,6 +114,8 @@ VEC4_T q_8w_linear(const ivec3 out_pos, const int K) { qmat2_pos.x++; } + outtex *= scales; + return outtex; } diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 4eb47c7d05..1f8a554edc 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -19,6 +19,7 @@ build_android_native_library() { ANDROID_ABI="$1" ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" CMAKE_OUT="cmake-out-android-${ANDROID_ABI}" + EXECUTORCH_CMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE:-Release}" QNN_SDK_ROOT="${QNN_SDK_ROOT:-}" if [ -n "$QNN_SDK_ROOT" ]; then EXECUTORCH_BUILD_QNN=ON @@ -52,7 +53,7 @@ build_android_native_library() { -DNEURON_BUFFER_ALLOCATOR_LIB="${NEURON_BUFFER_ALLOCATOR_LIB}" \ -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \ -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE}" \ -B"${CMAKE_OUT}" if [ "$(uname)" == "Darwin" ]; then @@ -60,7 +61,7 @@ build_android_native_library() { else CMAKE_JOBS=$(( $(nproc) - 1 )) fi - cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release + cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config 
"${EXECUTORCH_CMAKE_BUILD_TYPE}" cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ @@ -72,10 +73,10 @@ build_android_native_library() { -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE}" \ -B"${CMAKE_OUT}"/extension/android - cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Release + cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config "${EXECUTORCH_CMAKE_BUILD_TYPE}" # Copy artifacts to ABI specific directory mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" @@ -111,6 +112,9 @@ build_aar() { # Rename libexecutorch_jni.so to libexecutorch.so for soname consistency # between Java and JNI find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; + if [ "$EXECUTORCH_CMAKE_BUILD_TYPE" == "Release" ]; then + find jni -type f -name "*.so" -exec "$ANDROID_NDK"/toolchains/llvm/prebuilt/*/bin/llvm-strip {} \; + fi # Zip all necessary files into the AAR file zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so jni/*/libneuronusdk_adapter.mtk.so AndroidManifest.xml popd diff --git a/examples/models/llama/llama_transformer.py b/examples/models/llama/llama_transformer.py index 3f93498fba..f9dc26abae 100644 --- a/examples/models/llama/llama_transformer.py +++ b/examples/models/llama/llama_transformer.py @@ -156,7 +156,7 @@ def __init__( ): super().__init__() self.max_seq_length = max_seq_length - self.is_tranposed = transpose_cache + self.is_transposed = transpose_cache if transpose_cache: cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim) else: diff --git a/examples/models/llama/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/quantized_kv_cache.py index 9977256975..6d92a45e80 100644 --- a/examples/models/llama/source_transformation/quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/quantized_kv_cache.py @@ -193,7 +193,7 @@ def update(self, input_pos, k_val, v_val): @classmethod def from_float(cls, kv_cache, cache_type: QuantizedCacheType): cache_shape = kv_cache.k_cache.shape - if kv_cache.is_tranposed: + if kv_cache.is_transposed: max_batch_size, n_heads, max_seq_length, head_dim = cache_shape else: max_batch_size, max_seq_length, n_heads, head_dim = cache_shape @@ -203,7 +203,7 @@ def from_float(cls, kv_cache, cache_type: QuantizedCacheType): n_heads, head_dim, cache_type, - kv_cache.is_tranposed, + kv_cache.is_transposed, kv_cache.enable_dynamic_shape, ) diff --git a/examples/models/llama/source_transformation/test_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_quantized_kv_cache.py index 2f38f96552..e5ade3dd12 100644 --- a/examples/models/llama/source_transformation/test_quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/test_quantized_kv_cache.py @@ -48,8 +48,8 @@ def setUp(self): self.transpose_kv_cache = False self.dtype = torch.float32 - def _test_simple_update_fetch(self, is_tranposed=False, is_dynamic_shape=False): - self.transpose_kv_cache = is_tranposed + def _test_simple_update_fetch(self, is_transposed=False, is_dynamic_shape=False): + self.transpose_kv_cache = is_transposed self.enable_dynamic_shape = is_dynamic_shape input_pos = torch.tensor([0, 1, 2]) self.seq_len = 
input_pos.size(0) @@ -122,7 +122,7 @@ def test_simple_update_fetch_not_transposed_dynamic_shape(self): self._test_simple_update_fetch(is_dynamic_shape=True) def test_simple_update_fetch_transposed(self): - self._test_simple_update_fetch(is_tranposed=True) + self._test_simple_update_fetch(is_transposed=True) def test_simple_update_fetch_transposed_dynamic_shape(self): - self._test_simple_update_fetch(is_tranposed=True, is_dynamic_shape=True) + self._test_simple_update_fetch(is_transposed=True, is_dynamic_shape=True) diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h index 84b918bf28..bfb47daa05 100644 --- a/runtime/core/exec_aten/exec_aten.h +++ b/runtime/core/exec_aten/exec_aten.h @@ -61,8 +61,8 @@ template using ArrayRef = c10::ArrayRef; template using optional = std::optional; -using nullopt_t = c10::nullopt_t; -using c10::nullopt; +using nullopt_t = std::nullopt_t; +using std::nullopt; using ScalarType = at::ScalarType; using Scalar = c10::Scalar; using MemoryFormat = c10::MemoryFormat;
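
The dim_order handling added to PermuteVisitor above composes the requested permutation with the tensor's dim_order through permutation matrices (S · P · S⁻¹) and converts the result back to a vector. A minimal standalone sketch of that composition follows; it mirrors the helpers in op_permute.py, and the dim_order and permutation values in it are illustrative assumptions, not taken from the patch or its test suite:

```python
import torch


def permutation_vector_to_matrix(permutation_vector: list[int]) -> torch.Tensor:
    # Row i gets a 1 in column permutation_vector[i], matching the convention in op_permute.py.
    N = len(permutation_vector)
    P = torch.zeros(N, N)
    for row_index, col_index in enumerate(permutation_vector):
        P[row_index][col_index] = 1
    return P


def permutation_matrix_to_vector(permutation_matrix: torch.Tensor) -> list[int]:
    # Inverse of the above: read off the column index of the single 1 in each row.
    return [int(torch.argmax(row).item()) for row in permutation_matrix]


# Illustrative values: a rank-4 tensor stored in NHWC dim_order, permuted with aten dims (0, 1, 3, 2).
dim_order = (0, 2, 3, 1)
permutation_vector = [0, 1, 3, 2]

S = permutation_vector_to_matrix(list(dim_order))
S_inverse = S.transpose(1, 0)  # transpose == inverse for a permutation matrix
P = permutation_vector_to_matrix(permutation_vector)

# Same composition as PermuteVisitor.define_node: map into NCHW, apply P, map back.
transformed = permutation_matrix_to_vector(S.matmul(P.matmul(S_inverse)))
print(transformed)  # [0, 2, 1, 3]
```

Because the composition of permutations is itself a permutation, the resulting vector can be passed straight to TransposeAttribute, as the patch does.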