From 88d7f99c1f8f47eea2ccc1504928467e84ee4e8e Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Thu, 24 Oct 2024 11:35:19 +0000 Subject: [PATCH] 2024-10-24 nightly release (2553788a5dca2970c0394fa88223f51183a28279) --- 1.txt | 676 ------------------ backends/arm/operators/op_permute.py | 77 +- backends/arm/test/ops/test_hardtanh.py | 125 ++++ backends/arm/test/ops/test_permute.py | 152 ++++ .../runtime/graph/ops/glsl/q_8w_linear.glsl | 14 +- build/build_android_llm_demo.sh | 12 +- examples/models/llama/llama_transformer.py | 2 +- .../quantized_kv_cache.py | 4 +- .../test_quantized_kv_cache.py | 8 +- runtime/core/exec_aten/exec_aten.h | 4 +- 10 files changed, 375 insertions(+), 699 deletions(-) delete mode 100644 1.txt create mode 100644 backends/arm/test/ops/test_hardtanh.py create mode 100644 backends/arm/test/ops/test_permute.py diff --git a/1.txt b/1.txt deleted file mode 100644 index 96745dd0af..0000000000 --- a/1.txt +++ /dev/null @@ -1,676 +0,0 @@ -diff --git a/backends/cadence/cadence.cmake b/backends/cadence/cadence.cmake -index cb6a2531..0fa55c6a 100644 ---- a/backends/cadence/cadence.cmake -+++ b/backends/cadence/cadence.cmake -@@ -44,7 +44,7 @@ set(CMAKE_CXX_COMPILER ${TOOLCHAIN_HOME}/bin/${CROSS_COMPILE_TARGET}-clang++) - set(CMAKE_C_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls") - set(CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls") - #workaround for larger compilation time --SET(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fno-strict-aliasing") -+set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fno-strict-aliasing") - - set(CMAKE_SYSROOT ${TOOLCHAIN_HOME}/${SYSROOT_TARGET}) - set(CMAKE_LINKER ${TOOLCHAIN_HOME}/bin/xt-ld) -diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h -index 8faf0671..a08144d9 100644 ---- a/backends/cadence/hifi/kernels/kernels.h -+++ b/backends/cadence/hifi/kernels/kernels.h -@@ -16,21 +16,24 @@ - #include "xa_nnlib_kernels_api.h" - - /* Potential NNLIB function/APIs */ --extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, -+extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( -+ FLOAT32 * __restrict__ p_out, - const WORD32 *const p_out_shape, - const FLOAT32 * __restrict__ p_inp1, - const WORD32 *const p_inp1_shape, - const FLOAT32 * __restrict__ p_inp2, - const WORD32 *const p_inp2_shape); - --extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, -+extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32( -+ FLOAT32 * __restrict__ p_out, - const WORD32 *const p_out_shape, - const FLOAT32 * __restrict__ p_inp1, - const WORD32 *const p_inp1_shape, - const FLOAT32 * __restrict__ p_inp2, - const WORD32 *const p_inp2_shape); - --extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(FLOAT32 * __restrict__ p_out, -+extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32( -+ FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, -@@ -45,7 +48,8 @@ extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( - const WORD32 *const p_inp2_shape, - WORD32 mode); - --extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, -+extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( -+ FLOAT32 * __restrict__ p_out, - const WORD32 *const p_out_shape, - const FLOAT32 * __restrict__ p_inp1, - const WORD32 *const p_inp1_shape, -diff --git a/backends/cadence/hifi/operators/op_add.cpp 
b/backends/cadence/hifi/operators/op_add.cpp -index 883cc74d..56adab71 100644 ---- a/backends/cadence/hifi/operators/op_add.cpp -+++ b/backends/cadence/hifi/operators/op_add.cpp -@@ -6,13 +6,13 @@ - * LICENSE file in the root directory of this source tree. - */ - -+#include - #include - #include - #include - #include - #include - #include --#include - - using exec_aten::Scalar; - using exec_aten::ScalarType; -@@ -23,7 +23,7 @@ using executorch::runtime::KernelRuntimeContext; - using torch::executor::Error; - - namespace impl { --namespace HiFi { -+namespace HiFi { - namespace native { - - namespace { -@@ -97,14 +97,15 @@ Tensor& add_out( - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); -- ScalarType alpha_type = torch::executor::native::utils::get_scalar_dtype(alpha); -+ ScalarType alpha_type = -+ torch::executor::native::utils::get_scalar_dtype(alpha); - ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); - ScalarType out_type = out.scalar_type(); - - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - ET_KERNEL_CHECK( - ctx, check_alpha_type(alpha_type, common_type), InvalidArgument, out); -- -+ - float alpha_val; - torch::executor::native::utils::extract_scalar(alpha, &alpha_val); - -@@ -119,30 +120,28 @@ Tensor& add_out( - const bool broadcast = (a_is_broadcasted || b_is_broadcasted); - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? out.dim() : max_dim; -- -- if((out_type != ScalarType::Float) || (alpha_val != 1.0)) -+ -+ if ((out_type != ScalarType::Float) || (alpha_val != 1.0)) - optimized = 0; -- -- if((a_dim == 0) || (b_dim == 0) ) -+ -+ if ((a_dim == 0) || (b_dim == 0) ) - optimized = 0; - -- if((broadcast == 1) && (max_dim > kNnlibMaxDim)) -+ if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - - -- if(optimized) -- { -+ if (optimized) { - const float* const a_data = a.const_data_ptr(); - const float* const b_data = b.const_data_ptr(); - float* const out_data = out.mutable_data_ptr(); -- if(broadcast == 1) -- { -+ -+ if(broadcast == 1) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - -- for(int i = 0; i < kNnlibMaxDim; i++) -- { -+ for (int i = 0; i < kNnlibMaxDim; i++) { - out_shape[i] = 1; - inp1_shape[i] = 1; - inp2_shape[i] = 1; -@@ -152,15 +151,15 @@ Tensor& add_out( - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); - -- for(int i = 0; i < out.dim(); i++) -+ for (int i = 0; i < out.dim(); i++) - out_shape[i+off_o] = out.size(i); -- for(int i = 0; i < a.dim(); i++) -+ for (int i = 0; i < a.dim(); i++) - inp1_shape[i+off_a] = a.size(i); -- for(int i = 0; i < b.dim(); i++) -+ for (int i = 0; i < b.dim(); i++) - inp2_shape[i+off_b] = b.size(i); - -- xa_nn_elm_add_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, -- b_data, inp2_shape); -+ xa_nn_elm_add_broadcast_4D_f32xf32_f32( -+ out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); - } - else - { -@@ -193,6 +192,6 @@ Tensor& add_out( - } - - --} // namespace impl --} // namespace HiFi - } // namespace native -+} // namespace HiFi -+} // namespace impl -diff --git a/backends/cadence/hifi/operators/op_div.cpp b/backends/cadence/hifi/operators/op_div.cpp -index 41220e5d..e887e8b5 100644 ---- a/backends/cadence/hifi/operators/op_div.cpp -+++ b/backends/cadence/hifi/operators/op_div.cpp -@@ -6,6 +6,7 @@ - * LICENSE file in the root directory of this source tree. 
- */ - -+#include - #include - #include - #include -@@ -13,7 +14,6 @@ - #include - #include - #include --#include - - using exec_aten::Scalar; - using exec_aten::ScalarType; -@@ -22,7 +22,7 @@ using executorch::aten::RuntimeContext; - using torch::executor::Error; - - namespace impl { --namespace HiFi { -+namespace HiFi { - namespace native { - - namespace { -@@ -74,29 +74,27 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? out.dim() : max_dim; - -- if((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) -+ if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) - optimized = 0; - -- if((a_dim == 0) || (b_dim == 0) ) -+ if ((a_dim == 0) || (b_dim == 0) ) - optimized = 0; - -- if((broadcast == 1) && (max_dim > kNnlibMaxDim)) -+ if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - -- if(optimized) -- { -+ if (optimized) { - float* a_data = a.mutable_data_ptr(); - float* b_data = b.mutable_data_ptr(); - float* out_data = out.mutable_data_ptr(); - -- if(broadcast == 1) -- { -+ if (broadcast == 1) { - - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - -- for(int i = 0; i < kNnlibMaxDim; i++) -+ for (int i = 0; i < kNnlibMaxDim; i++) - { - out_shape[i] = 1; - inp1_shape[i] = 1; -@@ -106,34 +104,35 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - int off_o = kNnlibMaxDim - out.dim(); - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); -- for(int i = 0; i < out.dim(); i++) -+ for (int i = 0; i < out.dim(); i++) - out_shape[i+off_o] = out.size(i); -- for(int i = 0; i < a.dim(); i++) -+ for (int i = 0; i < a.dim(); i++) - inp1_shape[i+off_a] = a.size(i); -- for(int i = 0; i < b.dim(); i++) -+ for (int i = 0; i < b.dim(); i++) - inp2_shape[i+off_b] = b.size(i); - -- xa_nn_elm_div_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); -+ xa_nn_elm_div_broadcast_4D_f32xf32_f32( -+ out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); - } - else - { -- - xa_nn_elm_div_f32xf32_f32(out_data, a_data, b_data, out.numel()); - } -- -+ - return out; - } -- -+ - ScalarType common_type = get_compute_type(a_type, b_type); - ScalarType out_type = out.scalar_type(); -- -+ - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); -- -+ - ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() { - ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() { - ET_SWITCH_FLOAT_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() { -- torch::executor::apply_binary_elementwise_fn( -+ torch::executor:: -+ apply_binary_elementwise_fn( - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); -@@ -188,13 +187,13 @@ Tensor& div_out_mode( - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; - -- if((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) -+ if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) - optimized = 0; - -- if((a_dim == 0) || (b_dim == 0)) -+ if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; - -- if((broadcast == 1) && (max_dim > kNnlibMaxDim)) -+ if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - int mode_val = -1; - if (mode.has_value() && mode.value() == "trunc") -@@ -204,20 +203,17 @@ Tensor& div_out_mode( - else - optimized = 0; - -- if(optimized) -- { -+ if (optimized) { - float* a_data = a.mutable_data_ptr(); - float* b_data = b.mutable_data_ptr(); - float* out_data = out.mutable_data_ptr(); - -- if(broadcast) -- { -+ if (broadcast) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - -- for(int i = 0; i < kNnlibMaxDim; i++) -- { -+ for (int i = 0; i < kNnlibMaxDim; i++) { - inp1_shape[i] = 1; - inp2_shape[i] = 1; - out_shape[i] = 1; -@@ -227,18 +223,20 @@ Tensor& div_out_mode( - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); - -- for(int i = 0; i < out.dim(); i++) -+ for (int i = 0; i < out.dim(); i++) - out_shape[i+off_o] = out.size(i); -- for(int i = 0; i < a.dim(); i++) -+ for (int i = 0; i < a.dim(); i++) - inp1_shape[i+off_a] = a.size(i); -- for(int i = 0; i < b.dim(); i++) -+ for (int i = 0; i < b.dim(); i++) - inp2_shape[i+off_b] = b.size(i); - -- xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape, mode_val); -+ xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( -+ out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape, mode_val); - } - else - { -- xa_nn_elm_div_mode_f32xf32_f32(out_data, a_data, b_data, out.numel(), mode_val); -+ xa_nn_elm_div_mode_f32xf32_f32( -+ out_data, a_data, b_data, out.numel(), mode_val); - } - - return out; -@@ -248,7 +246,8 @@ Tensor& div_out_mode( - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out_mode", CTYPE_B, [&]() { - ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out_mode", CTYPE_IN, [&]() { - ET_SWITCH_REAL_TYPES(out_type, ctx, "div.out_mode", CTYPE_OUT, [&]() { -- torch::executor::apply_binary_elementwise_fn( -+ torch::executor:: -+ apply_binary_elementwise_fn( - [mode](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); -@@ -272,6 +271,6 @@ Tensor& div_out_mode( - } - - --} // namespace impl --} // namespace HiFi - } // namespace native -+} // namespace HiFi -+} // namespace impl -diff --git a/backends/cadence/hifi/operators/op_mul.cpp b/backends/cadence/hifi/operators/op_mul.cpp -index 9200d980..1b2e62cd 100644 ---- a/backends/cadence/hifi/operators/op_mul.cpp -+++ b/backends/cadence/hifi/operators/op_mul.cpp -@@ -6,12 +6,12 @@ - * LICENSE file in the root directory of this source tree. 
- */ - -+#include - #include - #include - #include - #include - #include --#include - - using exec_aten::Scalar; - using exec_aten::ScalarType; -@@ -86,7 +86,7 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); - ScalarType out_type = out.scalar_type(); - constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ -- -+ - int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); - bool optimized = 1; - /*find broadcast*/ -@@ -97,28 +97,25 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - max_dim = out.dim() > max_dim ? out.dim() : max_dim; - - -- if((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) -+ if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) - optimized = 0; - -- if( (a_dim == 0) || (b_dim == 0) ) -+ if ((a_dim == 0) || (b_dim == 0) ) - optimized = 0; - -- if((broadcast == 1) && (max_dim > kNnlibMaxDim)) -+ if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - -- if(optimized) -- { -+ if (optimized) { - float* a_data = a.mutable_data_ptr(); - float* b_data = b.mutable_data_ptr(); - float* out_data = out.mutable_data_ptr(); - -- if(broadcast == 1) -- { -+ if (broadcast == 1) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; -- for(int i = 0; i < kNnlibMaxDim; i++) -- { -+ for (int i = 0; i < kNnlibMaxDim; i++) { - out_shape[i] = 1; - inp1_shape[i] = 1; - inp2_shape[i] = 1; -@@ -126,14 +123,15 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - int off_o = kNnlibMaxDim - out.dim(); - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); -- for(int i = 0; i < out.dim(); i++){ -- out_shape[i+off_o] = out.size(i);} -- for(int i = 0; i < a.dim(); i++) -+ for (int i = 0; i < out.dim(); i++) -+ out_shape[i+off_o] = out.size(i); -+ for (int i = 0; i < a.dim(); i++) - inp1_shape[i+off_a] = a.size(i); -- for(int i = 0; i < b.dim(); i++) -+ for (int i = 0; i < b.dim(); i++) - inp2_shape[i+off_b] = b.size(i); - -- xa_nn_elm_mul_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); -+ xa_nn_elm_mul_broadcast_4D_f32xf32_f32( -+ out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); - } - else - { -@@ -154,7 +152,7 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - CTYPE_A, - CTYPE_B, - CTYPE_IN, -- CTYPE_OUT>::run(a, b, out); -+ CTYPE_OUT>::run(a, b, out); - }); - }); - }); -@@ -162,6 +160,6 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - return out; - } - --} // namespace impl --} // namespace HiFi - } // namespace native -+} // namespace HiFi -+} // namespace impl -diff --git a/backends/cadence/hifi/operators/op_sigmoid.cpp b/backends/cadence/hifi/operators/op_sigmoid.cpp -index fa408d4b..1ed89880 100644 ---- a/backends/cadence/hifi/operators/op_sigmoid.cpp -+++ b/backends/cadence/hifi/operators/op_sigmoid.cpp -@@ -8,9 +8,9 @@ - - #include - -+#include - #include - #include --#include - - using exec_aten::ScalarType; - using exec_aten::Tensor; -@@ -18,7 +18,7 @@ using executorch::aten::RuntimeContext; - using torch::executor::Error; - - namespace impl { --namespace HiFi { -+namespace HiFi { - namespace native { - - using Tensor = exec_aten::Tensor; -@@ -40,13 +40,12 @@ Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - - ScalarType in_type = in.scalar_type(); - 
ScalarType out_type = out.scalar_type(); -- -+ - bool optimized = 1; -- if((in_type != ScalarType::Float) || (out_type != ScalarType::Float)) -+ if ((in_type != ScalarType::Float) || (out_type != ScalarType::Float)) - optimized = 0; - -- if(optimized) -- { -+ if (optimized) { - float* data_in = in.mutable_data_ptr(); - float* data_out = out.mutable_data_ptr(); - xa_nn_vec_sigmoid_f32_f32(data_out, data_in, in.numel()); -diff --git a/backends/cadence/hifi/operators/op_sub.cpp b/backends/cadence/hifi/operators/op_sub.cpp -index b9f35caf..d9958bf8 100644 ---- a/backends/cadence/hifi/operators/op_sub.cpp -+++ b/backends/cadence/hifi/operators/op_sub.cpp -@@ -6,25 +6,25 @@ - * LICENSE file in the root directory of this source tree. - */ - -+#include - #include - #include - #include - #include - #include - #include --#include - - using exec_aten::Scalar; - using exec_aten::ScalarType; - using exec_aten::Tensor; -+using executorch::aten::RuntimeContext; - using executorch::runtime::can_cast; - using executorch::runtime::CppTypeToScalarType; --using executorch::aten::RuntimeContext; - using torch::executor::Error; - - - namespace impl { --namespace HiFi { -+namespace HiFi { - namespace native { - - namespace { -@@ -92,7 +92,8 @@ Tensor& sub_out( - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); -- ScalarType alpha_type = torch::executor::native::utils::get_scalar_dtype(alpha); -+ ScalarType alpha_type = -+ torch::executor::native::utils::get_scalar_dtype(alpha); - ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); - ScalarType out_type = out.scalar_type(); - -@@ -115,18 +116,17 @@ Tensor& sub_out( - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? out.dim() : max_dim; - -- if((out_type != ScalarType::Float) || (alpha_val != 1.0)) -+ if ((out_type != ScalarType::Float) || (alpha_val != 1.0)) - optimized = 0; - -- if((a_dim == 0) || (b_dim == 0)) -+ if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; - -- if((broadcast == 1) && (max_dim > kNnlibMaxDim)) -+ if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - - -- if(optimized) -- { -+ if (optimized) { - /*logic to find broadcast*/ - const int a_is_broadcasted = !out.sizes().equals(a.sizes()); - const int b_is_broadcasted = !out.sizes().equals(b.sizes()); -@@ -135,14 +135,12 @@ Tensor& sub_out( - const float* const a_data = a.const_data_ptr(); - const float* const b_data = b.const_data_ptr(); - float* const out_data = out.mutable_data_ptr(); -- if(broadcast == 1) -- { -+ if (broadcast == 1) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - -- for(int i = 0; i < kNnlibMaxDim; i++) -- { -+ for (int i = 0; i < kNnlibMaxDim; i++) { - out_shape[i] = 1; - inp1_shape[i] = 1; - inp2_shape[i] = 1; -@@ -151,14 +149,15 @@ Tensor& sub_out( - int off_o = kNnlibMaxDim - out_dim; - int off_a = kNnlibMaxDim - a_dim; - int off_b = kNnlibMaxDim - b_dim; -- for(int i = 0; i < out_dim; i++) -+ for (int i = 0; i < out_dim; i++) - out_shape[i+off_o] = out.size(i); -- for(int i = 0; i < a_dim; i++) -+ for (int i = 0; i < a_dim; i++) - inp1_shape[i+off_a] = a.size(i); -- for(int i = 0; i < b_dim; i++) -+ for (int i = 0; i < b_dim; i++) - inp2_shape[i+off_b] = b.size(i); - -- xa_nn_elm_sub_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape,b_data, inp2_shape); -+ xa_nn_elm_sub_broadcast_4D_f32xf32_f32( -+ out_data, out_shape, a_data, inp1_shape,b_data, inp2_shape); - } - else - { -@@ -190,6 +189,6 @@ 
Tensor& sub_out( - return out; - } - --} // namespace impl --} // namespace HiFi - } // namespace native -+} // namespace HiFi -+} // namespace impl -diff --git a/backends/cadence/hifi/operators/op_tanh.cpp b/backends/cadence/hifi/operators/op_tanh.cpp -index a80450b8..7989ac3b 100644 ---- a/backends/cadence/hifi/operators/op_tanh.cpp -+++ b/backends/cadence/hifi/operators/op_tanh.cpp -@@ -6,10 +6,10 @@ - * LICENSE file in the root directory of this source tree. - */ - -+#include - #include - #include - #include --#include - - using exec_aten::ScalarType; - using exec_aten::Tensor; -@@ -17,28 +17,29 @@ using executorch::aten::RuntimeContext; - using torch::executor::Error; - - namespace impl { --namespace HiFi { -+namespace HiFi { - namespace native { - - - Tensor& tanh_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - - bool optimized = 1; -- if((in.scalar_type() != ScalarType::Float) || (out.scalar_type() != ScalarType::Float)) -- optimized = 0; -+ if ((in.scalar_type() != ScalarType::Float) || -+ (out.scalar_type() != ScalarType::Float)) -+ optimized = 0; - -- if(optimized) -- { -+ if (optimized) { - float* data_in = in.mutable_data_ptr(); - float* data_out = out.mutable_data_ptr(); - xa_nn_vec_tanh_f32_f32(data_out, data_in, (int)in.numel()); - return out; - } - -- return torch::executor::native::internal::unary_ufunc_realhb_to_floath(std::tanh, ctx, in, out); -+ return torch::executor::native::internal::unary_ufunc_realhb_to_floath( -+ std::tanh, ctx, in, out); - - } - --} // namespace impl --} // namespace HiFi - } // namespace native -+} // namespace HiFi -+} // namespace impl diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py index 167a0c382f..69f6f6506c 100644 --- a/backends/arm/operators/op_permute.py +++ b/backends/arm/operators/op_permute.py @@ -1,4 +1,4 @@ -# Copyright 2023 Arm Limited and/or its affiliates. +# Copyright 2023-2024 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -18,6 +18,54 @@ from serializer.tosa_serializer import TosaOp +def permutation_vector_to_matrix(permutation_vector: list[int]) -> torch.Tensor: + """ + Converts a permutation vector of length N to a NxN matrix that describes the same permutation. + for example: + (1,0,2) + -> + [0 1 0] + |1 0 0| + [0 0 1] + """ + N = len(permutation_vector) + P = torch.zeros(N, N) + for row_index, col_index in enumerate(permutation_vector): + P[row_index][col_index] = 1 + return P + + +def permutation_matrix_to_vector(permutation_matrix: torch.Tensor) -> list[int]: + """ + Converts a NxN permutation matrix to a permutation vector of length N that describes the same permutation. + [0 1 0] + |1 0 0| + [0 0 1] + -> + (1,0,2) + """ + N = len(permutation_matrix) + assert N == len( + permutation_matrix[0] + ), f"A permutation matrix must be square, got shape {permutation_matrix.shape}" + + p = [0] * N + for row_index, row in enumerate(permutation_matrix): + saw_one = False + for col_index, value in enumerate(row): + if value == 1: + assert ( + not saw_one + ), f"A permutation matrix can only have one 1 per row, got row {row}." + p[row_index] = col_index + saw_one = True + else: + assert ( + value == 0 + ), f"A permutation matrix only contains 1's and 0's, got value {value}." 
+ return p + + @register_node_visitor class PermuteVisitor(NodeVisitor): target = "aten.permute_copy.default" @@ -40,8 +88,33 @@ def define_node( ) return + # The permutation vector describes a permutation P in default Pytorch dim_order. + # For rank 4, the default dim_order NCHW. + # E.g. (2,3,0,1) -> permute (n,c,h,w) to (w,c,n,h) + permutation_vector = inputs[1].special + + if output.dim_order != tuple(range(len(output.dim_order))): + # the permutation vector can't be used directly if we are not in NCHW dim_order. + # We need to first transform to NCHW, apply P, + # and then transform back to the original dim_order. + # This transformation, S, is also a permutation, with the dim_order as permutation vector. + + # To do this, represent P and S with permutation matrices. + # Matrices can handle chained transformations and inversion easily. + S = permutation_vector_to_matrix(output.dim_order) + # The inverse of a permutation matrix is its transpose. + S_inverse = S.transpose(1, 0) + P = permutation_vector_to_matrix(permutation_vector) + + # The complete transformation is S * P * S_inverse. + transformation_matrix = S.matmul(P.matmul(S_inverse)) + + # Luckily, since it is just a combination of permutations, the result is also a permutation + # that can again be described by a new permutation vector. + permutation_vector = permutation_matrix_to_vector(transformation_matrix) + attr = ts.TosaSerializerAttribute() - attr.TransposeAttribute(inputs[1].special) + attr.TransposeAttribute(permutation_vector) tosa_graph.addOperator( TosaOp.Op().TRANSPOSE, [inputs[0].name], [output.name], attr ) diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py new file mode 100644 index 0000000000..c7c3736e37 --- /dev/null +++ b/backends/arm/test/ops/test_hardtanh.py @@ -0,0 +1,125 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +from typing import Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +test_data_suite = [ + # (test_name, test_data) + ("zeros", torch.zeros(1, 10, 10, 10)), + ("ones", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) - 0.5), + ("randn_pos", torch.randn(10) + 10), + ("randn_neg", torch.randn(10) - 10), + ("ramp", torch.arange(-16, 16, 0.2)), +] + + +class TestHardTanh(unittest.TestCase): + """Tests HardTanh Operator.""" + + class HardTanh(torch.nn.Module): + + def __init__(self): + super().__init__() + + self.hardTanh = torch.nn.Hardtanh() + + def forward(self, x): + return self.hardTanh(x) + + def _test_hardtanh_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.hardtanh.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_hardtanh_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.hardtanh.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_hardtanh_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.hardtanh.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_hardtanh_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_hardtanh_tosa_MI_pipeline(self.HardTanh(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_hardtanh_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_hardtanh_tosa_BI_pipeline(self.HardTanh(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_hardtanh_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_hardtanh_tosa_u55_BI_pipeline(self.HardTanh(), (test_data,)) diff --git a/backends/arm/test/ops/test_permute.py 
b/backends/arm/test/ops/test_permute.py new file mode 100644 index 0000000000..6346e847c9 --- /dev/null +++ b/backends/arm/test/ops/test_permute.py @@ -0,0 +1,152 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized +from torchvision.ops import Permute + +test_data_suite = [ + # (test_name,test_data,dims) + ("rank_2", torch.rand(10, 10), [1, 0]), + ("rank_3", torch.rand(10, 10, 10), [2, 0, 1]), + ("rank_3", torch.rand(10, 10, 10), [1, 2, 0]), + ("rank_4", torch.rand(1, 5, 1, 10), [0, 2, 3, 1]), + ("rank_4", torch.rand(1, 2, 5, 10), [1, 0, 2, 3]), + ("rank_4", torch.rand(1, 10, 10, 5), [2, 0, 1, 3]), +] + + +class TestPermute(unittest.TestCase): + """Tests Permute Operator.""" + + class Permute(torch.nn.Module): + + def __init__(self, dims: list[int]): + super().__init__() + + self.permute = Permute(dims=dims) + + def forward(self, x): + return self.permute(x) + + def _test_permute_tosa_MI_pipeline( + self, + module: torch.nn.Module, + test_data: Tuple[torch.tensor], + permute_memory_to_nhwc: bool, + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec( + permute_memory_to_nhwc=permute_memory_to_nhwc + ), + ) + .export() + .check(["torch.ops.aten.permute.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_permute_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_permute_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.permute.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_permute_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_permute_ethos_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.permute.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_permute_default"]) + 
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .serialize() + ) + + @parameterized.expand(test_data_suite) + def test_permute_tosa_MI( + self, test_name: str, test_data: torch.Tensor, dims: list[int] + ): + self._test_permute_tosa_MI_pipeline(self.Permute(dims=dims), (test_data,), True) + self._test_permute_tosa_MI_pipeline( + self.Permute(dims=dims), (test_data,), False + ) + + @parameterized.expand(test_data_suite) + def test_permute_tosa_BI( + self, test_name: str, test_data: torch.Tensor, dims: list[int] + ): + self._test_permute_tosa_BI_pipeline(self.Permute(dims=dims), (test_data,)) + + # Expected to fail as TOSA.Transpose is not supported by Ethos-U55. + @parameterized.expand(test_data_suite[0:1]) + @unittest.expectedFailure + def test_permute_u55_BI( + self, test_name: str, test_data: torch.Tensor, dims: list[int] + ): + self._test_permute_ethos_BI_pipeline( + self.Permute(dims=dims), common.get_u55_compile_spec(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_permute_u85_BI( + self, test_name: str, test_data: torch.Tensor, dims: list[int] + ): + self._test_permute_ethos_BI_pipeline( + self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,) + ) diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl index 624878a17c..36b9c24317 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl @@ -102,15 +102,11 @@ VEC4_T q_8w_linear(const ivec3 out_pos, const int K) { for (int i = 0; i < K; i += 4) { const VEC4_T mat1_tex = load_texel(t_mat1, mat1_pos); - const VEC4_T sums = VEC4_T( - dot(mat1_tex, load_texel(t_qmat2, qmat2_pos) * scales.x), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_pos + u16vec3(0, 1, 0)) * scales.y), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_pos + u16vec3(0, 2, 0)) * scales.z), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_pos + u16vec3(0, 3, 0)) * scales.w)); + dot(mat1_tex, load_texel(t_qmat2, qmat2_pos)), + dot(mat1_tex, load_texel(t_qmat2, qmat2_pos + u16vec3(0, 1, 0))), + dot(mat1_tex, load_texel(t_qmat2, qmat2_pos + u16vec3(0, 2, 0))), + dot(mat1_tex, load_texel(t_qmat2, qmat2_pos + u16vec3(0, 3, 0)))); outtex += sums; @@ -118,6 +114,8 @@ VEC4_T q_8w_linear(const ivec3 out_pos, const int K) { qmat2_pos.x++; } + outtex *= scales; + return outtex; } diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 4eb47c7d05..1f8a554edc 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -19,6 +19,7 @@ build_android_native_library() { ANDROID_ABI="$1" ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" CMAKE_OUT="cmake-out-android-${ANDROID_ABI}" + EXECUTORCH_CMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE:-Release}" QNN_SDK_ROOT="${QNN_SDK_ROOT:-}" if [ -n "$QNN_SDK_ROOT" ]; then EXECUTORCH_BUILD_QNN=ON @@ -52,7 +53,7 @@ build_android_native_library() { -DNEURON_BUFFER_ALLOCATOR_LIB="${NEURON_BUFFER_ALLOCATOR_LIB}" \ -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \ -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE}" \ -B"${CMAKE_OUT}" if [ "$(uname)" == "Darwin" ]; then @@ -60,7 +61,7 @@ build_android_native_library() { else CMAKE_JOBS=$(( $(nproc) - 1 )) fi - cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release + cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config 
"${EXECUTORCH_CMAKE_BUILD_TYPE}" cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ @@ -72,10 +73,10 @@ build_android_native_library() { -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE}" \ -B"${CMAKE_OUT}"/extension/android - cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Release + cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config "${EXECUTORCH_CMAKE_BUILD_TYPE}" # Copy artifacts to ABI specific directory mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" @@ -111,6 +112,9 @@ build_aar() { # Rename libexecutorch_jni.so to libexecutorch.so for soname consistency # between Java and JNI find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; + if [ "$EXECUTORCH_CMAKE_BUILD_TYPE" == "Release" ]; then + find jni -type f -name "*.so" -exec "$ANDROID_NDK"/toolchains/llvm/prebuilt/*/bin/llvm-strip {} \; + fi # Zip all necessary files into the AAR file zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so jni/*/libneuronusdk_adapter.mtk.so AndroidManifest.xml popd diff --git a/examples/models/llama/llama_transformer.py b/examples/models/llama/llama_transformer.py index 3f93498fba..f9dc26abae 100644 --- a/examples/models/llama/llama_transformer.py +++ b/examples/models/llama/llama_transformer.py @@ -156,7 +156,7 @@ def __init__( ): super().__init__() self.max_seq_length = max_seq_length - self.is_tranposed = transpose_cache + self.is_transposed = transpose_cache if transpose_cache: cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim) else: diff --git a/examples/models/llama/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/quantized_kv_cache.py index 9977256975..6d92a45e80 100644 --- a/examples/models/llama/source_transformation/quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/quantized_kv_cache.py @@ -193,7 +193,7 @@ def update(self, input_pos, k_val, v_val): @classmethod def from_float(cls, kv_cache, cache_type: QuantizedCacheType): cache_shape = kv_cache.k_cache.shape - if kv_cache.is_tranposed: + if kv_cache.is_transposed: max_batch_size, n_heads, max_seq_length, head_dim = cache_shape else: max_batch_size, max_seq_length, n_heads, head_dim = cache_shape @@ -203,7 +203,7 @@ def from_float(cls, kv_cache, cache_type: QuantizedCacheType): n_heads, head_dim, cache_type, - kv_cache.is_tranposed, + kv_cache.is_transposed, kv_cache.enable_dynamic_shape, ) diff --git a/examples/models/llama/source_transformation/test_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_quantized_kv_cache.py index 2f38f96552..e5ade3dd12 100644 --- a/examples/models/llama/source_transformation/test_quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/test_quantized_kv_cache.py @@ -48,8 +48,8 @@ def setUp(self): self.transpose_kv_cache = False self.dtype = torch.float32 - def _test_simple_update_fetch(self, is_tranposed=False, is_dynamic_shape=False): - self.transpose_kv_cache = is_tranposed + def _test_simple_update_fetch(self, is_transposed=False, is_dynamic_shape=False): + self.transpose_kv_cache = is_transposed self.enable_dynamic_shape = is_dynamic_shape input_pos = torch.tensor([0, 1, 2]) self.seq_len = 
input_pos.size(0) @@ -122,7 +122,7 @@ def test_simple_update_fetch_not_transposed_dynamic_shape(self): self._test_simple_update_fetch(is_dynamic_shape=True) def test_simple_update_fetch_transposed(self): - self._test_simple_update_fetch(is_tranposed=True) + self._test_simple_update_fetch(is_transposed=True) def test_simple_update_fetch_transposed_dynamic_shape(self): - self._test_simple_update_fetch(is_tranposed=True, is_dynamic_shape=True) + self._test_simple_update_fetch(is_transposed=True, is_dynamic_shape=True) diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h index 84b918bf28..bfb47daa05 100644 --- a/runtime/core/exec_aten/exec_aten.h +++ b/runtime/core/exec_aten/exec_aten.h @@ -61,8 +61,8 @@ template using ArrayRef = c10::ArrayRef; template using optional = std::optional; -using nullopt_t = c10::nullopt_t; -using c10::nullopt; +using nullopt_t = std::nullopt_t; +using std::nullopt; using ScalarType = at::ScalarType; using Scalar = c10::Scalar; using MemoryFormat = c10::MemoryFormat;
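
The dim_order handling added to PermuteVisitor above composes the requested permutation with the tensor's dim_order through permutation matrices (S · P · S⁻¹) and converts the result back to a vector. A minimal standalone sketch of that composition follows; it mirrors the helpers in op_permute.py, and the dim_order and permutation values in it are illustrative assumptions, not taken from the patch or its test suite:

```python
import torch


def permutation_vector_to_matrix(permutation_vector: list[int]) -> torch.Tensor:
    # Row i gets a 1 in column permutation_vector[i], matching the convention in op_permute.py.
    N = len(permutation_vector)
    P = torch.zeros(N, N)
    for row_index, col_index in enumerate(permutation_vector):
        P[row_index][col_index] = 1
    return P


def permutation_matrix_to_vector(permutation_matrix: torch.Tensor) -> list[int]:
    # Inverse of the above: read off the column index of the single 1 in each row.
    return [int(torch.argmax(row).item()) for row in permutation_matrix]


# Illustrative values: a rank-4 tensor stored in NHWC dim_order, permuted with aten dims (0, 1, 3, 2).
dim_order = (0, 2, 3, 1)
permutation_vector = [0, 1, 3, 2]

S = permutation_vector_to_matrix(list(dim_order))
S_inverse = S.transpose(1, 0)  # transpose == inverse for a permutation matrix
P = permutation_vector_to_matrix(permutation_vector)

# Same composition as PermuteVisitor.define_node: map into NCHW, apply P, map back.
transformed = permutation_matrix_to_vector(S.matmul(P.matmul(S_inverse)))
print(transformed)  # [0, 2, 1, 3]
```

Because the composition of permutations is itself a permutation, the resulting vector can be passed straight to TransposeAttribute, as the patch does.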