diff --git a/BUILD.bazel b/BUILD.bazel index f31cf1417e1..7f6ec2d1db7 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -233,7 +233,6 @@ MICROKERNEL_HDRS = [ "src/xnnpack/pad.h", "src/xnnpack/pavgpool.h", "src/xnnpack/ppmm.h", - "src/xnnpack/prelu.h", "src/xnnpack/quantization.h", "src/xnnpack/raddexpminusmax.h", "src/xnnpack/raddextexp.h", @@ -244,9 +243,6 @@ MICROKERNEL_HDRS = [ "src/xnnpack/unpool.h", "src/xnnpack/vbinary.h", "src/xnnpack/vcvt.h", - "src/xnnpack/vhswish.h", - "src/xnnpack/vlog.h", - "src/xnnpack/vlrelu.h", "src/xnnpack/vmulcaddc.h", "src/xnnpack/vscaleexpminusmax.h", "src/xnnpack/vscaleextexp.h", @@ -323,6 +319,7 @@ xnnpack_cc_library( deps = [ ":common", ":config_hdrs", + ":fp16", ":math", ":memory", ":microparams", @@ -1247,6 +1244,18 @@ config_setting( define_values = {"xnn_enable_avx512skx": "false"}, ) +# Enables usage of Intel AVX512VBMI (evex512) kernels. +config_setting( + name = "xnn_enable_avx512vbmi_explicit_true", + define_values = {"xnn_enable_avx512vbmi": "true"}, +) + +# Disables usage of Intel AVX512VBMI (evex512) kernels. +config_setting( + name = "xnn_enable_avx512vbmi_explicit_false", + define_values = {"xnn_enable_avx512vbmi": "false"}, +) + # Enables usage of Intel AVX512VNNI (evex512) kernels. config_setting( name = "xnn_enable_avx512vnni_explicit_true", @@ -1704,6 +1713,22 @@ alias( }), ) +selects.config_setting_group( + name = "avx512vbmi_enabled_by_default", + match_any = [ + "//build_config:x86", + ], +) + +alias( + name = "avx512vbmi_enabled", + actual = select({ + ":xnn_enable_avx512vbmi_explicit_true": ":xnn_enable_avx512vbmi_explicit_true", + ":xnn_enable_avx512vbmi_explicit_false": ":xnn_enable_avx512vbmi_explicit_true", + "//conditions:default": ":avx512vbmi_enabled_by_default", + }), +) + selects.config_setting_group( name = "avx512vnni_enabled_by_default", match_any = [ diff --git a/CMakeLists.txt b/CMakeLists.txt index 039de1be556..00147e0d6de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,6 +201,16 @@ ELSEIF(CMAKE_C_COMPILER_ID STREQUAL "Clang") SET(XNNPACK_ENABLE_AVX512SKX OFF) ENDIF() ENDIF() +OPTION(XNNPACK_ENABLE_AVX512VBMI "Build XNNPACK with AVX512VBMI micro-kernels" ON) +IF(CMAKE_C_COMPILER_ID STREQUAL "GNU") + IF(CMAKE_C_COMPILER_VERSION VERSION_LESS "8") + SET(XNNPACK_ENABLE_AVX512VBMI OFF) + ENDIF() +ELSEIF(CMAKE_C_COMPILER_ID STREQUAL "Clang") + IF(CMAKE_C_COMPILER_VERSION VERSION_LESS "6") + SET(XNNPACK_ENABLE_AVX512VBMI OFF) + ENDIF() +ENDIF() OPTION(XNNPACK_ENABLE_AVX512VNNI "Build XNNPACK with AVX512VNNI micro-kernels" ON) IF(CMAKE_C_COMPILER_ID STREQUAL "GNU") IF(CMAKE_C_COMPILER_VERSION VERSION_LESS "8") @@ -278,6 +288,7 @@ ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX256VNNI=$") ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512F=$") ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512SKX=$") +ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512VBMI=$") ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512VNNI=$") ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512VNNIGFNI=$") ADD_COMPILE_DEFINITIONS("XNN_ENABLE_AVX512AMX=$") @@ -418,7 +429,6 @@ SET(OPERATOR_SRCS src/operators/global-average-pooling-nwc.c src/operators/lut-elementwise-nc.c src/operators/max-pooling-nhwc.c - src/operators/prelu-nc.c src/operators/reduce-nd.c src/operators/resize-bilinear-nchw.c src/operators/resize-bilinear-nhwc.c @@ -464,7 +474,6 @@ SET(SUBGRAPH_SRCS src/subgraph/log.c src/subgraph/max-pooling-2d.c src/subgraph/negate.c - src/subgraph/prelu.c src/subgraph/reciprocal-square-root.c src/subgraph/reshape-helpers.c src/subgraph/scaled-dot-product-attention.c @@ -508,7 +517,6 @@ 
SET(XNNPACK_SRCS src/configs/lut32norm-config.c src/configs/maxpool-config.c src/configs/pavgpool-config.c - src/configs/prelu-config.c src/configs/raddstoreexpminusmax-config.c src/configs/reduce-config.c src/configs/rmax-config.c @@ -657,7 +665,6 @@ IF(XNNPACK_TARGET_PROCESSOR MATCHES "^x86(_64)?$") LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_F16C_MICROKERNEL_SRCS}) LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_FMA3_MICROKERNEL_SRCS}) LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX2_MICROKERNEL_SRCS}) - LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX512VBMI_MICROKERNEL_SRCS}) IF(XNNPACK_ENABLE_AVX512AMX) LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX512AMX_MICROKERNEL_SRCS}) ENDIF() @@ -685,6 +692,9 @@ IF(XNNPACK_TARGET_PROCESSOR MATCHES "^x86(_64)?$") IF(XNNPACK_ENABLE_AVX512SKX) LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX512SKX_MICROKERNEL_SRCS}) ENDIF() + IF(XNNPACK_ENABLE_AVX512VBMI) + LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX512VBMI_MICROKERNEL_SRCS}) + ENDIF() IF(XNNPACK_ENABLE_AVX512VNNI) LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX512VNNI_MICROKERNEL_SRCS}) ENDIF() @@ -702,7 +712,6 @@ IF(XNNPACK_TARGET_PROCESSOR MATCHES "^x86(_64)?$") LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_F16C_MICROKERNEL_SRCS}) LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_FMA3_MICROKERNEL_SRCS}) LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX2_MICROKERNEL_SRCS}) - LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX512VBMI_MICROKERNEL_SRCS}) IF(XNNPACK_ENABLE_AVX512AMX) LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX512AMX_MICROKERNEL_SRCS}) ENDIF() @@ -730,6 +739,9 @@ IF(XNNPACK_TARGET_PROCESSOR MATCHES "^x86(_64)?$") IF(XNNPACK_ENABLE_AVX512SKX) LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX512SKX_MICROKERNEL_SRCS}) ENDIF() + IF(XNNPACK_ENABLE_AVX512VBMI) + LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX512VBMI_MICROKERNEL_SRCS}) + ENDIF() IF(XNNPACK_ENABLE_AVX512VNNI) LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_AVX512VNNI_MICROKERNEL_SRCS}) ENDIF() @@ -836,8 +848,8 @@ IF(XNNPACK_BUILD_LIBRARY) TARGET_LINK_LIBRARIES(operator-run PRIVATE xnnpack-base logging) TARGET_LINK_LIBRARIES(operator-utils PRIVATE xnnpack-base logging) TARGET_LINK_LIBRARIES(subgraph PRIVATE xnnpack-base allocator logging memory mutex operators operator-run) - TARGET_LINK_LIBRARIES(XNNPACK PRIVATE xnnpack-base allocator cache hardware-config indirection logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing microkernels-prod subgraph) - TARGET_LINK_LIBRARIES(XNNPACK PUBLIC pthreadpool) + TARGET_LINK_LIBRARIES(XNNPACK PRIVATE xnnpack-base allocator cache hardware-config indirection memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing microkernels-prod subgraph) + TARGET_LINK_LIBRARIES(XNNPACK PUBLIC pthreadpool logging) SET_TARGET_PROPERTIES(XNNPACK PROPERTIES C_EXTENSIONS YES) ENDIF() IF(NOT MSVC) @@ -984,7 +996,7 @@ IF(XNNPACK_TARGET_PROCESSOR MATCHES "^x86(_64)?$") ENDIF() # Set `XNN_LOG_LEVEL` transitively for all targets that depend on `logging`. -TARGET_COMPILE_DEFINITIONS(logging PUBLIC "XNN_LOG_LEVEL=$<$:4>$<$>:0>") +TARGET_COMPILE_DEFINITIONS(logging PUBLIC "XNN_LOG_LEVEL=$<$:5>$<$>:0>") IF(MSVC) # Even though MSVC has __restrict, it can't be used in all the same contexts as the C99 restrict keyword @@ -1033,9 +1045,9 @@ ELSE() ENDIF() IF(XNNPACK_BUILD_ALL_MICROKERNELS) - TARGET_INCLUDE_DIRECTORIES(microkernels-all PRIVATE . 
include src) + TARGET_INCLUDE_DIRECTORIES(microkernels-all PRIVATE include src) ENDIF() -TARGET_INCLUDE_DIRECTORIES(microkernels-prod PRIVATE . include src) +TARGET_INCLUDE_DIRECTORIES(microkernels-prod PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(hardware-config PRIVATE include src ${CPUINFO_SOURCE_DIR}/include) TARGET_INCLUDE_DIRECTORIES(indirection PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(microparams-init PRIVATE include src) @@ -1044,13 +1056,13 @@ TARGET_INCLUDE_DIRECTORIES(packing PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(logging PRIVATE include src) IF(XNNPACK_BUILD_LIBRARY) TARGET_INCLUDE_DIRECTORIES(XNNPACK PUBLIC include) - TARGET_INCLUDE_DIRECTORIES(XNNPACK PRIVATE . src) + TARGET_INCLUDE_DIRECTORIES(XNNPACK PRIVATE src) TARGET_INCLUDE_DIRECTORIES(allocator PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(cache PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(microkernel-utils PRIVATE include src) - TARGET_INCLUDE_DIRECTORIES(subgraph PRIVATE . include src) - TARGET_INCLUDE_DIRECTORIES(operators PRIVATE . include src) - TARGET_INCLUDE_DIRECTORIES(operator-run PRIVATE . include src) + TARGET_INCLUDE_DIRECTORIES(subgraph PRIVATE include src) + TARGET_INCLUDE_DIRECTORIES(operators PRIVATE include src) + TARGET_INCLUDE_DIRECTORIES(operator-run PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(operator-utils PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(memory PRIVATE include src) TARGET_INCLUDE_DIRECTORIES(mutex PRIVATE include src) @@ -1214,7 +1226,7 @@ IF(XNNPACK_BUILD_TESTS) ADD_LIBRARY(next-prime STATIC test/next_prime.cc) ADD_LIBRARY(gemm-microkernel-tester STATIC test/gemm-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(gemm-microkernel-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(gemm-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(gemm-microkernel-tester PRIVATE xnnpack-base pthreadpool GTest::gtest) TARGET_LINK_LIBRARIES(gemm-microkernel-tester PRIVATE packing) IF(XNNPACK_ENABLE_KLEIDIAI) @@ -1223,25 +1235,25 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(gemm-microkernel-tester PUBLIC next-prime) ADD_LIBRARY(unary-operator-tester STATIC test/unary-operator-tester.cc) - TARGET_INCLUDE_DIRECTORIES(unary-operator-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(unary-operator-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(unary-operator-tester PRIVATE XNNPACK pthreadpool GTest::gtest) ADD_LIBRARY(dwconv-microkernel-tester STATIC test/dwconv-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(dwconv-microkernel-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(dwconv-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(dwconv-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) TARGET_LINK_LIBRARIES(dwconv-microkernel-tester PUBLIC next-prime) ADD_LIBRARY(vbinary-microkernel-tester STATIC test/vbinary-microkernel-tester.cc) SET_TARGET_PROPERTIES(vbinary-microkernel-tester PROPERTIES CXX_EXTENSIONS YES) - TARGET_INCLUDE_DIRECTORIES(vbinary-microkernel-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(vbinary-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(vbinary-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) ADD_LIBRARY(vcvt-microkernel-tester STATIC test/vcvt-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(vcvt-microkernel-tester PRIVATE . 
include src test) + TARGET_INCLUDE_DIRECTORIES(vcvt-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(vcvt-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) ADD_LIBRARY(vunary-microkernel-tester STATIC test/vunary-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(vunary-microkernel-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(vunary-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(vunary-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) TARGET_LINK_LIBRARIES(vunary-microkernel-tester PUBLIC next-prime) @@ -1250,7 +1262,7 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(convolution-test-helpers PRIVATE xnnpack-base) ADD_LIBRARY(packq-microkernel-tester STATIC test/packq-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(packq-microkernel-tester PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(packq-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(packq-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) IF(XNNPACK_ENABLE_KLEIDIAI) TARGET_LINK_LIBRARIES(packq-microkernel-tester PRIVATE kleidiai) @@ -1269,7 +1281,7 @@ IF(XNNPACK_BUILD_TESTS) ) FOREACH(TEST ${SHARDED_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE GTest::gtest GTest::gtest_main @@ -1295,7 +1307,7 @@ IF(XNNPACK_BUILD_TESTS) ) FOREACH(TEST ${LIBRARY_SHARDED_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE GTest::gmock GTest::gtest @@ -1305,9 +1317,6 @@ IF(XNNPACK_BUILD_TESTS) ENDFOREACH() # ---[ Build size tests - ADD_EXECUTABLE(operator-size-test test/operator-size.c) - TARGET_LINK_LIBRARIES(operator-size-test PRIVATE XNNPACK) - ADD_EXECUTABLE(subgraph-size-test test/subgraph-size.c) TARGET_LINK_LIBRARIES(subgraph-size-test PRIVATE XNNPACK) @@ -1402,7 +1411,6 @@ IF(XNNPACK_BUILD_TESTS) log max-pooling-2d negate - prelu reciprocal-square-root reshape-helpers sigmoid @@ -1473,7 +1481,6 @@ IF(XNNPACK_BUILD_TESTS) f16-gavgpool-minmax f16-ibilinear-chw f16-ibilinear - f16-prelu f16-raddstoreexpminusmax f16-rmax f16-rsum @@ -1486,7 +1493,6 @@ IF(XNNPACK_BUILD_TESTS) f32-gavgpool-minmax f32-ibilinear-chw f32-ibilinear - f32-prelu f32-raddexpminusmax f32-raddextexp f32-raddstoreexpminusmax @@ -1534,7 +1540,7 @@ IF(XNNPACK_BUILD_TESTS) xx-pad) FOREACH(TEST ${MICROKERNEL_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE GTest::gmock GTest::gtest @@ -1573,7 +1579,7 @@ IF(XNNPACK_BUILD_TESTS) qu8-dwconv-minmax-unipass-rndnu) FOREACH(TEST ${MICROKERNEL_DWCONV_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE dwconv-microkernel-tester GTest::gmock @@ -1627,7 +1633,7 @@ IF(XNNPACK_BUILD_TESTS) FILE(GLOB TEST_SOURCES "test/${TEST}*.cc") IF(TEST_SOURCES) ADD_EXECUTABLE(${TEST}-test ${TEST_SOURCES}) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . 
include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE gemm-microkernel-tester GTest::gmock @@ -1648,7 +1654,7 @@ IF(XNNPACK_BUILD_TESTS) x8-packq) FOREACH(TEST ${MICROKERNEL_PACKQ_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE packq-microkernel-tester GTest::gmock @@ -1720,7 +1726,7 @@ IF(XNNPACK_BUILD_TESTS) s32-vmulc) FOREACH(TEST ${MICROKERNEL_VBINARY_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE vbinary-microkernel-tester GTest::gmock @@ -1752,7 +1758,7 @@ IF(XNNPACK_BUILD_TESTS) u32-f32-vcvt) FOREACH(TEST ${MICROKERNEL_VCVT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE vcvt-microkernel-tester GTest::gmock @@ -1805,7 +1811,7 @@ IF(XNNPACK_BUILD_TESTS) u8-vclamp) FOREACH(TEST ${MICROKERNEL_VUNARY_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE . include src test) + TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE vunary-microkernel-tester GTest::gmock @@ -1885,7 +1891,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) ENDIF() ADD_LIBRARY(bench-utils STATIC bench/utils.cc) - TARGET_INCLUDE_DIRECTORIES(bench-utils PRIVATE .) TARGET_INCLUDE_DIRECTORIES(bench-utils PUBLIC include src) TARGET_LINK_LIBRARIES(bench-utils PRIVATE benchmark::benchmark cpuinfo pthreadpool) TARGET_LINK_LIBRARIES(bench-utils PRIVATE xnnpack-base hardware-config) @@ -1895,14 +1900,14 @@ IF(XNNPACK_BUILD_BENCHMARKS) # Helper libraries ADD_LIBRARY(packq-benchmark STATIC bench/packq-benchmark.cc) - TARGET_INCLUDE_DIRECTORIES(packq-benchmark PRIVATE . include src bench) + TARGET_INCLUDE_DIRECTORIES(packq-benchmark PRIVATE include src bench) TARGET_LINK_LIBRARIES(packq-benchmark PRIVATE XNNPACK benchmark::benchmark bench-utils) IF(XNNPACK_ENABLE_KLEIDIAI) TARGET_LINK_LIBRARIES(packq-benchmark PRIVATE kleidiai) ENDIF() ADD_LIBRARY(gemm-benchmark STATIC bench/gemm-benchmark.cc) - TARGET_INCLUDE_DIRECTORIES(gemm-benchmark PRIVATE . include src bench) + TARGET_INCLUDE_DIRECTORIES(gemm-benchmark PRIVATE include src bench) TARGET_LINK_LIBRARIES(gemm-benchmark PRIVATE XNNPACK benchmark::benchmark bench-utils) IF(XNNPACK_ENABLE_KLEIDIAI) TARGET_LINK_LIBRARIES(gemm-benchmark PUBLIC kleidiai) @@ -1921,11 +1926,10 @@ IF(XNNPACK_BUILD_BENCHMARKS) bench/models/fp32-mobilenet-v3-small.cc bench/models/qs8-mobilenet-v2.cc) SET_TARGET_PROPERTIES(models PROPERTIES CXX_EXTENSIONS YES) - TARGET_INCLUDE_DIRECTORIES(models PRIVATE .) TARGET_LINK_LIBRARIES(models PRIVATE XNNPACK) ADD_EXECUTABLE(bench-models bench/models/benchmark.cc) - TARGET_INCLUDE_DIRECTORIES(bench-models PRIVATE .) 
+ TARGET_INCLUDE_DIRECTORIES(bench-models PRIVATE bench) TARGET_LINK_LIBRARIES(bench-models PRIVATE bench-utils benchmark::benchmark @@ -1949,7 +1953,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) leaky-relu max-pooling negate - prelu reciprocal-square-root sigmoid softmax @@ -1959,7 +1962,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) tanh) FOREACH(BENCH ${LIBRARY_OPERATOR_BENCHMARKS}) ADD_EXECUTABLE(${BENCH}-bench bench/${BENCH}.cc) - TARGET_INCLUDE_DIRECTORIES(${BENCH}-bench PRIVATE .) TARGET_LINK_LIBRARIES(${BENCH}-bench PRIVATE bench-utils benchmark::benchmark @@ -2055,7 +2057,7 @@ IF(XNNPACK_BUILD_BENCHMARKS) xx-transposev) FOREACH(BENCH ${MICROKERNEL_BENCHMARKS}) ADD_EXECUTABLE(${BENCH}-bench bench/${BENCH}.cc) - TARGET_INCLUDE_DIRECTORIES(${BENCH}-bench PRIVATE . include src) + TARGET_INCLUDE_DIRECTORIES(${BENCH}-bench PRIVATE include src) TARGET_LINK_LIBRARIES(${BENCH}-bench PRIVATE bench-utils benchmark::benchmark diff --git a/bench/BUILD.bazel b/bench/BUILD.bazel index 9e0a6862e39..bb6e122d1e9 100644 --- a/bench/BUILD.bazel +++ b/bench/BUILD.bazel @@ -51,6 +51,8 @@ OPERATOR_BENCHMARK_DEPS = [ "//:math", ] +############################### Helper libraries ############################### + xnnpack_cxx_library( name = "bench_utils", srcs = ["utils.cc"], @@ -98,8 +100,6 @@ cc_library( ], ) -######################### Benchmarks for micro-kernels ######################### - xnnpack_cxx_library( name = "gemm_benchmark", srcs = [ @@ -115,6 +115,29 @@ xnnpack_cxx_library( ], ) +xnnpack_cxx_library( + name = "packw_benchmark", + hdrs = [ + "packw-benchmark.h", + ], + deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", + "@com_google_benchmark//:benchmark", + ], +) + +xnnpack_cxx_library( + name = "bgemm", + hdrs = [ + "bgemm.h", + ], + deps = MICROKERNEL_BENCHMARK_DEPS + [ + "@com_google_benchmark//:benchmark", + ], +) + +######################### Benchmarks for micro-kernels ######################### + [xnnpack_benchmark( name = "%s_bench" % kernel, srcs = [ @@ -167,12 +190,12 @@ xnnpack_cxx_library( xnnpack_benchmark( name = "f32_bgemm_bench", srcs = [ - "bgemm.h", "f32-bgemm.cc", ], copts = xnnpack_optional_ruy_copts(), tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", "//:allocator", ] + xnnpack_optional_ruy_deps(), ) @@ -192,6 +215,19 @@ xnnpack_benchmark( ]), ) +xnnpack_benchmark( + name = "qp8_f32_qb4w_gemm", + srcs = ["qp8-f32-qb4w-gemm.cc"], + defines = xnnpack_kleidiai_defines(), + tags = xnnpack_slow_benchmark_tags(), + deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":gemm_benchmark", + "//:isa_checks", + ] + xnnpack_if_kleidiai_enabled([ + "@KleidiAI//kai/ukernels/matmul", + ]), +) + [xnnpack_benchmark( name = "%s_bench" % kernel, srcs = [ @@ -292,6 +328,7 @@ xnnpack_benchmark( srcs = [ "qs8-dwconv.cc", ], + tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ ":dwconv", "//:indirection", @@ -470,11 +507,11 @@ xnnpack_benchmark( xnnpack_cxx_library( name = "packq_benchmark", srcs = [ - "bgemm.h", "packq-benchmark.cc", ], hdrs = ["packq-benchmark.h"], deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", "@com_google_benchmark//:benchmark", ], ) @@ -482,10 +519,11 @@ xnnpack_cxx_library( xnnpack_benchmark( name = "x8_packq_bench", srcs = [ - "bgemm.h", "x8-packq.cc", ], + tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", ":packq_benchmark", "//:allocator", ], @@ -494,11 +532,12 @@ xnnpack_benchmark( xnnpack_benchmark( name = "x8_packw_bench", srcs = [ - "bgemm.h", - "packw-benchmark.h", "x8-packw.cc", ], + tags = 
xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", + ":packw_benchmark", "//:allocator", ], ) @@ -506,11 +545,12 @@ xnnpack_benchmark( xnnpack_benchmark( name = "qs8_packw_bench", srcs = [ - "bgemm.h", - "packw-benchmark.h", "qs8-packw.cc", ], + tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", + ":packw_benchmark", "//:allocator", ], ) @@ -518,11 +558,12 @@ xnnpack_benchmark( xnnpack_benchmark( name = "x16_packw_bench", srcs = [ - "bgemm.h", - "packw-benchmark.h", "x16-packw.cc", ], + tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", + ":packw_benchmark", "//:allocator", ], ) @@ -530,11 +571,12 @@ xnnpack_benchmark( xnnpack_benchmark( name = "x32_packw_bench", srcs = [ - "bgemm.h", - "packw-benchmark.h", "x32-packw.cc", ], + tags = xnnpack_slow_benchmark_tags(), deps = MICROKERNEL_BENCHMARK_DEPS + [ + ":bgemm", + ":packw_benchmark", "//:allocator", ], ) diff --git a/bench/abs.cc b/bench/abs.cc index c33171ddc14..03864613091 100644 --- a/bench/abs.cc +++ b/bench/abs.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/average-pooling.cc b/bench/average-pooling.cc index fa939165fc3..6c3deca8616 100644 --- a/bench/average-pooling.cc +++ b/bench/average-pooling.cc @@ -27,7 +27,7 @@ #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/version.h" #endif // BENCHMARK_TENSORFLOW_LITE -#include "bench/utils.h" +#include "utils.h" static void xnnpack_average_pooling_qu8(benchmark::State& state, const char* net) { const size_t batch_size = state.range(0); diff --git a/bench/bankers-rounding.cc b/bench/bankers-rounding.cc index 69c94384c35..870de837061 100644 --- a/bench/bankers-rounding.cc +++ b/bench/bankers-rounding.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/batch-matrix-multiply.cc b/bench/batch-matrix-multiply.cc index e45eccd46e8..b09cdebc34b 100644 --- a/bench/batch-matrix-multiply.cc +++ b/bench/batch-matrix-multiply.cc @@ -15,7 +15,7 @@ #include "xnnpack.h" #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" #ifdef BENCHMARK_TENSORFLOW_LITE #include "flatbuffers/include/flatbuffers/flatbuffers.h" diff --git a/bench/bf16-gemm.cc b/bench/bf16-gemm.cc index 98289df55b5..01ef4a8f475 100644 --- a/bench/bf16-gemm.cc +++ b/bench/bf16-gemm.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" diff --git a/bench/ceiling.cc b/bench/ceiling.cc index a377ae9289a..da08f7bb6d7 100644 --- a/bench/ceiling.cc +++ b/bench/ceiling.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/channel-shuffle.cc b/bench/channel-shuffle.cc index 61d138849c2..0a9f820ba3a 100644 --- a/bench/channel-shuffle.cc +++ b/bench/channel-shuffle.cc @@ -16,7 +16,7 @@ #include "xnnpack.h" #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" diff --git a/bench/convert.cc b/bench/convert.cc index 77e6f2f7558..7c1725dc875 100644 --- a/bench/convert.cc +++ b/bench/convert.cc @@ -3,12 +3,12 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include "xnnpack.h" - #include #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "flatbuffers/include/flatbuffers/flatbuffers.h" @@ -33,9 +33,7 @@ void xnnpack_convert_f32_qs8(benchmark::State& state) { benchmark_unary_operator( [](uint32_t flags, xnn_operator_t* op) { return xnn_create_convert_nc_f32_qs8( - 1.0f / 128.0f /* scale */, 1 /* zero point */, - std::numeric_limits::min(), - std::numeric_limits::max(), flags, op); + 1.0f / 128.0f /* scale */, 1 /* zero point */, flags, op); }, xnn_reshape_convert_nc_f32_qs8, xnn_setup_convert_nc_f32_qs8, state); } @@ -44,9 +42,7 @@ void xnnpack_convert_f32_qu8(benchmark::State& state) { benchmark_unary_operator( [](uint32_t flags, xnn_operator_t* op) { return xnn_create_convert_nc_f32_qu8( - 1.0f / 128.0f /* scale */, 127 /* zero point */, - std::numeric_limits::min(), - std::numeric_limits::max(), flags, op); + 1.0f / 128.0f /* scale */, 127 /* zero point */, flags, op); }, xnn_reshape_convert_nc_f32_qu8, xnn_setup_convert_nc_f32_qu8, state); } diff --git a/bench/convolution.cc b/bench/convolution.cc index 0011ed3545c..f6670b91351 100644 --- a/bench/convolution.cc +++ b/bench/convolution.cc @@ -28,7 +28,7 @@ #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/version.h" #endif // BENCHMARK_TENSORFLOW_LITE -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" void xnnpack_convolution_qu8(benchmark::State& state, const char* net) { diff --git a/bench/deconvolution.cc b/bench/deconvolution.cc index 63e3f02b78e..3fb970c3149 100644 --- a/bench/deconvolution.cc +++ b/bench/deconvolution.cc @@ -25,7 +25,7 @@ #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/version.h" #endif // BENCHMARK_TENSORFLOW_LITE */ -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" void xnnpack_deconvolution_qu8(benchmark::State& state, const char* net) { diff --git a/bench/elu.cc b/bench/elu.cc index d2823c97e2b..7912502bc91 100644 --- a/bench/elu.cc +++ b/bench/elu.cc @@ -3,13 +3,13 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include #include #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "flatbuffers/include/flatbuffers/flatbuffer_builder.h" diff --git a/bench/f16-conv-hwc2chw.cc b/bench/f16-conv-hwc2chw.cc index 32ace527234..693871262bf 100644 --- a/bench/f16-conv-hwc2chw.cc +++ b/bench/f16-conv-hwc2chw.cc @@ -10,8 +10,8 @@ #include #include -#include "bench/dconv.h" -#include "bench/utils.h" +#include "dconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/conv.h" diff --git a/bench/f16-dwconv.cc b/bench/f16-dwconv.cc index ad08b6929e0..9733466d699 100644 --- a/bench/f16-dwconv.cc +++ b/bench/f16-dwconv.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/dwconv.h" -#include "bench/utils.h" +#include "dwconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/dwconv.h" diff --git a/bench/f16-dwconv2d-chw.cc b/bench/f16-dwconv2d-chw.cc index 60d819f4f8b..a9f90ae24b5 100644 --- a/bench/f16-dwconv2d-chw.cc +++ b/bench/f16-dwconv2d-chw.cc @@ -10,8 +10,8 @@ #include #include -#include "bench/dwconv.h" -#include "bench/utils.h" +#include "dwconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/dwconv.h" diff --git a/bench/f16-f32-vcvt.cc b/bench/f16-f32-vcvt.cc index 7eca850e69f..07dacd35854 100644 --- a/bench/f16-f32-vcvt.cc +++ b/bench/f16-f32-vcvt.cc @@ -3,16 +3,17 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" #include "xnnpack/microparams-init.h" +#include "xnnpack/microparams.h" #include "xnnpack/vcvt.h" +#include static void f16_f32_vcvt( benchmark::State& state, @@ -28,7 +29,7 @@ static void f16_f32_vcvt( BENCHMARK_CAPTURE(f16_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f16-f32-vcvt/f16-f32-vcvt.h" +#include "f16-f32-vcvt/f16-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/f16-f32acc-gemm.cc b/bench/f16-f32acc-gemm.cc index f81faef4c27..3678fcadf0e 100644 --- a/bench/f16-f32acc-gemm.cc +++ b/bench/f16-f32acc-gemm.cc @@ -14,8 +14,8 @@ #include #include -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" diff --git a/bench/f16-f32acc-igemm.cc b/bench/f16-f32acc-igemm.cc index 0138241f467..5377556af6e 100644 --- a/bench/f16-f32acc-igemm.cc +++ b/bench/f16-f32acc-igemm.cc @@ -10,8 +10,8 @@ #include #include -#include "bench/conv.h" -#include "bench/utils.h" +#include "conv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/igemm.h" diff --git a/bench/f16-f32acc-rdsum.cc b/bench/f16-f32acc-rdsum.cc index e9d7734bee4..367e9a7e766 100644 --- a/bench/f16-f32acc-rdsum.cc +++ b/bench/f16-f32acc-rdsum.cc @@ -7,8 +7,8 @@ // Specification: test/f16-f32acc-rdsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" 
+#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f16-f32acc-rsum.cc b/bench/f16-f32acc-rsum.cc index aa4f3411b5b..f087668d615 100644 --- a/bench/f16-f32acc-rsum.cc +++ b/bench/f16-f32acc-rsum.cc @@ -7,8 +7,8 @@ // Specification: test/f16-f32acc-rsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f16-gavgpool-cw.cc b/bench/f16-gavgpool-cw.cc index 2091dbf7879..b8913fffb10 100644 --- a/bench/f16-gavgpool-cw.cc +++ b/bench/f16-gavgpool-cw.cc @@ -9,16 +9,15 @@ #include #include -#include "bench/utils.h" -#include - +#include "utils.h" #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" #include "xnnpack/gavgpool.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" - +#include void f16_gavgpool_cw( benchmark::State& state, diff --git a/bench/f16-gemm-minmax.cc b/bench/f16-gemm-minmax.cc index c0b491ec36c..c1834ba6117 100644 --- a/bench/f16-gemm-minmax.cc +++ b/bench/f16-gemm-minmax.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/f16-gemm.cc b/bench/f16-gemm.cc index 15196e54cc9..53de4dcd497 100644 --- a/bench/f16-gemm.cc +++ b/bench/f16-gemm.cc @@ -13,8 +13,8 @@ #include #include -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" diff --git a/bench/f16-igemm.cc b/bench/f16-igemm.cc index abed264c5a8..8269b73165f 100644 --- a/bench/f16-igemm.cc +++ b/bench/f16-igemm.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/conv.h" -#include "bench/utils.h" +#include "conv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/igemm.h" diff --git a/bench/f16-qs8-vcvt.cc b/bench/f16-qs8-vcvt.cc index 0727b2eab6b..71f00b7f807 100644 --- a/bench/f16-qs8-vcvt.cc +++ b/bench/f16-qs8-vcvt.cc @@ -7,8 +7,8 @@ #include #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" @@ -27,9 +27,7 @@ static void f16_qs8_vcvt( xnn_f16_qs8_cvt_params params; init_params(¶ms, 1.0f /* scale */, - 1 /* output zero point */, - std::numeric_limits::min() + 1 /* output min */, - std::numeric_limits::max() - 1 /* output max */); + 1 /* output zero point */); cvt_benchmark(state, arch_flags, cvt, ¶ms); } @@ -39,7 +37,7 @@ static void f16_qs8_vcvt( BENCHMARK_CAPTURE(f16_qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f16-qs8-vcvt/f16-qs8-vcvt.h" +#include "f16-qs8-vcvt/f16-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/f16-raddstoreexpminusmax.cc b/bench/f16-raddstoreexpminusmax.cc index 407cad1ee3a..f3cf127c743 100644 --- a/bench/f16-raddstoreexpminusmax.cc +++ b/bench/f16-raddstoreexpminusmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/math.h" diff --git a/bench/f16-rmax.cc b/bench/f16-rmax.cc index 
4343c6d6386..a2d55817ecf 100644 --- a/bench/f16-rmax.cc +++ b/bench/f16-rmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/math.h" @@ -128,7 +128,7 @@ static void f16_rmax( ->UseRealTime(); #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f16_rmax, avx512skx_u16, xnn_f16_rmax_ukernel__avx512skx_u16, /*init_params=*/nullptr, @@ -159,7 +159,9 @@ static void f16_rmax( benchmark::utils::CheckAVX512SKX) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_rmax, f16c_u32, xnn_f16_rmax_ukernel__f16c_u32, /*init_params=*/nullptr, diff --git a/bench/f16-rmin.cc b/bench/f16-rmin.cc index 09d14b09553..87888553d5b 100644 --- a/bench/f16-rmin.cc +++ b/bench/f16-rmin.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/math.h" @@ -128,7 +128,7 @@ static void f16_rmin( ->UseRealTime(); #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f16_rmin, avx512skx_u16, xnn_f16_rmin_ukernel__avx512skx_u16, /*init_params=*/nullptr, @@ -159,7 +159,7 @@ static void f16_rmin( benchmark::utils::CheckAVX512SKX) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f16_rmin, scalar_u1, xnn_f16_rmin_ukernel__scalar_u1) diff --git a/bench/f16-rminmax.cc b/bench/f16-rminmax.cc index 1d931d6de88..32866e1df62 100644 --- a/bench/f16-rminmax.cc +++ b/bench/f16-rminmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/math.h" @@ -128,7 +128,7 @@ static void f16_rminmax( ->UseRealTime(); #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f16_rminmax, avx512skx_u16, xnn_f16_rminmax_ukernel__avx512skx_u16, /*init_params=*/nullptr, @@ -159,7 +159,7 @@ static void f16_rminmax( benchmark::utils::CheckAVX512SKX) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f16_rminmax, scalar_u1, xnn_f16_rminmax_ukernel__scalar_u1) diff --git a/bench/f16-rsum.cc b/bench/f16-rsum.cc index 230016a5cfe..89fcafcf996 100644 --- a/bench/f16-rsum.cc +++ b/bench/f16-rsum.cc @@ -7,8 +7,8 @@ // Specification: test/f16-rsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f16-spmm.cc b/bench/f16-spmm.cc index dfe95839b79..a9cc6ae6d36 100644 --- a/bench/f16-spmm.cc +++ b/bench/f16-spmm.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/spmm.h" -#include "bench/utils.h" +#include "spmm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/math.h" 
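Note on the pattern applied above (f16-rmax, f16-rmin, f16-rminmax) and in the f32-dwconv, f32-gemm, f32-igemm, f32-qc4w-gemm and f32-qc8w-gemm hunks below: AVX512 benchmark registrations are moved under the matching XNN_ENABLE_* preprocessor guard, so they are compiled only when the corresponding kernels are enabled in the build. A minimal sketch of the guard, with the flag names taken from the build changes earlier in this patch (illustrative only, not part of the patch itself):

    // XNN_ARCH_* macros come from xnnpack/common.h; XNN_ENABLE_AVX512SKX is injected by the
    // build (ADD_COMPILE_DEFINITIONS in CMake, the config_setting/define machinery in Bazel).
    #if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
    // ... AVX512SKX benchmark registrations; this block is skipped entirely when building
    // with -DXNNPACK_ENABLE_AVX512SKX=OFF (CMake) or --define=xnn_enable_avx512skx=false (Bazel).
    #endif  // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64)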
diff --git a/bench/f16-vcmul.cc b/bench/f16-vcmul.cc index f72a0473818..26ef075c606 100644 --- a/bench/f16-vcmul.cc +++ b/bench/f16-vcmul.cc @@ -10,14 +10,15 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" #include "xnnpack/vbinary.h" -#include "xnnpack/buffer.h" #include static void f16_vcmul(benchmark::State& state, uint64_t arch_flags, @@ -72,7 +73,7 @@ static void f16_vcmul(benchmark::State& state, uint64_t arch_flags, benchmark::utils::BinaryElementwiseParameters, \ std::complex>) \ ->UseRealTime(); -#include "src/f16-vbinary/f16-vcmul.h" +#include "f16-vbinary/f16-vcmul.h" #undef XNN_UKERNEL_WITH_PARAMS #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/f32-bgemm.cc b/bench/f32-bgemm.cc index 2c210d00bf2..f078bca6aae 100644 --- a/bench/f32-bgemm.cc +++ b/bench/f32-bgemm.cc @@ -17,8 +17,8 @@ #ifdef BENCHMARK_RUY #include "ruy/ruy.h" #endif // BENCHMARK_RUY -#include "bench/bgemm.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "utils.h" #include "xnnpack/allocator.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" diff --git a/bench/f32-conv-hwc.cc b/bench/f32-conv-hwc.cc index 07048f96e23..86aad770c86 100644 --- a/bench/f32-conv-hwc.cc +++ b/bench/f32-conv-hwc.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/dconv.h" -#include "bench/utils.h" +#include "dconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/conv.h" diff --git a/bench/f32-conv-hwc2chw.cc b/bench/f32-conv-hwc2chw.cc index d77c8cbab6f..ddeabbf47df 100644 --- a/bench/f32-conv-hwc2chw.cc +++ b/bench/f32-conv-hwc2chw.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/dconv.h" -#include "bench/utils.h" +#include "dconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/conv.h" diff --git a/bench/f32-dwconv.cc b/bench/f32-dwconv.cc index 6ab991c2a88..fa21809f074 100644 --- a/bench/f32-dwconv.cc +++ b/bench/f32-dwconv.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/dwconv.h" -#include "bench/utils.h" +#include "dwconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/dwconv.h" @@ -825,9 +825,69 @@ static void f32_dwconv( BENCHMARK_DWCONV(f32_dwconv_8f8m9l4c4s4r__neon_acc2) BENCHMARK_DWCONV(f32_dwconv_8f8m9l8c4s4r__neon) BENCHMARK_DWCONV(f32_dwconv_8f8m9l8c4s4r__neon_acc2) - #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + static void f32_dwconv_25p16c__avx512f(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f, + xnn_init_f32_minmax_scalar_params, + 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); + } + static void f32_dwconv_25p16c__avx512f_acc2(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f_acc2, + xnn_init_f32_minmax_scalar_params, + 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); + } + static void f32_dwconv_25p32c__avx512f(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f, + xnn_init_f32_minmax_scalar_params, + 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); + } + static void 
f32_dwconv_25p32c__avx512f_acc2(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f_acc2, + xnn_init_f32_minmax_scalar_params, + 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); + } + + static void f32_dwconv_5f5m5l16c16s1r__avx512f(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f, xnn_init_f32_minmax_scalar_params, + 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, + 16 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); + } + static void f32_dwconv_5f5m5l16c16s1r__avx512f_acc2(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f_acc2, xnn_init_f32_minmax_scalar_params, + 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, + 16 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); + } + static void f32_dwconv_5f5m5l32c16s1r__avx512f(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f, xnn_init_f32_minmax_scalar_params, + 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, + 32 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); + } + static void f32_dwconv_5f5m5l32c16s1r__avx512f_acc2(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f_acc2, xnn_init_f32_minmax_scalar_params, + 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, + 32 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); + } + + BENCHMARK_DWCONV(f32_dwconv_25p16c__avx512f) + BENCHMARK_DWCONV(f32_dwconv_25p16c__avx512f_acc2) + BENCHMARK_DWCONV(f32_dwconv_25p32c__avx512f) + BENCHMARK_DWCONV(f32_dwconv_25p32c__avx512f_acc2) + + BENCHMARK_DWCONV(f32_dwconv_5f5m5l16c16s1r__avx512f) + BENCHMARK_DWCONV(f32_dwconv_5f5m5l16c16s1r__avx512f_acc2) + BENCHMARK_DWCONV(f32_dwconv_5f5m5l32c16s1r__avx512f) + BENCHMARK_DWCONV(f32_dwconv_5f5m5l32c16s1r__avx512f_acc2) +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void f32_dwconv_4p4c__sse(benchmark::State& state, const char* net) { @@ -1195,56 +1255,6 @@ static void f32_dwconv( benchmark::utils::CheckFMA3); } - static void f32_dwconv_25p16c__avx512f(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_25p16c__avx512f_acc2(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params, - 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_25p32c__avx512f(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f, - xnn_init_f32_minmax_scalar_params, - 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_25p32c__avx512f_acc2(benchmark::State& state, const char* net) { - f32_dwconv(state, - 
xnn_f32_dwconv_minmax_ukernel_25p32c__avx512f_acc2, - xnn_init_f32_minmax_scalar_params, - 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckAVX512F); - } - - static void f32_dwconv_5f5m5l16c16s1r__avx512f(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f, xnn_init_f32_minmax_scalar_params, - 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, - 16 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_5f5m5l16c16s1r__avx512f_acc2(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f_acc2, xnn_init_f32_minmax_scalar_params, - 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, - 16 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_5f5m5l32c16s1r__avx512f(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f, xnn_init_f32_minmax_scalar_params, - 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, - 32 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); - } - static void f32_dwconv_5f5m5l32c16s1r__avx512f_acc2(benchmark::State& state, const char* net) { - f32_dwconv(state, - xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f_acc2, xnn_init_f32_minmax_scalar_params, - 5 /* first pass tile */, 5 /* middle pass tile */, 5 /* last pass tile */, - 32 /* channel tile */, 16 /* channel subtile */, 1 /* channel round */, benchmark::utils::CheckAVX512F); - } - BENCHMARK_DWCONV(f32_dwconv_4p4c__sse) BENCHMARK_DWCONV(f32_dwconv_9p4c__sse) BENCHMARK_DWCONV(f32_dwconv_25p4c__sse) @@ -1308,19 +1318,8 @@ static void f32_dwconv( BENCHMARK_DWCONV(f32_dwconv_7f6m6l16c8s4r__fma3_acc2) BENCHMARK_DWCONV(f32_dwconv_7f6m6l32c8s4r__fma3) BENCHMARK_DWCONV(f32_dwconv_7f6m6l32c8s4r__fma3_acc2) - - BENCHMARK_DWCONV(f32_dwconv_25p16c__avx512f) - BENCHMARK_DWCONV(f32_dwconv_25p16c__avx512f_acc2) - BENCHMARK_DWCONV(f32_dwconv_25p32c__avx512f) - BENCHMARK_DWCONV(f32_dwconv_25p32c__avx512f_acc2) - - BENCHMARK_DWCONV(f32_dwconv_5f5m5l16c16s1r__avx512f) - BENCHMARK_DWCONV(f32_dwconv_5f5m5l16c16s1r__avx512f_acc2) - BENCHMARK_DWCONV(f32_dwconv_5f5m5l32c16s1r__avx512f) - BENCHMARK_DWCONV(f32_dwconv_5f5m5l32c16s1r__avx512f_acc2) #endif // XNN_ARCH_X88 || XNN_ARCH_X86_64 - #if XNN_ARCH_WASM static void f32_dwconv_9p1c__wasm(benchmark::State& state, const char* net) { f32_dwconv(state, diff --git a/bench/f32-dwconv2d-chw.cc b/bench/f32-dwconv2d-chw.cc index bbfa543cfc4..598efe15c3d 100644 --- a/bench/f32-dwconv2d-chw.cc +++ b/bench/f32-dwconv2d-chw.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/dwconv.h" -#include "bench/utils.h" +#include "dwconv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/dwconv.h" diff --git a/bench/f32-f16-vcvt.cc b/bench/f32-f16-vcvt.cc index b9c4ca8b903..b5a16001e87 100644 --- a/bench/f32-f16-vcvt.cc +++ b/bench/f32-f16-vcvt.cc @@ -3,16 +3,17 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" #include "xnnpack/microparams-init.h" +#include "xnnpack/microparams.h" #include "xnnpack/vcvt.h" +#include static void f32_f16_vcvt( benchmark::State& state, @@ -28,7 +29,7 @@ static void f32_f16_vcvt( BENCHMARK_CAPTURE(f32_f16_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f32-f16-vcvt/f32-f16-vcvt.h" +#include "f32-f16-vcvt/f32-f16-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/f32-gavgpool-cw.cc b/bench/f32-gavgpool-cw.cc index e51a74fba9a..fd19411217b 100644 --- a/bench/f32-gavgpool-cw.cc +++ b/bench/f32-gavgpool-cw.cc @@ -10,7 +10,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f32-gemm-goi-minmax.cc b/bench/f32-gemm-goi-minmax.cc index 20ff18bfce6..acfa1d80780 100644 --- a/bench/f32-gemm-goi-minmax.cc +++ b/bench/f32-gemm-goi-minmax.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/f32-gemm-minmax.cc b/bench/f32-gemm-minmax.cc index 2cdf495a69d..c625c166777 100644 --- a/bench/f32-gemm-minmax.cc +++ b/bench/f32-gemm-minmax.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc index b63abaa3903..ba5d3be4803 100644 --- a/bench/f32-gemm.cc +++ b/bench/f32-gemm.cc @@ -20,8 +20,8 @@ #ifdef BENCHMARK_RUY #include "ruy/ruy.h" #endif // BENCHMARK_RUY -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack/allocator.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" @@ -1306,7 +1306,7 @@ static void ruy_st(benchmark::State& state, const char* net) BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) static void f32_gemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, @@ -1350,6 +1350,15 @@ static void ruy_st(benchmark::State& state, const char* net) benchmark::utils::CheckAVX512F); } + BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast) + BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast) + BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast) + BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast) + BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast) + BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast) +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, @@ -1638,13 +1647,6 @@ static void ruy_st(benchmark::State& state, const char* net) /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1); } 
- BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast) - BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast) BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast) BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast) diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc index 2cf8214746c..8a67ac74811 100644 --- a/bench/f32-igemm.cc +++ b/bench/f32-igemm.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/conv.h" -#include "bench/utils.h" +#include "conv.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/igemm.h" #include "xnnpack/indirection.h" @@ -682,6 +682,56 @@ static void f32_igemm(benchmark::State& state, BENCHMARK_CONV(f32_igemm_8x8s4__neonfma) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + static void f32_igemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + static void f32_igemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + static void f32_igemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + static void f32_igemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + static void f32_igemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/7, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + static void f32_igemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) { + f32_igemm(state, + xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast, + xnn_init_f32_minmax_scalar_params, + /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512F); + } + BENCHMARK_CONV(f32_igemm_1x16__avx512f_broadcast) + BENCHMARK_CONV(f32_igemm_4x16__avx512f_broadcast) + BENCHMARK_CONV(f32_igemm_5x16__avx512f_broadcast) + BENCHMARK_CONV(f32_igemm_6x16__avx512f_broadcast) + BENCHMARK_CONV(f32_igemm_7x16__avx512f_broadcast) + BENCHMARK_CONV(f32_igemm_8x16__avx512f_broadcast) +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void f32_igemm_1x8__sse_load1(benchmark::State& state, const char* net) { @@ -904,48 +954,6 @@ static void f32_igemm(benchmark::State& state, /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1, benchmark::utils::CheckFMA3); } - static void f32_igemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, - 
xnn_init_f32_minmax_scalar_params, - /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } - static void f32_igemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast, - xnn_init_f32_minmax_scalar_params, - /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } - static void f32_igemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast, - xnn_init_f32_minmax_scalar_params, - /*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } - static void f32_igemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast, - xnn_init_f32_minmax_scalar_params, - /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } - static void f32_igemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast, - xnn_init_f32_minmax_scalar_params, - /*mr=*/7, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } - static void f32_igemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) { - f32_igemm(state, - xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast, - xnn_init_f32_minmax_scalar_params, - /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512F); - } BENCHMARK_CONV(f32_igemm_1x8__sse_load1) BENCHMARK_CONV(f32_igemm_3x8__sse_load1) @@ -980,12 +988,6 @@ static void f32_igemm(benchmark::State& state, BENCHMARK_CONV(f32_igemm_6x16__fma3_broadcast) BENCHMARK_CONV(f32_igemm_5x16__fma3_broadcast_prfm) BENCHMARK_CONV(f32_igemm_6x16__fma3_broadcast_prfm) - BENCHMARK_CONV(f32_igemm_1x16__avx512f_broadcast) - BENCHMARK_CONV(f32_igemm_4x16__avx512f_broadcast) - BENCHMARK_CONV(f32_igemm_5x16__avx512f_broadcast) - BENCHMARK_CONV(f32_igemm_6x16__avx512f_broadcast) - BENCHMARK_CONV(f32_igemm_7x16__avx512f_broadcast) - BENCHMARK_CONV(f32_igemm_8x16__avx512f_broadcast) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD diff --git a/bench/f32-im2col-gemm.cc b/bench/f32-im2col-gemm.cc index 864893a60f6..554f7ffbde6 100644 --- a/bench/f32-im2col-gemm.cc +++ b/bench/f32-im2col-gemm.cc @@ -11,8 +11,8 @@ #include #include -#include "bench/conv.h" -#include "bench/utils.h" +#include "conv.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" diff --git a/bench/f32-qc4w-gemm.cc b/bench/f32-qc4w-gemm.cc index 70d76a784b1..cb302351efd 100644 --- a/bench/f32-qc4w-gemm.cc +++ b/bench/f32-qc4w-gemm.cc @@ -12,8 +12,8 @@ #include #include -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack/allocator.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" @@ -395,6 +395,74 @@ static void GEMMBenchmark(benchmark::State& state, BENCHMARK_GEMM(f32_qc4w_gemm_6x8__neonfma_dup_ld64) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + static void f32_qc4w_gemm_1x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_1x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/1, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void 
f32_qc4w_gemm_2x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_2x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/2, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_3x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_3x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/3, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_4x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_4x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/4, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_5x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_5x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/5, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_6x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_6x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/6, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_7x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_7x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/7, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + static void f32_qc4w_gemm_8x32__avx512skx_broadcast(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_f32_qc4w_gemm_minmax_ukernel_8x32__avx512skx_broadcast, + xnn_init_f32_qc4w_minmax_scalar_params, + /*mr=*/8, /*nr=*/32, /*kr=*/1, /*sr=*/1, + benchmark::utils::CheckAVX512SKX); + } + + BENCHMARK_GEMM(f32_qc4w_gemm_1x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_2x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_3x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_4x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_5x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_6x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_7x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc4w_gemm_8x32__avx512skx_broadcast) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void f32_qc4w_gemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) { GEMMBenchmark(state, @@ -566,62 +634,7 @@ static void GEMMBenchmark(benchmark::State& state, /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1, benchmark::utils::CheckAVX2); } - static void f32_qc4w_gemm_1x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_1x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/1, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_2x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_2x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/2, /*nr=*/32, /*kr=*/1, 
/*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_3x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_3x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/3, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_4x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_4x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/4, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_5x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_5x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/5, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_6x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_6x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/6, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_7x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_7x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/7, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } - static void f32_qc4w_gemm_8x32__avx512skx_broadcast(benchmark::State& state, const char* net) { - GEMMBenchmark(state, - xnn_f32_qc4w_gemm_minmax_ukernel_8x32__avx512skx_broadcast, - xnn_init_f32_qc4w_minmax_scalar_params, - /*mr=*/8, /*nr=*/32, /*kr=*/1, /*sr=*/1, - benchmark::utils::CheckAVX512SKX); - } + static void f32_qc4w_gemm_1x8__sse41_dup(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__sse41_dup, @@ -658,14 +671,6 @@ static void GEMMBenchmark(benchmark::State& state, benchmark::utils::CheckSSE41); } - BENCHMARK_GEMM(f32_qc4w_gemm_1x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_2x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_3x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_4x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_5x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_6x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_7x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc4w_gemm_8x32__avx512skx_broadcast) BENCHMARK_GEMM(f32_qc4w_gemm_1x16__avx2_broadcast) BENCHMARK_GEMM(f32_qc4w_gemm_2x16__avx2_broadcast) BENCHMARK_GEMM(f32_qc4w_gemm_3x16__avx2_broadcast) diff --git a/bench/f32-qc8w-gemm.cc b/bench/f32-qc8w-gemm.cc index 97d34cf9301..76486933043 100644 --- a/bench/f32-qc8w-gemm.cc +++ b/bench/f32-qc8w-gemm.cc @@ -16,8 +16,8 @@ #include #include -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack/allocator.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" @@ -470,7 +470,7 @@ static void GEMMBenchmark(benchmark::State& state, BENCHMARK_GEMM(f32_qc8w_gemm_6x8s4__neonfma) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) static void f32_qc8w_gemm_1x32__avx512skx_broadcast(benchmark::State& state, const char* net) { GEMMBenchmark(state, 
xnn_f32_qc8w_gemm_minmax_ukernel_1x32__avx512skx_broadcast, @@ -583,6 +583,27 @@ static void GEMMBenchmark(benchmark::State& state, /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1, benchmark::utils::CheckAVX512SKX); } + + BENCHMARK_GEMM(f32_qc8w_gemm_1x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_2x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_3x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_4x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_5x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_6x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_7x32__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_8x32__avx512skx_broadcast) + + BENCHMARK_GEMM(f32_qc8w_gemm_1x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_2x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_3x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_4x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_5x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_6x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_7x16__avx512skx_broadcast) + BENCHMARK_GEMM(f32_qc8w_gemm_8x16__avx512skx_broadcast) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void f32_qc8w_gemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__avx2_broadcast, @@ -941,24 +962,6 @@ static void GEMMBenchmark(benchmark::State& state, benchmark::utils::CheckSSE41); } - BENCHMARK_GEMM(f32_qc8w_gemm_1x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_2x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_3x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_4x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_5x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_6x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_7x32__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_8x32__avx512skx_broadcast) - - BENCHMARK_GEMM(f32_qc8w_gemm_1x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_2x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_3x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_4x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_5x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_6x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_7x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_8x16__avx512skx_broadcast) - BENCHMARK_GEMM(f32_qc8w_gemm_1x8__avx2_broadcast) BENCHMARK_GEMM(f32_qc8w_gemm_4x8__avx2_broadcast) BENCHMARK_GEMM(f32_qc8w_gemm_5x8__avx2_broadcast) diff --git a/bench/f32-qs8-vcvt.cc b/bench/f32-qs8-vcvt.cc index 4e05c0a1782..e622edb2a09 100644 --- a/bench/f32-qs8-vcvt.cc +++ b/bench/f32-qs8-vcvt.cc @@ -4,7 +4,7 @@ // LICENSE file in the root directory of this source tree. 
#include -#include "bench/vcvt-benchmark.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -21,9 +21,7 @@ static void f32_qs8_vcvt( xnn_f32_qs8_cvt_params params; init_params(¶ms, 25.0f /* scale */, - 1 /* output zero point */, - std::numeric_limits::min() + 1 /* output min */, - std::numeric_limits::max() - 1 /* output max */); + 1 /* output zero point */); cvt_benchmark(state, arch_flags, cvt, ¶ms); } @@ -33,7 +31,7 @@ static void f32_qs8_vcvt( BENCHMARK_CAPTURE(f32_qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f32-qs8-vcvt/f32-qs8-vcvt.h" +#include "f32-qs8-vcvt/f32-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/f32-qu8-vcvt.cc b/bench/f32-qu8-vcvt.cc index 977995befd4..7110d9f1443 100644 --- a/bench/f32-qu8-vcvt.cc +++ b/bench/f32-qu8-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -22,9 +22,7 @@ static void f32_qu8_vcvt( xnn_f32_qu8_cvt_params params; init_params(¶ms, 25.0f /* scale */, - 127 /* output zero point */, - std::numeric_limits::min() + 1 /* output min */, - std::numeric_limits::max() - 1 /* output max */); + 127 /* output zero point */); cvt_benchmark(state, arch_flags, cvt, ¶ms); } @@ -34,7 +32,7 @@ static void f32_qu8_vcvt( BENCHMARK_CAPTURE(f32_qu8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f32-qu8-vcvt/f32-qu8-vcvt.h" +#include "f32-qu8-vcvt/f32-qu8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/f32-raddexpminusmax.cc b/bench/f32-raddexpminusmax.cc index 3622332f4eb..76dd491a1f6 100644 --- a/bench/f32-raddexpminusmax.cc +++ b/bench/f32-raddexpminusmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -79,7 +79,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { } } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_raddexpminusmax, avx512f_p5_scalef_u64, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u64, @@ -144,7 +144,9 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u192_acc6, benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_raddexpminusmax, avx2_p5_u32, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddexpminusmax_ukernel__avx2_p5_u32, diff --git a/bench/f32-raddextexp.cc b/bench/f32-raddextexp.cc index c422f1a8c3b..c7c9bb8a609 100644 --- a/bench/f32-raddextexp.cc +++ b/bench/f32-raddextexp.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -73,7 +73,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { } } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && 
(XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_raddextexp, avx512f_p5_scalef_u128, xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u128, benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime(); @@ -113,7 +113,9 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { BENCHMARK_CAPTURE(f32_raddextexp, avx512f_p5_scalef_u192_acc6, xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u192_acc6, benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_raddextexp, avx2_p5_u64, xnn_f32_raddextexp_ukernel__avx2_p5_u64, benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime(); diff --git a/bench/f32-raddstoreexpminusmax.cc b/bench/f32-raddstoreexpminusmax.cc index fd64bc3fdf4..06270debda1 100644 --- a/bench/f32-raddstoreexpminusmax.cc +++ b/bench/f32-raddstoreexpminusmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -249,7 +249,7 @@ static void f32_raddstoreexpminusmax( #endif // XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_u16, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u16, @@ -307,7 +307,9 @@ static void f32_raddstoreexpminusmax( benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_u8, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_u8, diff --git a/bench/f32-rdsum.cc b/bench/f32-rdsum.cc index 188c0411ad2..4bc890ff208 100644 --- a/bench/f32-rdsum.cc +++ b/bench/f32-rdsum.cc @@ -7,8 +7,8 @@ // Specification: test/f32-rdsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f32-rmax.cc b/bench/f32-rmax.cc index a5f58e0eb18..a4175917931 100644 --- a/bench/f32-rmax.cc +++ b/bench/f32-rmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -61,7 +61,7 @@ static void f32_rmax( benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_rmax, avx512f_u16, xnn_f32_rmax_ukernel__avx512f_u16, /*init_params=*/nullptr, @@ -92,7 +92,9 @@ static void f32_rmax( benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rmax, avx_u8, xnn_f32_rmax_ukernel__avx_u8, /*init_params=*/nullptr, diff --git a/bench/f32-rmin.cc b/bench/f32-rmin.cc index c666f5cf30d..acba077e41a 100644 --- a/bench/f32-rmin.cc +++ b/bench/f32-rmin.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" 
#include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -61,7 +61,7 @@ static void f32_rmin( benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_rmin, avx512f_u16, xnn_f32_rmin_ukernel__avx512f_u16, /*init_params=*/nullptr, @@ -92,7 +92,9 @@ static void f32_rmin( benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rmin, avx_u8, xnn_f32_rmin_ukernel__avx_u8, /*init_params=*/nullptr, diff --git a/bench/f32-rminmax.cc b/bench/f32-rminmax.cc index 4fd4dfb10ee..0ab42c83dbb 100644 --- a/bench/f32-rminmax.cc +++ b/bench/f32-rminmax.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -61,7 +61,7 @@ static void f32_rminmax( benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_rminmax, avx512f_u16, xnn_f32_rminmax_ukernel__avx512f_u16, /*init_params=*/nullptr, @@ -92,7 +92,9 @@ static void f32_rminmax( benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::ReductionParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rminmax, avx_u8, xnn_f32_rminmax_ukernel__avx_u8, /*init_params=*/nullptr, diff --git a/bench/f32-rsum.cc b/bench/f32-rsum.cc index 6b654c28b73..b4fcb5a70de 100644 --- a/bench/f32-rsum.cc +++ b/bench/f32-rsum.cc @@ -7,8 +7,8 @@ // Specification: test/f32-rsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/f32-softmax.cc b/bench/f32-softmax.cc index 54dd1bbb2d2..f92ec8890ed 100644 --- a/bench/f32-softmax.cc +++ b/bench/f32-softmax.cc @@ -10,7 +10,7 @@ #ifdef BENCHMARK_INTEL_DNNL #include #endif // BENCHMARK_INTEL_DNNL -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" @@ -419,6 +419,26 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { BENCHMARK(DNNLSoftArgMax)->Apply(CharacteristicArguments)->UseManualTime(); #endif +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + BENCHMARK_CAPTURE(TwoPassSoftMax, avx512f_p5_scalef, + xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u144_acc3, + xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_u16, + benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); + BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx512f_p5_scalef, + xnn_f32_rmax_ukernel__avx512f_u64_acc4, + (xnn_init_f32_default_params_fn) nullptr, + xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u128_acc4, + xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_u16, + benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); + BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx512f_p5_scalef, + xnn_f32_rmax_ukernel__avx512f_u64_acc4, + (xnn_init_f32_default_params_fn) nullptr, + xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, + nullptr, + 
xnn_f32_vmulc_ukernel__avx512f_u32, + benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(TwoPassSoftMax, avx2_p5, xnn_f32_raddextexp_ukernel__avx2_p5_u96, @@ -437,24 +457,6 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { nullptr, xnn_f32_vmulc_ukernel__avx_u16, benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseManualTime(); - - BENCHMARK_CAPTURE(TwoPassSoftMax, avx512f_p5_scalef, - xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u144_acc3, - xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_u16, - benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); - BENCHMARK_CAPTURE(ThreePassSoftMaxWithRecomputing, avx512f_p5_scalef, - xnn_f32_rmax_ukernel__avx512f_u64_acc4, - (xnn_init_f32_default_params_fn) nullptr, - xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u128_acc4, - xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_u16, - benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); - BENCHMARK_CAPTURE(ThreePassSoftMaxWithReloading, avx512f_p5_scalef, - xnn_f32_rmax_ukernel__avx512f_u64_acc4, - (xnn_init_f32_default_params_fn) nullptr, - xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_u64_acc2, - nullptr, - xnn_f32_vmulc_ukernel__avx512f_u32, - benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseManualTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV diff --git a/bench/f32-spmm.cc b/bench/f32-spmm.cc index 40da2d0dcf3..f802e9d3ad2 100644 --- a/bench/f32-spmm.cc +++ b/bench/f32-spmm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-spmm-test.py #include -#include "bench/spmm-benchmark.h" -#include "bench/utils.h" +#include "spmm-benchmark.h" +#include "utils.h" #include "xnnpack/gemm.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" diff --git a/bench/f32-vcmul.cc b/bench/f32-vcmul.cc index 7d680a4f4ea..37983b27e40 100644 --- a/bench/f32-vcmul.cc +++ b/bench/f32-vcmul.cc @@ -10,7 +10,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" @@ -71,7 +71,7 @@ static void f32_vcmul(benchmark::State& state, uint64_t arch_flags, benchmark::utils::BinaryElementwiseParameters, \ std::complex>) \ ->UseRealTime(); -#include "src/f32-vbinary/f32-vcmul.h" +#include "f32-vbinary/f32-vcmul.h" #undef XNN_UKERNEL_WITH_PARAMS #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/f32-vscaleexpminusmax.cc b/bench/f32-vscaleexpminusmax.cc index cb3c68d1900..928d37d997b 100644 --- a/bench/f32-vscaleexpminusmax.cc +++ b/bench/f32-vscaleexpminusmax.cc @@ -6,7 +6,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -80,7 +80,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { } } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_u16, xnn_f32_rmax_ukernel__avx512f_u64_acc4, xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u128_acc2, @@ -141,7 +141,9 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_u128_acc2, xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_u192, 
benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_u8, xnn_f32_rmax_ukernel__avx_u32_acc4, xnn_f32_raddexpminusmax_ukernel__avx2_p5_u80_acc2, diff --git a/bench/f32-vscaleextexp.cc b/bench/f32-vscaleextexp.cc index ca51d3a9682..620c82a2249 100644 --- a/bench/f32-vscaleextexp.cc +++ b/bench/f32-vscaleextexp.cc @@ -6,7 +6,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -78,7 +78,7 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { } } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(f32_vscaleextexp, avx512f_p5_scalef_u16, xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u128_acc2, xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_u16, @@ -127,7 +127,9 @@ static void CharacteristicArguments(benchmark::internal::Benchmark* b) { xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_u128_acc2, xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_u192, benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime(); +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_vscaleextexp, avx2_p5_u8, xnn_f32_raddextexp_ukernel__avx2_p5_u80_acc2, xnn_f32_vscaleextexp_ukernel__avx2_p5_u8, diff --git a/bench/floor.cc b/bench/floor.cc index b7e7f76a41e..06752a0b85c 100644 --- a/bench/floor.cc +++ b/bench/floor.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/fully-connected.cc b/bench/fully-connected.cc index c1b4f0bc23d..9e6c67dd1ac 100644 --- a/bench/fully-connected.cc +++ b/bench/fully-connected.cc @@ -18,7 +18,7 @@ #include "xnnpack.h" #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" void xnnpack_fully_connected_f32(benchmark::State& state, const char* net) { diff --git a/bench/gemm-benchmark.cc b/bench/gemm-benchmark.cc index dee22586271..ab9f30562c8 100644 --- a/bench/gemm-benchmark.cc +++ b/bench/gemm-benchmark.cc @@ -12,7 +12,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/config-types.h" diff --git a/bench/gemm-benchmark.h b/bench/gemm-benchmark.h index 9924be2f3df..ea83c5d7d8d 100644 --- a/bench/gemm-benchmark.h +++ b/bench/gemm-benchmark.h @@ -16,8 +16,8 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.h" #endif // XNN_ENABLE_KLEIDIAI -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include void GEMMBenchmark(benchmark::State& state, xnn_qs8_gemm_minmax_ukernel_fn gemm, diff --git a/bench/global-average-pooling.cc b/bench/global-average-pooling.cc index 03949a986ab..a9adbba20ff 100644 --- a/bench/global-average-pooling.cc +++ b/bench/global-average-pooling.cc @@ -20,7 +20,7 @@ #include "xnnpack/math.h" #include -#include "bench/utils.h" +#include "utils.h" static void global_average_pooling_qu8(benchmark::State& state) { const size_t batch_size = state.range(0); diff --git a/bench/hardswish.cc b/bench/hardswish.cc index 35a7f100d3e..a666346f473 100644 --- a/bench/hardswish.cc +++ b/bench/hardswish.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/leaky-relu.cc b/bench/leaky-relu.cc index 70ac545e647..4c419286848 100644 --- a/bench/leaky-relu.cc +++ b/bench/leaky-relu.cc @@ -9,7 +9,7 @@ #include #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE diff --git a/bench/max-pooling.cc b/bench/max-pooling.cc index c281d5dc1a1..9e3ee9117c4 100644 --- a/bench/max-pooling.cc +++ b/bench/max-pooling.cc @@ -15,7 +15,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/buffer.h" #include diff --git a/bench/models/benchmark.cc b/bench/models/benchmark.cc index bed5507e6fe..f2e7b4436af 100644 --- a/bench/models/benchmark.cc +++ b/bench/models/benchmark.cc @@ -15,7 +15,7 @@ #include #include "models.h" -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/allocator.h" #include "xnnpack/subgraph.h" diff --git a/bench/models/qs8-mobilenet-v2.cc b/bench/models/qs8-mobilenet-v2.cc index e2aea9b6965..ae2aac57240 100644 --- a/bench/models/qs8-mobilenet-v2.cc +++ b/bench/models/qs8-mobilenet-v2.cc @@ -972,7 +972,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w67_data; uint32_t w67 = XNN_INVALID_VALUE_ID; std::array w67_dims = {{32, 3, 3, 3}}; - std::array w67_scale; + static std::array w67_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w67_scale.begin(), w67_scale.end(), std::ref(scalerng)); @@ -991,7 +991,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w68_data; uint32_t w68 = XNN_INVALID_VALUE_ID; std::array w68_dims = {{32}}; - std::array w68_scale; + static std::array w68_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w68_scale.begin(), w68_scale.end(), std::ref(scalerng)); @@ -1010,7 +1010,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w69_data; uint32_t w69 = XNN_INVALID_VALUE_ID; std::array w69_dims = {{1, 3, 3, 32}}; - std::array w69_scale; + static std::array w69_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w69_scale.begin(), w69_scale.end(), std::ref(scalerng)); @@ -1029,7 +1029,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w70_data; uint32_t w70 = XNN_INVALID_VALUE_ID; std::array w70_dims = {{32}}; - std::array w70_scale; + static std::array w70_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w70_scale.begin(), w70_scale.end(), std::ref(scalerng)); @@ -1048,7 +1048,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w71_data; uint32_t w71 = XNN_INVALID_VALUE_ID; std::array w71_dims = {{16, 1, 1, 32}}; - std::array w71_scale; + static std::array w71_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w71_scale.begin(), w71_scale.end(), std::ref(scalerng)); @@ -1067,7 +1067,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w72_data; uint32_t w72 = XNN_INVALID_VALUE_ID; std::array w72_dims = {{16}}; - std::array w72_scale; + static std::array w72_scale; { auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w72_scale.begin(), w72_scale.end(), std::ref(scalerng)); @@ -1086,7 +1086,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w73_data; uint32_t w73 = XNN_INVALID_VALUE_ID; std::array w73_dims = {{96, 1, 1, 16}}; - std::array w73_scale; + static std::array w73_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w73_scale.begin(), w73_scale.end(), std::ref(scalerng)); @@ -1105,7 +1105,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w74_data; uint32_t w74 = XNN_INVALID_VALUE_ID; std::array w74_dims = {{96}}; - std::array w74_scale; + static std::array w74_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w74_scale.begin(), w74_scale.end(), std::ref(scalerng)); @@ -1124,7 +1124,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w75_data; uint32_t w75 = XNN_INVALID_VALUE_ID; std::array w75_dims = {{1, 3, 3, 96}}; - std::array w75_scale; + static std::array w75_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w75_scale.begin(), w75_scale.end(), std::ref(scalerng)); @@ -1143,7 +1143,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w76_data; uint32_t w76 = XNN_INVALID_VALUE_ID; std::array w76_dims = {{96}}; - std::array w76_scale; + static std::array w76_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w76_scale.begin(), w76_scale.end(), std::ref(scalerng)); @@ -1162,7 +1162,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w77_data; uint32_t w77 = XNN_INVALID_VALUE_ID; std::array w77_dims = {{24, 1, 1, 96}}; - std::array w77_scale; + static std::array w77_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w77_scale.begin(), w77_scale.end(), std::ref(scalerng)); @@ -1181,7 +1181,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w78_data; uint32_t w78 = XNN_INVALID_VALUE_ID; std::array w78_dims = {{24}}; - std::array w78_scale; + static std::array w78_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w78_scale.begin(), w78_scale.end(), std::ref(scalerng)); @@ -1200,7 +1200,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w79_data; uint32_t w79 = XNN_INVALID_VALUE_ID; std::array w79_dims = {{144, 1, 1, 24}}; - std::array w79_scale; + static std::array w79_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w79_scale.begin(), w79_scale.end(), std::ref(scalerng)); @@ -1219,7 +1219,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w80_data; uint32_t w80 = XNN_INVALID_VALUE_ID; std::array w80_dims = {{144}}; - std::array w80_scale; + static std::array w80_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w80_scale.begin(), w80_scale.end(), std::ref(scalerng)); @@ -1238,7 +1238,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w81_data; uint32_t w81 = XNN_INVALID_VALUE_ID; std::array w81_dims = {{1, 3, 3, 144}}; - std::array w81_scale; + static std::array w81_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); 
std::generate(w81_scale.begin(), w81_scale.end(), std::ref(scalerng)); @@ -1257,7 +1257,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w82_data; uint32_t w82 = XNN_INVALID_VALUE_ID; std::array w82_dims = {{144}}; - std::array w82_scale; + static std::array w82_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w82_scale.begin(), w82_scale.end(), std::ref(scalerng)); @@ -1276,7 +1276,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w83_data; uint32_t w83 = XNN_INVALID_VALUE_ID; std::array w83_dims = {{24, 1, 1, 144}}; - std::array w83_scale; + static std::array w83_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w83_scale.begin(), w83_scale.end(), std::ref(scalerng)); @@ -1295,7 +1295,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w84_data; uint32_t w84 = XNN_INVALID_VALUE_ID; std::array w84_dims = {{24}}; - std::array w84_scale; + static std::array w84_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w84_scale.begin(), w84_scale.end(), std::ref(scalerng)); @@ -1314,7 +1314,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w85_data; uint32_t w85 = XNN_INVALID_VALUE_ID; std::array w85_dims = {{144, 1, 1, 24}}; - std::array w85_scale; + static std::array w85_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w85_scale.begin(), w85_scale.end(), std::ref(scalerng)); @@ -1333,7 +1333,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w86_data; uint32_t w86 = XNN_INVALID_VALUE_ID; std::array w86_dims = {{144}}; - std::array w86_scale; + static std::array w86_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w86_scale.begin(), w86_scale.end(), std::ref(scalerng)); @@ -1352,7 +1352,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w87_data; uint32_t w87 = XNN_INVALID_VALUE_ID; std::array w87_dims = {{1, 3, 3, 144}}; - std::array w87_scale; + static std::array w87_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w87_scale.begin(), w87_scale.end(), std::ref(scalerng)); @@ -1371,7 +1371,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w88_data; uint32_t w88 = XNN_INVALID_VALUE_ID; std::array w88_dims = {{144}}; - std::array w88_scale; + static std::array w88_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w88_scale.begin(), w88_scale.end(), std::ref(scalerng)); @@ -1390,7 +1390,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w89_data; uint32_t w89 = XNN_INVALID_VALUE_ID; std::array w89_dims = {{32, 1, 1, 144}}; - std::array w89_scale; + static std::array w89_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w89_scale.begin(), w89_scale.end(), std::ref(scalerng)); @@ -1409,7 +1409,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w90_data; uint32_t w90 = XNN_INVALID_VALUE_ID; std::array w90_dims = {{32}}; - std::array w90_scale; + static std::array w90_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w90_scale.begin(), w90_scale.end(), std::ref(scalerng)); @@ -1428,7 +1428,7 @@ 
xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w91_data; uint32_t w91 = XNN_INVALID_VALUE_ID; std::array w91_dims = {{192, 1, 1, 32}}; - std::array w91_scale; + static std::array w91_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w91_scale.begin(), w91_scale.end(), std::ref(scalerng)); @@ -1447,7 +1447,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w92_data; uint32_t w92 = XNN_INVALID_VALUE_ID; std::array w92_dims = {{192}}; - std::array w92_scale; + static std::array w92_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w92_scale.begin(), w92_scale.end(), std::ref(scalerng)); @@ -1466,7 +1466,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w93_data; uint32_t w93 = XNN_INVALID_VALUE_ID; std::array w93_dims = {{1, 3, 3, 192}}; - std::array w93_scale; + static std::array w93_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w93_scale.begin(), w93_scale.end(), std::ref(scalerng)); @@ -1485,7 +1485,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w94_data; uint32_t w94 = XNN_INVALID_VALUE_ID; std::array w94_dims = {{192}}; - std::array w94_scale; + static std::array w94_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w94_scale.begin(), w94_scale.end(), std::ref(scalerng)); @@ -1504,7 +1504,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w95_data; uint32_t w95 = XNN_INVALID_VALUE_ID; std::array w95_dims = {{32, 1, 1, 192}}; - std::array w95_scale; + static std::array w95_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w95_scale.begin(), w95_scale.end(), std::ref(scalerng)); @@ -1523,7 +1523,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w96_data; uint32_t w96 = XNN_INVALID_VALUE_ID; std::array w96_dims = {{32}}; - std::array w96_scale; + static std::array w96_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w96_scale.begin(), w96_scale.end(), std::ref(scalerng)); @@ -1542,7 +1542,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w97_data; uint32_t w97 = XNN_INVALID_VALUE_ID; std::array w97_dims = {{192, 1, 1, 32}}; - std::array w97_scale; + static std::array w97_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w97_scale.begin(), w97_scale.end(), std::ref(scalerng)); @@ -1561,7 +1561,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w98_data; uint32_t w98 = XNN_INVALID_VALUE_ID; std::array w98_dims = {{192}}; - std::array w98_scale; + static std::array w98_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w98_scale.begin(), w98_scale.end(), std::ref(scalerng)); @@ -1580,7 +1580,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w99_data; uint32_t w99 = XNN_INVALID_VALUE_ID; std::array w99_dims = {{1, 3, 3, 192}}; - std::array w99_scale; + static std::array w99_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w99_scale.begin(), w99_scale.end(), std::ref(scalerng)); @@ -1599,7 +1599,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w100_data; uint32_t w100 = 
XNN_INVALID_VALUE_ID; std::array w100_dims = {{192}}; - std::array w100_scale; + static std::array w100_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w100_scale.begin(), w100_scale.end(), std::ref(scalerng)); @@ -1618,7 +1618,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w101_data; uint32_t w101 = XNN_INVALID_VALUE_ID; std::array w101_dims = {{32, 1, 1, 192}}; - std::array w101_scale; + static std::array w101_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w101_scale.begin(), w101_scale.end(), std::ref(scalerng)); @@ -1637,7 +1637,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w102_data; uint32_t w102 = XNN_INVALID_VALUE_ID; std::array w102_dims = {{32}}; - std::array w102_scale; + static std::array w102_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w102_scale.begin(), w102_scale.end(), std::ref(scalerng)); @@ -1656,7 +1656,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w103_data; uint32_t w103 = XNN_INVALID_VALUE_ID; std::array w103_dims = {{192, 1, 1, 32}}; - std::array w103_scale; + static std::array w103_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w103_scale.begin(), w103_scale.end(), std::ref(scalerng)); @@ -1675,7 +1675,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w104_data; uint32_t w104 = XNN_INVALID_VALUE_ID; std::array w104_dims = {{192}}; - std::array w104_scale; + static std::array w104_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w104_scale.begin(), w104_scale.end(), std::ref(scalerng)); @@ -1694,7 +1694,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w105_data; uint32_t w105 = XNN_INVALID_VALUE_ID; std::array w105_dims = {{1, 3, 3, 192}}; - std::array w105_scale; + static std::array w105_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w105_scale.begin(), w105_scale.end(), std::ref(scalerng)); @@ -1713,7 +1713,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w106_data; uint32_t w106 = XNN_INVALID_VALUE_ID; std::array w106_dims = {{192}}; - std::array w106_scale; + static std::array w106_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w106_scale.begin(), w106_scale.end(), std::ref(scalerng)); @@ -1732,7 +1732,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w107_data; uint32_t w107 = XNN_INVALID_VALUE_ID; std::array w107_dims = {{64, 1, 1, 192}}; - std::array w107_scale; + static std::array w107_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w107_scale.begin(), w107_scale.end(), std::ref(scalerng)); @@ -1751,7 +1751,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w108_data; uint32_t w108 = XNN_INVALID_VALUE_ID; std::array w108_dims = {{64}}; - std::array w108_scale; + static std::array w108_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w108_scale.begin(), w108_scale.end(), std::ref(scalerng)); @@ -1770,7 +1770,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w109_data; uint32_t w109 = XNN_INVALID_VALUE_ID; std::array w109_dims 
= {{384, 1, 1, 64}}; - std::array w109_scale; + static std::array w109_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w109_scale.begin(), w109_scale.end(), std::ref(scalerng)); @@ -1789,7 +1789,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w110_data; uint32_t w110 = XNN_INVALID_VALUE_ID; std::array w110_dims = {{384}}; - std::array w110_scale; + static std::array w110_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w110_scale.begin(), w110_scale.end(), std::ref(scalerng)); @@ -1808,7 +1808,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w111_data; uint32_t w111 = XNN_INVALID_VALUE_ID; std::array w111_dims = {{1, 3, 3, 384}}; - std::array w111_scale; + static std::array w111_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w111_scale.begin(), w111_scale.end(), std::ref(scalerng)); @@ -1827,7 +1827,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w112_data; uint32_t w112 = XNN_INVALID_VALUE_ID; std::array w112_dims = {{384}}; - std::array w112_scale; + static std::array w112_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w112_scale.begin(), w112_scale.end(), std::ref(scalerng)); @@ -1846,7 +1846,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w113_data; uint32_t w113 = XNN_INVALID_VALUE_ID; std::array w113_dims = {{64, 1, 1, 384}}; - std::array w113_scale; + static std::array w113_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w113_scale.begin(), w113_scale.end(), std::ref(scalerng)); @@ -1865,7 +1865,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w114_data; uint32_t w114 = XNN_INVALID_VALUE_ID; std::array w114_dims = {{64}}; - std::array w114_scale; + static std::array w114_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w114_scale.begin(), w114_scale.end(), std::ref(scalerng)); @@ -1884,7 +1884,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w115_data; uint32_t w115 = XNN_INVALID_VALUE_ID; std::array w115_dims = {{384, 1, 1, 64}}; - std::array w115_scale; + static std::array w115_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w115_scale.begin(), w115_scale.end(), std::ref(scalerng)); @@ -1903,7 +1903,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w116_data; uint32_t w116 = XNN_INVALID_VALUE_ID; std::array w116_dims = {{384}}; - std::array w116_scale; + static std::array w116_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w116_scale.begin(), w116_scale.end(), std::ref(scalerng)); @@ -1922,7 +1922,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w117_data; uint32_t w117 = XNN_INVALID_VALUE_ID; std::array w117_dims = {{1, 3, 3, 384}}; - std::array w117_scale; + static std::array w117_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w117_scale.begin(), w117_scale.end(), std::ref(scalerng)); @@ -1941,7 +1941,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w118_data; uint32_t w118 = XNN_INVALID_VALUE_ID; std::array w118_dims = {{384}}; - std::array 
w118_scale; + static std::array w118_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w118_scale.begin(), w118_scale.end(), std::ref(scalerng)); @@ -1960,7 +1960,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w119_data; uint32_t w119 = XNN_INVALID_VALUE_ID; std::array w119_dims = {{64, 1, 1, 384}}; - std::array w119_scale; + static std::array w119_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w119_scale.begin(), w119_scale.end(), std::ref(scalerng)); @@ -1979,7 +1979,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w120_data; uint32_t w120 = XNN_INVALID_VALUE_ID; std::array w120_dims = {{64}}; - std::array w120_scale; + static std::array w120_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w120_scale.begin(), w120_scale.end(), std::ref(scalerng)); @@ -1998,7 +1998,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w121_data; uint32_t w121 = XNN_INVALID_VALUE_ID; std::array w121_dims = {{384, 1, 1, 64}}; - std::array w121_scale; + static std::array w121_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w121_scale.begin(), w121_scale.end(), std::ref(scalerng)); @@ -2017,7 +2017,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w122_data; uint32_t w122 = XNN_INVALID_VALUE_ID; std::array w122_dims = {{384}}; - std::array w122_scale; + static std::array w122_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w122_scale.begin(), w122_scale.end(), std::ref(scalerng)); @@ -2036,7 +2036,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w123_data; uint32_t w123 = XNN_INVALID_VALUE_ID; std::array w123_dims = {{1, 3, 3, 384}}; - std::array w123_scale; + static std::array w123_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w123_scale.begin(), w123_scale.end(), std::ref(scalerng)); @@ -2055,7 +2055,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w124_data; uint32_t w124 = XNN_INVALID_VALUE_ID; std::array w124_dims = {{384}}; - std::array w124_scale; + static std::array w124_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w124_scale.begin(), w124_scale.end(), std::ref(scalerng)); @@ -2074,7 +2074,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w125_data; uint32_t w125 = XNN_INVALID_VALUE_ID; std::array w125_dims = {{64, 1, 1, 384}}; - std::array w125_scale; + static std::array w125_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w125_scale.begin(), w125_scale.end(), std::ref(scalerng)); @@ -2093,7 +2093,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w126_data; uint32_t w126 = XNN_INVALID_VALUE_ID; std::array w126_dims = {{64}}; - std::array w126_scale; + static std::array w126_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w126_scale.begin(), w126_scale.end(), std::ref(scalerng)); @@ -2112,7 +2112,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w127_data; uint32_t w127 = XNN_INVALID_VALUE_ID; std::array w127_dims = {{384, 1, 1, 64}}; - std::array w127_scale; + static std::array 
w127_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w127_scale.begin(), w127_scale.end(), std::ref(scalerng)); @@ -2131,7 +2131,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w128_data; uint32_t w128 = XNN_INVALID_VALUE_ID; std::array w128_dims = {{384}}; - std::array w128_scale; + static std::array w128_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w128_scale.begin(), w128_scale.end(), std::ref(scalerng)); @@ -2150,7 +2150,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w129_data; uint32_t w129 = XNN_INVALID_VALUE_ID; std::array w129_dims = {{1, 3, 3, 384}}; - std::array w129_scale; + static std::array w129_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w129_scale.begin(), w129_scale.end(), std::ref(scalerng)); @@ -2169,7 +2169,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w130_data; uint32_t w130 = XNN_INVALID_VALUE_ID; std::array w130_dims = {{384}}; - std::array w130_scale; + static std::array w130_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w130_scale.begin(), w130_scale.end(), std::ref(scalerng)); @@ -2188,7 +2188,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w131_data; uint32_t w131 = XNN_INVALID_VALUE_ID; std::array w131_dims = {{96, 1, 1, 384}}; - std::array w131_scale; + static std::array w131_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w131_scale.begin(), w131_scale.end(), std::ref(scalerng)); @@ -2207,7 +2207,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w132_data; uint32_t w132 = XNN_INVALID_VALUE_ID; std::array w132_dims = {{96}}; - std::array w132_scale; + static std::array w132_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w132_scale.begin(), w132_scale.end(), std::ref(scalerng)); @@ -2226,7 +2226,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w133_data; uint32_t w133 = XNN_INVALID_VALUE_ID; std::array w133_dims = {{576, 1, 1, 96}}; - std::array w133_scale; + static std::array w133_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w133_scale.begin(), w133_scale.end(), std::ref(scalerng)); @@ -2245,7 +2245,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w134_data; uint32_t w134 = XNN_INVALID_VALUE_ID; std::array w134_dims = {{576}}; - std::array w134_scale; + static std::array w134_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w134_scale.begin(), w134_scale.end(), std::ref(scalerng)); @@ -2264,7 +2264,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w135_data; uint32_t w135 = XNN_INVALID_VALUE_ID; std::array w135_dims = {{1, 3, 3, 576}}; - std::array w135_scale; + static std::array w135_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w135_scale.begin(), w135_scale.end(), std::ref(scalerng)); @@ -2283,7 +2283,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w136_data; uint32_t w136 = XNN_INVALID_VALUE_ID; std::array w136_dims = {{576}}; - std::array w136_scale; + static std::array w136_scale; { auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w136_scale.begin(), w136_scale.end(), std::ref(scalerng)); @@ -2302,7 +2302,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w137_data; uint32_t w137 = XNN_INVALID_VALUE_ID; std::array w137_dims = {{96, 1, 1, 576}}; - std::array w137_scale; + static std::array w137_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w137_scale.begin(), w137_scale.end(), std::ref(scalerng)); @@ -2321,7 +2321,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w138_data; uint32_t w138 = XNN_INVALID_VALUE_ID; std::array w138_dims = {{96}}; - std::array w138_scale; + static std::array w138_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w138_scale.begin(), w138_scale.end(), std::ref(scalerng)); @@ -2340,7 +2340,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w139_data; uint32_t w139 = XNN_INVALID_VALUE_ID; std::array w139_dims = {{576, 1, 1, 96}}; - std::array w139_scale; + static std::array w139_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w139_scale.begin(), w139_scale.end(), std::ref(scalerng)); @@ -2359,7 +2359,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w140_data; uint32_t w140 = XNN_INVALID_VALUE_ID; std::array w140_dims = {{576}}; - std::array w140_scale; + static std::array w140_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w140_scale.begin(), w140_scale.end(), std::ref(scalerng)); @@ -2378,7 +2378,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w141_data; uint32_t w141 = XNN_INVALID_VALUE_ID; std::array w141_dims = {{1, 3, 3, 576}}; - std::array w141_scale; + static std::array w141_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w141_scale.begin(), w141_scale.end(), std::ref(scalerng)); @@ -2397,7 +2397,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w142_data; uint32_t w142 = XNN_INVALID_VALUE_ID; std::array w142_dims = {{576}}; - std::array w142_scale; + static std::array w142_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w142_scale.begin(), w142_scale.end(), std::ref(scalerng)); @@ -2416,7 +2416,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w143_data; uint32_t w143 = XNN_INVALID_VALUE_ID; std::array w143_dims = {{96, 1, 1, 576}}; - std::array w143_scale; + static std::array w143_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w143_scale.begin(), w143_scale.end(), std::ref(scalerng)); @@ -2435,7 +2435,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w144_data; uint32_t w144 = XNN_INVALID_VALUE_ID; std::array w144_dims = {{96}}; - std::array w144_scale; + static std::array w144_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w144_scale.begin(), w144_scale.end(), std::ref(scalerng)); @@ -2454,7 +2454,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w145_data; uint32_t w145 = XNN_INVALID_VALUE_ID; std::array w145_dims = {{576, 1, 1, 96}}; - std::array w145_scale; + static std::array w145_scale; { auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w145_scale.begin(), w145_scale.end(), std::ref(scalerng)); @@ -2473,7 +2473,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w146_data; uint32_t w146 = XNN_INVALID_VALUE_ID; std::array w146_dims = {{576}}; - std::array w146_scale; + static std::array w146_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w146_scale.begin(), w146_scale.end(), std::ref(scalerng)); @@ -2492,7 +2492,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w147_data; uint32_t w147 = XNN_INVALID_VALUE_ID; std::array w147_dims = {{1, 3, 3, 576}}; - std::array w147_scale; + static std::array w147_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w147_scale.begin(), w147_scale.end(), std::ref(scalerng)); @@ -2511,7 +2511,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w148_data; uint32_t w148 = XNN_INVALID_VALUE_ID; std::array w148_dims = {{576}}; - std::array w148_scale; + static std::array w148_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w148_scale.begin(), w148_scale.end(), std::ref(scalerng)); @@ -2530,7 +2530,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w149_data; uint32_t w149 = XNN_INVALID_VALUE_ID; std::array w149_dims = {{160, 1, 1, 576}}; - std::array w149_scale; + static std::array w149_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w149_scale.begin(), w149_scale.end(), std::ref(scalerng)); @@ -2549,7 +2549,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w150_data; uint32_t w150 = XNN_INVALID_VALUE_ID; std::array w150_dims = {{160}}; - std::array w150_scale; + static std::array w150_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w150_scale.begin(), w150_scale.end(), std::ref(scalerng)); @@ -2568,7 +2568,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w151_data; uint32_t w151 = XNN_INVALID_VALUE_ID; std::array w151_dims = {{960, 1, 1, 160}}; - std::array w151_scale; + static std::array w151_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w151_scale.begin(), w151_scale.end(), std::ref(scalerng)); @@ -2587,7 +2587,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w152_data; uint32_t w152 = XNN_INVALID_VALUE_ID; std::array w152_dims = {{960}}; - std::array w152_scale; + static std::array w152_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w152_scale.begin(), w152_scale.end(), std::ref(scalerng)); @@ -2606,7 +2606,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w153_data; uint32_t w153 = XNN_INVALID_VALUE_ID; std::array w153_dims = {{1, 3, 3, 960}}; - std::array w153_scale; + static std::array w153_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w153_scale.begin(), w153_scale.end(), std::ref(scalerng)); @@ -2625,7 +2625,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w154_data; uint32_t w154 = XNN_INVALID_VALUE_ID; std::array w154_dims = {{960}}; - std::array w154_scale; + static std::array w154_scale; { auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w154_scale.begin(), w154_scale.end(), std::ref(scalerng)); @@ -2644,7 +2644,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w155_data; uint32_t w155 = XNN_INVALID_VALUE_ID; std::array w155_dims = {{160, 1, 1, 960}}; - std::array w155_scale; + static std::array w155_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w155_scale.begin(), w155_scale.end(), std::ref(scalerng)); @@ -2663,7 +2663,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w156_data; uint32_t w156 = XNN_INVALID_VALUE_ID; std::array w156_dims = {{160}}; - std::array w156_scale; + static std::array w156_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w156_scale.begin(), w156_scale.end(), std::ref(scalerng)); @@ -2682,7 +2682,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w157_data; uint32_t w157 = XNN_INVALID_VALUE_ID; std::array w157_dims = {{960, 1, 1, 160}}; - std::array w157_scale; + static std::array w157_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w157_scale.begin(), w157_scale.end(), std::ref(scalerng)); @@ -2701,7 +2701,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w158_data; uint32_t w158 = XNN_INVALID_VALUE_ID; std::array w158_dims = {{960}}; - std::array w158_scale; + static std::array w158_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w158_scale.begin(), w158_scale.end(), std::ref(scalerng)); @@ -2720,7 +2720,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w159_data; uint32_t w159 = XNN_INVALID_VALUE_ID; std::array w159_dims = {{1, 3, 3, 960}}; - std::array w159_scale; + static std::array w159_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w159_scale.begin(), w159_scale.end(), std::ref(scalerng)); @@ -2739,7 +2739,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w160_data; uint32_t w160 = XNN_INVALID_VALUE_ID; std::array w160_dims = {{960}}; - std::array w160_scale; + static std::array w160_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w160_scale.begin(), w160_scale.end(), std::ref(scalerng)); @@ -2758,7 +2758,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w161_data; uint32_t w161 = XNN_INVALID_VALUE_ID; std::array w161_dims = {{160, 1, 1, 960}}; - std::array w161_scale; + static std::array w161_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w161_scale.begin(), w161_scale.end(), std::ref(scalerng)); @@ -2777,7 +2777,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w162_data; uint32_t w162 = XNN_INVALID_VALUE_ID; std::array w162_dims = {{160}}; - std::array w162_scale; + static std::array w162_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w162_scale.begin(), w162_scale.end(), std::ref(scalerng)); @@ -2796,7 +2796,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w163_data; uint32_t w163 = XNN_INVALID_VALUE_ID; std::array w163_dims = {{960, 1, 1, 160}}; - std::array w163_scale; + static std::array w163_scale; { auto scalerng = 
std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w163_scale.begin(), w163_scale.end(), std::ref(scalerng)); @@ -2815,7 +2815,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w164_data; uint32_t w164 = XNN_INVALID_VALUE_ID; std::array w164_dims = {{960}}; - std::array w164_scale; + static std::array w164_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w164_scale.begin(), w164_scale.end(), std::ref(scalerng)); @@ -2834,7 +2834,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w165_data; uint32_t w165 = XNN_INVALID_VALUE_ID; std::array w165_dims = {{1, 3, 3, 960}}; - std::array w165_scale; + static std::array w165_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w165_scale.begin(), w165_scale.end(), std::ref(scalerng)); @@ -2853,7 +2853,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w166_data; uint32_t w166 = XNN_INVALID_VALUE_ID; std::array w166_dims = {{960}}; - std::array w166_scale; + static std::array w166_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w166_scale.begin(), w166_scale.end(), std::ref(scalerng)); @@ -2872,7 +2872,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w167_data; uint32_t w167 = XNN_INVALID_VALUE_ID; std::array w167_dims = {{320, 1, 1, 960}}; - std::array w167_scale; + static std::array w167_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w167_scale.begin(), w167_scale.end(), std::ref(scalerng)); @@ -2891,7 +2891,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w168_data; uint32_t w168 = XNN_INVALID_VALUE_ID; std::array w168_dims = {{320}}; - std::array w168_scale; + static std::array w168_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w168_scale.begin(), w168_scale.end(), std::ref(scalerng)); @@ -2910,7 +2910,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w169_data; uint32_t w169 = XNN_INVALID_VALUE_ID; std::array w169_dims = {{1280, 1, 1, 320}}; - std::array w169_scale; + static std::array w169_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w169_scale.begin(), w169_scale.end(), std::ref(scalerng)); @@ -2929,7 +2929,7 @@ xnn_subgraph_t QS8MobileNetV2() { alignas(16) static std::array w170_data; uint32_t w170 = XNN_INVALID_VALUE_ID; std::array w170_dims = {{1280}}; - std::array w170_scale; + static std::array w170_scale; { auto scalerng = std::bind(std::uniform_real_distribution(0.01f, 1.0f), std::ref(rng)); std::generate(w170_scale.begin(), w170_scale.end(), std::ref(scalerng)); diff --git a/bench/negate.cc b/bench/negate.cc index a53d5c0c66b..4ca421d4887 100644 --- a/bench/negate.cc +++ b/bench/negate.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/packq-benchmark.cc b/bench/packq-benchmark.cc index 168fb6c1af8..a7b09412d41 100644 --- a/bench/packq-benchmark.cc +++ b/bench/packq-benchmark.cc @@ -9,14 +9,12 @@ #include #include #include -#include -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" -#include "xnnpack/pack.h" #include "xnnpack/packq.h" -#include "xnnpack/buffer.h" #include void x8_packq(benchmark::State& state, xnn_x8_packq_f32qp8_ukernel_fn packq, diff --git a/bench/packq-benchmark.h b/bench/packq-benchmark.h index 77f5b80d37d..7502098714a 100644 --- a/bench/packq-benchmark.h +++ b/bench/packq-benchmark.h @@ -8,7 +8,7 @@ #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" #include "xnnpack/pack.h" diff --git a/bench/packw-benchmark.h b/bench/packw-benchmark.h index 8204e51ccfc..d82f2ace33a 100644 --- a/bench/packw-benchmark.h +++ b/bench/packw-benchmark.h @@ -8,8 +8,8 @@ #include #include -#include "bench/bgemm.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/pack.h" #include "xnnpack/packw.h" diff --git a/bench/prelu.cc b/bench/prelu.cc index bd17cac386c..8da165cb695 100644 --- a/bench/prelu.cc +++ b/bench/prelu.cc @@ -7,13 +7,13 @@ #include #include #include +#include #include #include #include +#include "utils.h" #include "xnnpack.h" - -#include "bench/utils.h" #include "xnnpack/buffer.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE @@ -43,6 +43,9 @@ void xnnpack_prelu_f32(benchmark::State& state, const char* net) { std::generate(slope.begin(), slope.end(), std::ref(f32wrng)); xnnpack::Buffer output(batch_size * height * width * channels); + const size_t input_shape[4] = {batch_size, height, width, channels}; + const size_t slope_shape[1] = {channels}; + xnn_status status = xnn_initialize(nullptr /* allocator */); if (status != xnn_status_success) { state.SkipWithError("failed to initialize XNNPACK"); @@ -50,27 +53,24 @@ void xnnpack_prelu_f32(benchmark::State& state, const char* net) { } xnn_operator_t prelu_op = nullptr; - status = xnn_create_prelu_nc_f32( - channels, /*slope_channels=*/channels, /*input_stride=*/channels , /*output_stride=*/channels, - slope.data(), - 0 /* flags */, nullptr, nullptr, &prelu_op); + status = xnn_create_binary_elementwise_nd(xnn_binary_prelu, xnn_datatype_fp32, + nullptr, nullptr, nullptr, + /*flags=*/0, &prelu_op); if (status != xnn_status_success) { state.SkipWithError("failed to create FP32 PReLU operator"); return; } - status = xnn_reshape_prelu_nc_f32( - prelu_op, - batch_size * height * width, - /*threadpool=*/nullptr); + status = xnn_reshape_binary_elementwise_nd(prelu_op, 4, &input_shape[0], 1, + &slope_shape[0], + /*threadpool=*/nullptr); if (status != xnn_status_success) { state.SkipWithError("failed to reshape FP32 PReLU operator"); return; } - status = xnn_setup_prelu_nc_f32( - prelu_op, - input.data(), output.data()); + status = xnn_setup_binary_elementwise_nd(prelu_op, input.data(), slope.data(), + output.data()); if (status != xnn_status_success) { state.SkipWithError("failed to setup FP32 PReLU operator"); return; diff --git a/bench/qd8-f16-qb4w-gemm.cc b/bench/qd8-f16-qb4w-gemm.cc index 
406c7a9b8fb..ee0ece7a993 100644 --- a/bench/qd8-f16-qb4w-gemm.cc +++ b/bench/qd8-f16-qb4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qd8-f16-qc4w-gemm.cc b/bench/qd8-f16-qc4w-gemm.cc index 72b644ec1f3..916d129ee52 100644 --- a/bench/qd8-f16-qc4w-gemm.cc +++ b/bench/qd8-f16-qc4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qd8-f16-qc8w-gemm.cc b/bench/qd8-f16-qc8w-gemm.cc index 6de088c0fd8..c46f1602263 100644 --- a/bench/qd8-f16-qc8w-gemm.cc +++ b/bench/qd8-f16-qc8w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qd8-f32-qb4w-gemm.cc b/bench/qd8-f32-qb4w-gemm.cc index 4794bfefb34..175d9b816c3 100644 --- a/bench/qd8-f32-qb4w-gemm.cc +++ b/bench/qd8-f32-qb4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qd8-f32-qc4w-gemm.cc b/bench/qd8-f32-qc4w-gemm.cc index 4e9931b6924..9066e099c1c 100644 --- a/bench/qd8-f32-qc4w-gemm.cc +++ b/bench/qd8-f32-qc4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qd8-f32-qc8w-gemm.cc b/bench/qd8-f32-qc8w-gemm.cc index 838ad87ab76..906d7057611 100644 --- a/bench/qd8-f32-qc8w-gemm.cc +++ b/bench/qd8-f32-qc8w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qp8-f32-qb4w-gemm.cc b/bench/qp8-f32-qb4w-gemm.cc index f769132e5c5..71c64196e4e 100644 --- a/bench/qp8-f32-qb4w-gemm.cc +++ b/bench/qp8-f32-qb4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qp8-f32-qc4w-gemm.cc b/bench/qp8-f32-qc4w-gemm.cc index 4a25dfd0708..cd5af5412eb 100644 --- a/bench/qp8-f32-qc4w-gemm.cc +++ b/bench/qp8-f32-qc4w-gemm.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qs16-qs8-vcvt.cc b/bench/qs16-qs8-vcvt.cc index e17893e50a6..4ea6126a68d 100644 --- a/bench/qs16-qs8-vcvt.cc +++ 
b/bench/qs16-qs8-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -30,7 +30,7 @@ static void qs16_qs8_vcvt( BENCHMARK_CAPTURE(qs16_qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qs16-qs8-vcvt/qs16-qs8-vcvt.h" +#include "qs16-qs8-vcvt/qs16-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/qs8-dwconv.cc b/bench/qs8-dwconv.cc index b5ced99b2df..3d862732f7d 100644 --- a/bench/qs8-dwconv.cc +++ b/bench/qs8-dwconv.cc @@ -4,32 +4,31 @@ // LICENSE file in the root directory of this source tree. #include -#include -#include +#include +#include #include #include #include -#include -#include "bench/dwconv.h" -#include "bench/utils.h" +#include "dwconv.h" +#include "utils.h" #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/indirection.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microkernel-utils.h" #include "xnnpack/microparams-init.h" +#include "xnnpack/microparams.h" #include "xnnpack/pack.h" -#include "xnnpack/buffer.h" #include -static void DWConvBenchmark(benchmark::State& state, - xnn_qs8_dwconv_minmax_unipass_ukernel_fn dwconv, - xnn_init_qs8_conv_minmax_params_fn init_params, - uint32_t channel_tile, uint32_t primary_tile, - benchmark::utils::IsaCheckFunction isa_check = nullptr) -{ +static void DWConvBenchmark( + benchmark::State& state, xnn_qs8_dwconv_minmax_unipass_ukernel_fn dwconv, + xnn_init_qs8_conv_minmax_params_fn init_params, uint32_t channel_tile, + uint32_t primary_tile, + benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check != nullptr && !isa_check(state)) { return; } @@ -52,66 +51,83 @@ static void DWConvBenchmark(benchmark::State& state, std::random_device random_device; auto rng = std::mt19937(random_device()); - auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); - auto i8rng = std::bind( - std::uniform_int_distribution(-std::numeric_limits::max(), std::numeric_limits::max()), std::ref(rng)); + auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), + std::ref(rng)); + auto i8rng = std::bind(std::uniform_int_distribution( + -std::numeric_limits::max(), + std::numeric_limits::max()), + std::ref(rng)); const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1; const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1; const size_t padding_left = padding_width / 2; const size_t padding_top = padding_height / 2; - const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1; - const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1; + const size_t output_height = + (input_height + padding_height - effective_kernel_height) / subsampling + + 1; + const size_t output_width = + (input_width + padding_width - effective_kernel_width) / subsampling + 1; const size_t output_size = output_height * output_width; - const size_t step_width = dilation == 1 ? std::min(subsampling, kernel_width) : kernel_width; - const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height; + const size_t step_width = + dilation == 1 ? 
std::min(subsampling, kernel_width) : kernel_width; + const size_t step_height = + kernel_size + (output_width - 1) * step_width * kernel_height; - const size_t c_stride = benchmark::utils::RoundUp(channels, channel_tile); + const size_t c_stride = + benchmark::utils::RoundUp(channels, channel_tile); - xnnpack::Buffer a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(int8_t)); + xnnpack::Buffer a(channels * input_height * input_width + + XNN_EXTRA_BYTES / sizeof(int8_t)); std::generate(a.begin(), a.end(), std::ref(i8rng)); xnnpack::Buffer k(channels * kernel_height * kernel_width); std::generate(k.begin(), k.end(), std::ref(i8rng)); xnnpack::Buffer b(channels); std::generate(b.begin(), b.end(), std::ref(i32rng)); + // Zero buffer needs to be initialized with zeros. xnnpack::Buffer z(channels + XNN_EXTRA_BYTES / sizeof(int8_t)); + std::fill(z.begin(), z.end(), 0); const size_t k_elements = kernel_size * c_stride; const size_t b_elements = c_stride; - const size_t w_size = k_elements * sizeof(int8_t) + b_elements * sizeof(int32_t); - // Can read (primary_tile - kernel_size) elements after end of indirection buffer. - const size_t i_elements = (primary_tile - kernel_size) + output_height * step_height; + const size_t w_size = + k_elements * sizeof(int8_t) + b_elements * sizeof(int32_t); + // Can read (primary_tile - kernel_size) elements after end of indirection + // buffer. + const size_t i_elements = + (primary_tile - kernel_size) + output_height * step_height; const size_t c_elements = output_size * channels; - const size_t num_buffers = 1 + - benchmark::utils::DivideRoundUp(benchmark::utils::GetMaxCacheSize(), - (c_elements * sizeof(int8_t) + w_size) + sizeof(void*) * i_elements); + const size_t num_buffers = 1 + benchmark::utils::DivideRoundUp( + benchmark::utils::GetMaxCacheSize(), + (c_elements * sizeof(int8_t) + w_size) + + sizeof(void*) * i_elements); + // Explicitly initialize the weights buffer since `num_buffers` may be larger + // than the number of buffers that are actually initialized/needed. xnnpack::Buffer w(w_size * num_buffers); + std::fill(w.begin(), w.end(), 0); + + // Pack the weights buffer. 
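// The packing call below rearranges the GHW-layout kernel and the int32 biases into
// XNNPACK's blocked per-channel-tile weight layout (roughly: for each group of
// `channel_tile` channels, the biases come first, then the taps for each of the
// `primary_tile` kernel positions). The input zero point from `packing_params`
// (zero in this benchmark) is folded into the packed biases at pack time.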
struct xnn_qs8_packing_params packing_params; packing_params.input_zero_point = 0; - xnn_pack_qs8_dwconv_ghw_w(primary_tile, 0, 0, kernel_height, kernel_width, channels, - channel_tile, channel_tile, /*channel_round=*/1, - k.data(), b.data(), /*scale=*/nullptr, w.data(), - /*per_tile_extra_bytes=*/0, /*per_subtile_extra_bytes=*/0, &packing_params); + xnn_pack_qs8_dwconv_ghw_w(primary_tile, 0, 0, kernel_height, kernel_width, + channels, channel_tile, channel_tile, + /*channel_round=*/1, k.data(), b.data(), + /*scale=*/nullptr, w.data(), + /*per_tile_extra_bytes=*/0, + /*per_subtile_extra_bytes=*/0, &packing_params); for (size_t n = 1; n < num_buffers; n++) { std::copy(w.cbegin(), w.cbegin() + w_size, w.begin() + n * w_size); } xnnpack::Buffer i(i_elements * num_buffers); xnn_indirection_init_dwconv2d( - /*output_y_start=*/0, /*output_y_end=*/output_height, - reinterpret_cast(i.data()), - a.data(), - channels << XNN_LOG2_SIZEOF_INT8_T, - z.data(), - input_height, input_width, - output_height, output_width, - kernel_height, kernel_width, - subsampling, subsampling, - dilation, dilation, - padding_top, padding_left, - step_height, step_width, primary_tile); + /*output_y_start=*/0, /*output_y_end=*/output_height, + reinterpret_cast(i.data()), a.data(), + channels << XNN_LOG2_SIZEOF_INT8_T, z.data(), input_height, input_width, + output_height, output_width, kernel_height, kernel_width, subsampling, + subsampling, dilation, dilation, padding_top, padding_left, step_height, + step_width, primary_tile); for (size_t n = 1; n < num_buffers; n++) { std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements); } @@ -119,8 +135,9 @@ static void DWConvBenchmark(benchmark::State& state, xnnpack::Buffer c(c_elements * num_buffers); xnn_qs8_conv_minmax_params params; - init_params(¶ms, - 0.5f /* scale */, 0 /* output zero point */, std::numeric_limits::min(), std::numeric_limits::max()); + init_params(¶ms, 0.5f /* scale */, 0 /* output zero point */, + std::numeric_limits::min(), + std::numeric_limits::max()); size_t buffer_index = 0; for (auto _ : state) { @@ -131,11 +148,11 @@ static void DWConvBenchmark(benchmark::State& state, for (size_t y = 0; y < output_height; y++) { dwconv(channels, output_width, - i.data() + buffer_index * i_elements + step_height * y, - w.data() + buffer_index * w_size, - c.data() + buffer_index * c_elements + y * output_width * channels, - kernel_height * step_width * sizeof(void*), 0, - 0, z.data(), ¶ms); + i.data() + buffer_index * i_elements + step_height * y, + w.data() + buffer_index * w_size, + c.data() + buffer_index * c_elements + y * output_width * channels, + kernel_height * step_width * sizeof(void*), 0, 0, z.data(), + ¶ms); } } @@ -144,13 +161,17 @@ static void DWConvBenchmark(benchmark::State& state, state.counters["cpufreq"] = cpu_frequency; } - state.counters["OPS"] = benchmark::Counter( - uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size, - benchmark::Counter::kIsRate); + state.counters["OPS"] = + benchmark::Counter(static_cast(state.iterations()) * 2 * + output_size * channels * kernel_size, + benchmark::Counter::kIsRate); state.counters["bytes"] = benchmark::Counter( - uint64_t(state.iterations()) * channels * ((output_size + input_height * input_width + kernel_size) * sizeof(int8_t) + sizeof(int32_t)), - benchmark::Counter::kIsRate); + static_cast(state.iterations()) * channels * + ((output_size + input_height * input_width + kernel_size) * + sizeof(int8_t) + + sizeof(int32_t)), + benchmark::Counter::kIsRate); } static void 
DWConvBenchmark(benchmark::State& state, @@ -679,8 +700,7 @@ static void DWConvBenchmark(benchmark::State& state, #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) static void qs8_dwconv_9p16c__avx512skx_mul32(benchmark::State& state, const char* net) { DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, @@ -693,6 +713,12 @@ static void DWConvBenchmark(benchmark::State& state, xnn_init_qs8_conv_minmax_fp32_scalar_params, 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX); } + + BENCHMARK_DWCONV(qs8_dwconv_9p16c__avx512skx_mul32); + BENCHMARK_DWCONV(qs8_dwconv_9p32c__avx512skx_mul32); +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void qs8_dwconv_9p16c__avx2_mul16_vpmovsx(benchmark::State& state, const char* net) { DWConvBenchmark(state, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul16_vpmovsx, @@ -1239,9 +1265,6 @@ static void DWConvBenchmark(benchmark::State& state, benchmark::utils::CheckAVX2); } - BENCHMARK_DWCONV(qs8_dwconv_9p16c__avx512skx_mul32); - BENCHMARK_DWCONV(qs8_dwconv_9p32c__avx512skx_mul32); - BENCHMARK_DWCONV(qs8_dwconv_9p16c__avx2_mul16_vpmovsx); BENCHMARK_DWCONV(qs8_dwconv_9p32c__avx2_mul16_vpmovsx); BENCHMARK_DWCONV(qs8_dwconv_9p16c__avx2_mul16_vpunpck); diff --git a/bench/qs8-f16-vcvt.cc b/bench/qs8-f16-vcvt.cc index 8feb7e15af9..bcf73e25966 100644 --- a/bench/qs8-f16-vcvt.cc +++ b/bench/qs8-f16-vcvt.cc @@ -3,16 +3,17 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" #include "xnnpack/microparams-init.h" +#include "xnnpack/microparams.h" #include "xnnpack/vcvt.h" +#include static void qs8_f16_vcvt( benchmark::State& state, @@ -33,7 +34,7 @@ static void qs8_f16_vcvt( BENCHMARK_CAPTURE(qs8_f16_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qs8-f16-vcvt/qs8-f16-vcvt.h" +#include "qs8-f16-vcvt/qs8-f16-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/qs8-f32-vcvt.cc b/bench/qs8-f32-vcvt.cc index 8bac67bc1f1..5dc917aa950 100644 --- a/bench/qs8-f32-vcvt.cc +++ b/bench/qs8-f32-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. 
#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -32,7 +32,7 @@ static void qs8_f32_vcvt( BENCHMARK_CAPTURE(qs8_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qs8-f32-vcvt/qs8-f32-vcvt.h" +#include "qs8-f32-vcvt/qs8-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc index 1590c078f59..50e436e06a3 100644 --- a/bench/qs8-gemm.cc +++ b/bench/qs8-gemm.cc @@ -14,8 +14,8 @@ #include #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #ifdef BENCHMARK_RUY #include "ruy/ruy.h" #endif // BENCHMARK_RUY diff --git a/bench/qs8-packw.cc b/bench/qs8-packw.cc index 6b54534823a..dcd8971c795 100644 --- a/bench/qs8-packw.cc +++ b/bench/qs8-packw.cc @@ -5,9 +5,9 @@ #include -#include "bench/bgemm.h" -#include "bench/packw-benchmark.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "packw-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" #include "xnnpack/packw.h" @@ -22,7 +22,7 @@ static void qs8_packw(benchmark::State& state, const char* net, #define XNN_QS8_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale, izp) \ BENCHMARK_CAPTURE_BGEMM(qs8_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); -#include "src/qs8-packw/qs8-packw.h" +#include "qs8-packw/qs8-packw.h" #undef XNN_QS8_UKERNEL diff --git a/bench/qs8-qc8w-gemm-fp32.cc b/bench/qs8-qc8w-gemm-fp32.cc index 905f3fc491d..795f8393620 100644 --- a/bench/qs8-qc8w-gemm-fp32.cc +++ b/bench/qs8-qc8w-gemm-fp32.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qs8-rdsum.cc b/bench/qs8-rdsum.cc index f227b87b189..0a136758c59 100644 --- a/bench/qs8-rdsum.cc +++ b/bench/qs8-rdsum.cc @@ -7,8 +7,8 @@ // Specification: test/qs8-rdsum-minmax-fp32.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/qs8-requantization.cc b/bench/qs8-requantization.cc index 3b1d63e6927..cff6e6934aa 100644 --- a/bench/qs8-requantization.cc +++ b/bench/qs8-requantization.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -72,12 +72,6 @@ static void qs8_requantization( ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndna__neon, - xnn_qs8_requantize_rndna__neon, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndnu__neon_mull, xnn_qs8_requantize_rndnu__neon_mull, benchmark::utils::CheckNEON) @@ -116,21 +110,6 @@ static void qs8_requantization( ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndna__sse2, - xnn_qs8_requantize_rndna__sse2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - 
BENCHMARK_CAPTURE(qs8_requantization, rndna__ssse3, - xnn_qs8_requantize_rndna__ssse3, - benchmark::utils::CheckSSSE3) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndna__sse41, - xnn_qs8_requantize_rndna__sse41, - benchmark::utils::CheckSSE41) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndnu__sse41_sra, xnn_qs8_requantize_rndnu__sse41_sra, benchmark::utils::CheckSSE41) @@ -169,19 +148,6 @@ BENCHMARK_CAPTURE(qs8_requantization, gemmlowp__scalar, ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); -BENCHMARK_CAPTURE(qs8_requantization, rndna__scalar_signed64, - xnn_qs8_requantize_rndna__scalar_signed64) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(qs8_requantization, rndna__scalar_unsigned32, - xnn_qs8_requantize_rndna__scalar_unsigned32) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(qs8_requantization, rndna__scalar_unsigned64, - xnn_qs8_requantize_rndna__scalar_unsigned64) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qs8_requantization, rndnu__scalar, xnn_qs8_requantize_rndnu__scalar) ->Apply(benchmark::utils::UnaryElementwiseParameters) diff --git a/bench/qs8-rsum.cc b/bench/qs8-rsum.cc index 5b4f1a319ca..2651d3625ef 100644 --- a/bench/qs8-rsum.cc +++ b/bench/qs8-rsum.cc @@ -7,8 +7,8 @@ // Specification: test/qs8-rsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/qs8-vcvt.cc b/bench/qs8-vcvt.cc index fc4009e787f..765c08b107b 100644 --- a/bench/qs8-vcvt.cc +++ b/bench/qs8-vcvt.cc @@ -5,8 +5,8 @@ #include #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -31,7 +31,7 @@ static void qs8_vcvt( BENCHMARK_CAPTURE(qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qs8-vcvt/qs8-vcvt.h" +#include "qs8-vcvt/qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/qu8-f32-vcvt.cc b/bench/qu8-f32-vcvt.cc index 08c9571b28b..f90bf47d203 100644 --- a/bench/qu8-f32-vcvt.cc +++ b/bench/qu8-f32-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. 
#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -32,7 +32,7 @@ static void qu8_f32_vcvt( BENCHMARK_CAPTURE(qu8_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qu8-f32-vcvt/qu8-f32-vcvt.h" +#include "qu8-f32-vcvt/qu8-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/qu8-gemm-fp32.cc b/bench/qu8-gemm-fp32.cc index e7fc2ba3364..67f1ce5f5c2 100644 --- a/bench/qu8-gemm-fp32.cc +++ b/bench/qu8-gemm-fp32.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qu8-gemm-rndnu.cc b/bench/qu8-gemm-rndnu.cc index ac374a7a12e..c812703d5e8 100644 --- a/bench/qu8-gemm-rndnu.cc +++ b/bench/qu8-gemm-rndnu.cc @@ -8,8 +8,8 @@ // Generator: tools/generate-gemm-test.py #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/bench/qu8-gemm.cc b/bench/qu8-gemm.cc index 89514b12aa0..2ecea2ff721 100644 --- a/bench/qu8-gemm.cc +++ b/bench/qu8-gemm.cc @@ -24,8 +24,8 @@ #ifdef BENCHMARK_RUY #include "ruy/ruy.h" #endif // BENCHMARK_RUY -#include "bench/gemm.h" -#include "bench/utils.h" +#include "gemm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" @@ -537,8 +537,7 @@ static void ruy_st(benchmark::State& state, const char* net) BENCHMARK_GEMM(qu8_gemm_2x2c4__armsimd32) #endif // XNN_ARCH_ARM - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) static void qu8_gemm_1x16c8__avx512skx(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx, @@ -546,6 +545,11 @@ static void ruy_st(benchmark::State& state, const char* net) 1, 16, 8, 1, benchmark::utils::CheckAVX512SKX); } + + BENCHMARK_GEMM(qu8_gemm_1x16c8__avx512skx) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 static void qu8_gemm_1x8c8__avx2(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2, @@ -849,8 +853,6 @@ static void ruy_st(benchmark::State& state, const char* net) /*mr=*/3, /*nr=*/4, /*kr=*/8, /*sr=*/1); } - BENCHMARK_GEMM(qu8_gemm_1x16c8__avx512skx) - BENCHMARK_GEMM(qu8_gemm_1x8c8__avx2) BENCHMARK_GEMM(qu8_gemm_2x8c8__avx2) BENCHMARK_GEMM(qu8_gemm_3x8c8__avx2) diff --git a/bench/qu8-rdsum.cc b/bench/qu8-rdsum.cc index b61c39de3ed..747117bcc11 100644 --- a/bench/qu8-rdsum.cc +++ b/bench/qu8-rdsum.cc @@ -7,8 +7,8 @@ // Specification: test/qu8-rdsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/qu8-requantization.cc b/bench/qu8-requantization.cc index d01936b587e..301872bf193 100644 --- a/bench/qu8-requantization.cc +++ b/bench/qu8-requantization.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include 
"xnnpack/common.h" #include "xnnpack/microfnptr.h" @@ -71,12 +71,6 @@ static void qu8_requantization( benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - - BENCHMARK_CAPTURE(qu8_requantization, rndna__neon, - xnn_qu8_requantize_rndna__neon, - benchmark::utils::CheckNEON) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -99,21 +93,6 @@ static void qu8_requantization( benchmark::utils::CheckSSE41) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); - - BENCHMARK_CAPTURE(qu8_requantization, rndna__sse2, - xnn_qu8_requantize_rndna__sse2) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qu8_requantization, rndna__ssse3, - xnn_qu8_requantize_rndna__ssse3, - benchmark::utils::CheckSSSE3) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - BENCHMARK_CAPTURE(qu8_requantization, rndna__sse41, - xnn_qu8_requantize_rndna__sse41, - benchmark::utils::CheckSSE41) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -142,19 +121,6 @@ BENCHMARK_CAPTURE(qu8_requantization, gemmlowp__scalar, ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); -BENCHMARK_CAPTURE(qu8_requantization, rndna__scalar_signed64, - xnn_qu8_requantize_rndna__scalar_signed64) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(qu8_requantization, rndna__scalar_unsigned32, - xnn_qu8_requantize_rndna__scalar_unsigned32) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); -BENCHMARK_CAPTURE(qu8_requantization, rndna__scalar_unsigned64, - xnn_qu8_requantize_rndna__scalar_unsigned64) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - #ifndef XNNPACK_BENCHMARK_NO_MAIN BENCHMARK_MAIN(); #endif diff --git a/bench/qu8-rsum.cc b/bench/qu8-rsum.cc index f5c7ff8b2fb..54982589776 100644 --- a/bench/qu8-rsum.cc +++ b/bench/qu8-rsum.cc @@ -7,8 +7,8 @@ // Specification: test/qu8-rsum.yaml // Generator: tools/generate-rdsum-benchmark.py -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/bench/qu8-vcvt.cc b/bench/qu8-vcvt.cc index 26a5d911075..f43be113cd2 100644 --- a/bench/qu8-vcvt.cc +++ b/bench/qu8-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. 
#include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -30,7 +30,7 @@ static void qu8_vcvt( BENCHMARK_CAPTURE(qu8_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/qu8-vcvt/qu8-vcvt.h" +#include "qu8-vcvt/qu8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/reciprocal-square-root.cc b/bench/reciprocal-square-root.cc index 858a193afa4..2ce8ebe1003 100644 --- a/bench/reciprocal-square-root.cc +++ b/bench/reciprocal-square-root.cc @@ -6,7 +6,7 @@ #include "xnnpack.h" #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/rsum-benchmark.h b/bench/rsum-benchmark.h index 31327107d92..1aa40fc2d10 100644 --- a/bench/rsum-benchmark.h +++ b/bench/rsum-benchmark.h @@ -12,13 +12,14 @@ #include #include -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/reduce.h" -#include "xnnpack/buffer.h" #include namespace { diff --git a/bench/s32-f32-vcvt.cc b/bench/s32-f32-vcvt.cc index a5f8748247c..a90c238e95f 100644 --- a/bench/s32-f32-vcvt.cc +++ b/bench/s32-f32-vcvt.cc @@ -4,8 +4,8 @@ // LICENSE file in the root directory of this source tree. #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -30,7 +30,7 @@ static void s32_f32_vcvt( BENCHMARK_CAPTURE(s32_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/s32-f32-vcvt/s32-f32-vcvt.h" +#include "s32-f32-vcvt/s32-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/scaled-dot-product-attention.cc b/bench/scaled-dot-product-attention.cc index ec89d58d71c..17043f7f715 100644 --- a/bench/scaled-dot-product-attention.cc +++ b/bench/scaled-dot-product-attention.cc @@ -15,7 +15,7 @@ #include "xnnpack.h" #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/buffer.h" void xnnpack_multihead_scaled_batch_matrix_multiply_cap_tanh_f32(benchmark::State& state, const char* net) { diff --git a/bench/sigmoid.cc b/bench/sigmoid.cc index 6344a8a0363..1c34fcdb52e 100644 --- a/bench/sigmoid.cc +++ b/bench/sigmoid.cc @@ -3,13 +3,13 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include #include #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "flatbuffers/include/flatbuffers/flatbuffer_builder.h" diff --git a/bench/softmax.cc b/bench/softmax.cc index 5672efe6ab4..3dc57217889 100644 --- a/bench/softmax.cc +++ b/bench/softmax.cc @@ -14,7 +14,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/math.h" #include "xnnpack/buffer.h" diff --git a/bench/spmm-benchmark.h b/bench/spmm-benchmark.h index b0d02b57e9e..300a747d082 100644 --- a/bench/spmm-benchmark.h +++ b/bench/spmm-benchmark.h @@ -5,8 +5,8 @@ #pragma once -#include "bench/spmm.h" -#include "bench/utils.h" +#include "spmm.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" diff --git a/bench/square-root.cc b/bench/square-root.cc index 5411a4c7feb..e7d56c97a06 100644 --- a/bench/square-root.cc +++ b/bench/square-root.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/square.cc b/bench/square.cc index c794322017a..17d1c4fbc90 100644 --- a/bench/square.cc +++ b/bench/square.cc @@ -3,10 +3,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include "xnnpack.h" - #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/tanh.cc b/bench/tanh.cc index ad30bbfbb10..02b8f8113b0 100644 --- a/bench/tanh.cc +++ b/bench/tanh.cc @@ -3,13 +3,13 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
-#include "xnnpack.h" - #include #include #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" +#include "xnnpack.h" +#include "xnnpack/math.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "flatbuffers/include/flatbuffers/flatbuffer_builder.h" diff --git a/bench/truncation.cc b/bench/truncation.cc index 2358fc3e445..4b5b3d780d9 100644 --- a/bench/truncation.cc +++ b/bench/truncation.cc @@ -6,7 +6,7 @@ #include "xnnpack.h" #include "unary_operator.h" -#include "bench/utils.h" +#include "utils.h" #include #ifdef BENCHMARK_TENSORFLOW_LITE #include "tensorflow/lite/schema/schema_generated.h" diff --git a/bench/u32-f32-vcvt.cc b/bench/u32-f32-vcvt.cc index 44d70e02723..4f570cf1cad 100644 --- a/bench/u32-f32-vcvt.cc +++ b/bench/u32-f32-vcvt.cc @@ -9,8 +9,8 @@ #include -#include "bench/utils.h" -#include "bench/vcvt-benchmark.h" +#include "utils.h" +#include "vcvt-benchmark.h" #include "xnnpack.h" #include "xnnpack/hardware-config.h" #include "xnnpack/microfnptr.h" @@ -35,7 +35,7 @@ static void u32_f32_vcvt( BENCHMARK_CAPTURE(u32_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ ->Apply(benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/u32-f32-vcvt/u32-f32-vcvt.h" +#include "u32-f32-vcvt/u32-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/bench/unary_operator.h b/bench/unary_operator.h index f8b6eb7a59c..2ad458acdbe 100644 --- a/bench/unary_operator.h +++ b/bench/unary_operator.h @@ -17,7 +17,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack/math.h" #include "xnnpack/buffer.h" #include diff --git a/bench/utils.cc b/bench/utils.cc index d612d964cd1..5239d10e980 100644 --- a/bench/utils.cc +++ b/bench/utils.cc @@ -28,7 +28,7 @@ #include "xnnpack/allocator.h" #include "xnnpack/hardware-config.h" -#include "bench/utils.h" +#include "utils.h" static void* wipe_buffer = nullptr; static size_t wipe_buffer_size = 0; diff --git a/bench/vbinary.cc b/bench/vbinary.cc index 526439f412a..7d24e90a663 100644 --- a/bench/vbinary.cc +++ b/bench/vbinary.cc @@ -13,7 +13,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" @@ -156,59 +156,59 @@ static void vbinary(benchmark::State& state, uint64_t arch_flags, ->Apply( \ benchmark::utils::BinaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f16-vbinary/f16-vadd.h" -#include "src/f16-vbinary/f16-vaddc.h" -#include "src/f16-vbinary/f16-vdiv.h" -#include "src/f16-vbinary/f16-vdivc.h" -#include "src/f16-vbinary/f16-vmax.h" -#include "src/f16-vbinary/f16-vmaxc.h" -#include "src/f16-vbinary/f16-vmin.h" -#include "src/f16-vbinary/f16-vminc.h" -#include "src/f16-vbinary/f16-vmul.h" -#include "src/f16-vbinary/f16-vmulc.h" -#include "src/f16-vbinary/f16-vprelu.h" -#include "src/f16-vbinary/f16-vpreluc.h" -#include "src/f16-vbinary/f16-vrdivc.h" -#include "src/f16-vbinary/f16-vrpreluc.h" -#include "src/f16-vbinary/f16-vrsubc.h" -#include "src/f16-vbinary/f16-vsqrdiff.h" -#include "src/f16-vbinary/f16-vsqrdiffc.h" -#include "src/f16-vbinary/f16-vsub.h" -#include "src/f16-vbinary/f16-vsubc.h" -#include "src/f32-vbinary/f32-vadd.h" -#include "src/f32-vbinary/f32-vaddc.h" -#include "src/f32-vbinary/f32-vcopysign.h" -#include "src/f32-vbinary/f32-vcopysignc.h" -#include "src/f32-vbinary/f32-vdiv.h" -#include "src/f32-vbinary/f32-vdivc.h" -#include "src/f32-vbinary/f32-vmax.h" -#include "src/f32-vbinary/f32-vmaxc.h" -#include "src/f32-vbinary/f32-vmin.h" -#include 
"src/f32-vbinary/f32-vminc.h" -#include "src/f32-vbinary/f32-vmul.h" -#include "src/f32-vbinary/f32-vmulc.h" -#include "src/f32-vbinary/f32-vprelu.h" -#include "src/f32-vbinary/f32-vpreluc.h" -#include "src/f32-vbinary/f32-vrcopysignc.h" -#include "src/f32-vbinary/f32-vrdivc.h" -#include "src/f32-vbinary/f32-vrpreluc.h" -#include "src/f32-vbinary/f32-vrsubc.h" -#include "src/f32-vbinary/f32-vsqrdiff.h" -#include "src/f32-vbinary/f32-vsqrdiffc.h" -#include "src/f32-vbinary/f32-vsub.h" -#include "src/f32-vbinary/f32-vsubc.h" -#include "src/qs8-vadd/qs8-vadd-minmax.h" -#include "src/qs8-vaddc/qs8-vaddc-minmax.h" -#include "src/qs8-vmul/qs8-vmul-minmax-fp32.h" -#include "src/qs8-vmul/qs8-vmul-minmax-rndnu.h" -#include "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h" -#include "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h" -#include "src/qu8-vadd/qu8-vadd-minmax.h" -#include "src/qu8-vaddc/qu8-vaddc-minmax.h" -#include "src/qu8-vmul/qu8-vmul-minmax-fp32.h" -#include "src/qu8-vmul/qu8-vmul-minmax-rndnu.h" -#include "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h" -#include "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h" +#include "f16-vbinary/f16-vadd.h" +#include "f16-vbinary/f16-vaddc.h" +#include "f16-vbinary/f16-vdiv.h" +#include "f16-vbinary/f16-vdivc.h" +#include "f16-vbinary/f16-vmax.h" +#include "f16-vbinary/f16-vmaxc.h" +#include "f16-vbinary/f16-vmin.h" +#include "f16-vbinary/f16-vminc.h" +#include "f16-vbinary/f16-vmul.h" +#include "f16-vbinary/f16-vmulc.h" +#include "f16-vbinary/f16-vprelu.h" +#include "f16-vbinary/f16-vpreluc.h" +#include "f16-vbinary/f16-vrdivc.h" +#include "f16-vbinary/f16-vrpreluc.h" +#include "f16-vbinary/f16-vrsubc.h" +#include "f16-vbinary/f16-vsqrdiff.h" +#include "f16-vbinary/f16-vsqrdiffc.h" +#include "f16-vbinary/f16-vsub.h" +#include "f16-vbinary/f16-vsubc.h" +#include "f32-vbinary/f32-vadd.h" +#include "f32-vbinary/f32-vaddc.h" +#include "f32-vbinary/f32-vcopysign.h" +#include "f32-vbinary/f32-vcopysignc.h" +#include "f32-vbinary/f32-vdiv.h" +#include "f32-vbinary/f32-vdivc.h" +#include "f32-vbinary/f32-vmax.h" +#include "f32-vbinary/f32-vmaxc.h" +#include "f32-vbinary/f32-vmin.h" +#include "f32-vbinary/f32-vminc.h" +#include "f32-vbinary/f32-vmul.h" +#include "f32-vbinary/f32-vmulc.h" +#include "f32-vbinary/f32-vprelu.h" +#include "f32-vbinary/f32-vpreluc.h" +#include "f32-vbinary/f32-vrcopysignc.h" +#include "f32-vbinary/f32-vrdivc.h" +#include "f32-vbinary/f32-vrpreluc.h" +#include "f32-vbinary/f32-vrsubc.h" +#include "f32-vbinary/f32-vsqrdiff.h" +#include "f32-vbinary/f32-vsqrdiffc.h" +#include "f32-vbinary/f32-vsub.h" +#include "f32-vbinary/f32-vsubc.h" +#include "qs8-vadd/qs8-vadd-minmax.h" +#include "qs8-vaddc/qs8-vaddc-minmax.h" +#include "qs8-vmul/qs8-vmul-minmax-fp32.h" +#include "qs8-vmul/qs8-vmul-minmax-rndnu.h" +#include "qs8-vmulc/qs8-vmulc-minmax-fp32.h" +#include "qs8-vmulc/qs8-vmulc-minmax-rndnu.h" +#include "qu8-vadd/qu8-vadd-minmax.h" +#include "qu8-vaddc/qu8-vaddc-minmax.h" +#include "qu8-vmul/qu8-vmul-minmax-fp32.h" +#include "qu8-vmul/qu8-vmul-minmax-rndnu.h" +#include "qu8-vmulc/qu8-vmulc-minmax-fp32.h" +#include "qu8-vmulc/qu8-vmulc-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/vcvt-benchmark.h b/bench/vcvt-benchmark.h index db65e7d2fb6..a97a235fc66 100644 --- a/bench/vcvt-benchmark.h +++ b/bench/vcvt-benchmark.h @@ -8,7 +8,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/buffer.h" #include diff --git a/bench/vunary.cc b/bench/vunary.cc index 
a41b83222f3..30ef008f441 100644 --- a/bench/vunary.cc +++ b/bench/vunary.cc @@ -14,16 +14,16 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" -#include "xnnpack/vhswish.h" -#include "xnnpack/vlrelu.h" -#include "xnnpack/buffer.h" +#include "xnnpack/vunary.h" #include template @@ -197,45 +197,45 @@ void vunary(benchmark::State& state, uint64_t arch_flags, ->Apply( \ benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); -#include "src/f16-vabs/f16-vabs.h" -#include "src/f16-vclamp/f16-vclamp.h" -#include "src/f16-velu/f16-velu.h" -#include "src/f16-vhswish/f16-vhswish.h" -#include "src/f16-vlrelu/f16-vlrelu.h" -#include "src/f16-vneg/f16-vneg.h" -#include "src/f16-vrnd/f16-vrndd.h" -#include "src/f16-vrnd/f16-vrndne.h" -#include "src/f16-vrnd/f16-vrndu.h" -#include "src/f16-vrnd/f16-vrndz.h" -#include "src/f16-vrsqrt/f16-vrsqrt.h" -#include "src/f16-vsigmoid/f16-vsigmoid.h" -#include "src/f16-vsqr/f16-vsqr.h" -#include "src/f16-vsqrt/f16-vsqrt.h" -#include "src/f16-vtanh/f16-vtanh.h" -#include "src/f32-vabs/f32-vabs.h" -#include "src/f32-vclamp/f32-vclamp.h" -#include "src/f32-velu/f32-velu.h" -#include "src/f32-vgelu/f32-vgelu.h" -#include "src/f32-vhswish/f32-vhswish.h" -#include "src/f32-vlog/f32-vlog.h" -#include "src/f32-vlrelu/f32-vlrelu.h" -#include "src/f32-vneg/f32-vneg.h" -#include "src/f32-vrelu/f32-vrelu.h" -#include "src/f32-vrnd/f32-vrndd.h" -#include "src/f32-vrnd/f32-vrndne.h" -#include "src/f32-vrnd/f32-vrndu.h" -#include "src/f32-vrnd/f32-vrndz.h" -#include "src/f32-vrsqrt/f32-vrsqrt.h" -#include "src/f32-vsigmoid/f32-vsigmoid.h" -#include "src/f32-vsqr/f32-vsqr.h" -#include "src/f32-vsqrt/f32-vsqrt.h" -#include "src/f32-vtanh/f32-vtanh.h" -#include "src/qs8-vhswish/qs8-vhswish.h" -#include "src/qs8-vlrelu/qs8-vlrelu.h" -#include "src/qu8-vhswish/qu8-vhswish.h" -#include "src/qu8-vlrelu/qu8-vlrelu.h" -#include "src/s8-vclamp/s8-vclamp.h" -#include "src/u8-vclamp/u8-vclamp.h" +#include "f16-vabs/f16-vabs.h" +#include "f16-vclamp/f16-vclamp.h" +#include "f16-velu/f16-velu.h" +#include "f16-vhswish/f16-vhswish.h" +#include "f16-vlrelu/f16-vlrelu.h" +#include "f16-vneg/f16-vneg.h" +#include "f16-vrnd/f16-vrndd.h" +#include "f16-vrnd/f16-vrndne.h" +#include "f16-vrnd/f16-vrndu.h" +#include "f16-vrnd/f16-vrndz.h" +#include "f16-vrsqrt/f16-vrsqrt.h" +#include "f16-vsigmoid/f16-vsigmoid.h" +#include "f16-vsqr/f16-vsqr.h" +#include "f16-vsqrt/f16-vsqrt.h" +#include "f16-vtanh/f16-vtanh.h" +#include "f32-vabs/f32-vabs.h" +#include "f32-vclamp/f32-vclamp.h" +#include "f32-velu/f32-velu.h" +#include "f32-vgelu/f32-vgelu.h" +#include "f32-vhswish/f32-vhswish.h" +#include "f32-vlog/f32-vlog.h" +#include "f32-vlrelu/f32-vlrelu.h" +#include "f32-vneg/f32-vneg.h" +#include "f32-vrelu/f32-vrelu.h" +#include "f32-vrnd/f32-vrndd.h" +#include "f32-vrnd/f32-vrndne.h" +#include "f32-vrnd/f32-vrndu.h" +#include "f32-vrnd/f32-vrndz.h" +#include "f32-vrsqrt/f32-vrsqrt.h" +#include "f32-vsigmoid/f32-vsigmoid.h" +#include "f32-vsqr/f32-vsqr.h" +#include "f32-vsqrt/f32-vsqrt.h" +#include "f32-vtanh/f32-vtanh.h" +#include "qs8-vhswish/qs8-vhswish.h" +#include "qs8-vlrelu/qs8-vlrelu.h" +#include "qu8-vhswish/qu8-vhswish.h" +#include "qu8-vlrelu/qu8-vlrelu.h" +#include "s8-vclamp/s8-vclamp.h" +#include "u8-vclamp/u8-vclamp.h" #undef 
XNN_UKERNEL_WITH_PARAMS #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/x16-packw.cc b/bench/x16-packw.cc index 74a11000bc4..a2b699bc3fc 100644 --- a/bench/x16-packw.cc +++ b/bench/x16-packw.cc @@ -5,9 +5,9 @@ #include -#include "bench/bgemm.h" -#include "bench/packw-benchmark.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "packw-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" #include "xnnpack/packw.h" @@ -22,7 +22,7 @@ static void x16_packw(benchmark::State& state, const char* net, #define XNN_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) \ BENCHMARK_CAPTURE_BGEMM(x16_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); -#include "src/x16-packw/x16-packw.h" +#include "x16-packw/x16-packw.h" #undef XNN_UKERNEL diff --git a/bench/x32-packw.cc b/bench/x32-packw.cc index 1d2e9b52551..a8663f9e4cd 100644 --- a/bench/x32-packw.cc +++ b/bench/x32-packw.cc @@ -5,9 +5,9 @@ #include -#include "bench/bgemm.h" -#include "bench/packw-benchmark.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "packw-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" #include "xnnpack/packw.h" @@ -22,7 +22,7 @@ static void x32_packw(benchmark::State& state, const char* net, #define XNN_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) \ BENCHMARK_CAPTURE_BGEMM(x32_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); -#include "src/x32-packw/x32-packw.h" +#include "x32-packw/x32-packw.h" #undef XNN_UKERNEL diff --git a/bench/x8-lut.cc b/bench/x8-lut.cc index 29d1263e648..5cd9bafadc1 100644 --- a/bench/x8-lut.cc +++ b/bench/x8-lut.cc @@ -10,7 +10,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/lut.h" @@ -74,7 +74,7 @@ static void x8_lut( ->UseRealTime(); #endif // XNN_ARCH_ARM64 -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(x8_lut, avx512vbmi_vpermx2b_u64, xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u64, benchmark::utils::CheckAVX512VBMI) @@ -95,7 +95,9 @@ static void x8_lut( benchmark::utils::CheckAVX512VBMI) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) BENCHMARK_CAPTURE(x8_lut, avx512skx_vpshufb_u64, xnn_x8_lut_ukernel__avx512skx_vpshufb_u64, benchmark::utils::CheckAVX512SKX) @@ -116,7 +118,9 @@ static void x8_lut( benchmark::utils::CheckAVX512SKX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(x8_lut, avx2_u32, xnn_x8_lut_ukernel__avx2_u32, benchmark::utils::CheckAVX2) diff --git a/bench/x8-packq.cc b/bench/x8-packq.cc index aa5839fae92..b69755239b3 100644 --- a/bench/x8-packq.cc +++ b/bench/x8-packq.cc @@ -5,9 +5,9 @@ #include -#include "bench/bgemm.h" -#include "bench/packq-benchmark.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "packq-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/packq.h" @@ -31,7 +31,7 @@ BENCHMARK_CAPTURE_BGEMM(x8_packq, ukernel##_mr4_kr1_, ukernel, arch_flags, /*mr= BENCHMARK_CAPTURE_BGEMM(x8_packq, ukernel##_mr4_kr2_, ukernel, arch_flags, /*mr=*/4, /*kr=*/2); \ BENCHMARK_CAPTURE_BGEMM(x8_packq, ukernel##_mr4_kr4_, ukernel, arch_flags, /*mr=*/4, 
/*kr=*/4); -#include "src/x8-packq/x8-packq.h" +#include "x8-packq/x8-packq.h" #undef XNN_UKERNEL diff --git a/bench/x8-packw.cc b/bench/x8-packw.cc index cbaffdd79b5..439fcf3f1c0 100644 --- a/bench/x8-packw.cc +++ b/bench/x8-packw.cc @@ -5,9 +5,9 @@ #include -#include "bench/bgemm.h" -#include "bench/packw-benchmark.h" -#include "bench/utils.h" +#include "bgemm.h" +#include "packw-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" #include "xnnpack/packw.h" @@ -22,7 +22,7 @@ static void x8_packw(benchmark::State& state, const char* net, #define XNN_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) \ BENCHMARK_CAPTURE_BGEMM(x8_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); -#include "src/x8-packw/x8-packw.h" +#include "x8-packw/x8-packw.h" #undef XNN_UKERNEL #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/xN-transposec.cc b/bench/xN-transposec.cc index 981485561e5..a642dc0b38f 100644 --- a/bench/xN-transposec.cc +++ b/bench/xN-transposec.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/hardware-config.h" @@ -65,11 +65,11 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b) element_size) \ ->Apply(BenchmarkKernelSize) \ ->UseRealTime(); -#include "src/x8-transposec/x8-transposec.h" -#include "src/x16-transposec/x16-transposec.h" -#include "src/x24-transposec/x24-transposec.h" -#include "src/x32-transposec/x32-transposec.h" -#include "src/x64-transposec/x64-transposec.h" +#include "x8-transposec/x8-transposec.h" +#include "x16-transposec/x16-transposec.h" +#include "x24-transposec/x24-transposec.h" +#include "x32-transposec/x32-transposec.h" +#include "x64-transposec/x64-transposec.h" #undef XNN_TRANSPOSE_UKERNEL #ifndef XNNPACK_BENCHMARK_NO_MAIN diff --git a/bench/xx-transposev.cc b/bench/xx-transposev.cc index 9621a771cc0..6f5407abb1e 100644 --- a/bench/xx-transposev.cc +++ b/bench/xx-transposev.cc @@ -9,7 +9,7 @@ #include #include -#include "bench/utils.h" +#include "utils.h" #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/microfnptr.h" diff --git a/build_params.bzl b/build_params.bzl index d4f27a09099..d3deb037f5e 100644 --- a/build_params.bzl +++ b/build_params.bzl @@ -119,6 +119,10 @@ def xnnpack_configurable_defines(): ":avx512skx_enabled", ["XNN_ENABLE_AVX512SKX=1"], ["XNN_ENABLE_AVX512SKX=0"], + ) + xnnpack_select_if( + ":avx512vbmi_enabled", + ["XNN_ENABLE_AVX512VBMI=1"], + ["XNN_ENABLE_AVX512VBMI=0"], ) + xnnpack_select_if( ":avx512vnni_enabled", ["XNN_ENABLE_AVX512VNNI=1"], @@ -645,7 +649,7 @@ XNNPACK_PARAMS_FOR_ARCH = { msys_copts = ["-fno-asynchronous-unwind-tables"], ), "avx512vbmi": _create_params( - cond = "//build_config:x86", + cond = "//:avx512vbmi_enabled", gcc_x86_copts = [ "-mf16c", "-mfma", diff --git a/build_srcs.bzl b/build_srcs.bzl index 6d9746dcf6b..8ee26ea8578 100644 --- a/build_srcs.bzl +++ b/build_srcs.bzl @@ -24,7 +24,6 @@ OPERATOR_SRCS = [ "src/operators/global-average-pooling-nwc.c", "src/operators/lut-elementwise-nc.c", "src/operators/max-pooling-nhwc.c", - "src/operators/prelu-nc.c", "src/operators/reduce-nd.c", "src/operators/resize-bilinear-nchw.c", "src/operators/resize-bilinear-nhwc.c", @@ -71,7 +70,6 @@ SUBGRAPH_SRCS = [ "src/subgraph/log.c", "src/subgraph/max-pooling-2d.c", "src/subgraph/negate.c", - "src/subgraph/prelu.c", "src/subgraph/reciprocal-square-root.c", "src/subgraph/reshape-helpers.c", "src/subgraph/rope.c", @@ -120,7 +118,6 @@ 
XNNPACK_SRCS = [ "src/configs/lut32norm-config.c", "src/configs/maxpool-config.c", "src/configs/pavgpool-config.c", - "src/configs/prelu-config.c", "src/configs/raddstoreexpminusmax-config.c", "src/configs/reduce-config.c", "src/configs/rmax-config.c", diff --git a/cmake/gen/avx2_microkernels.cmake b/cmake/gen/avx2_microkernels.cmake index b4e04e06dca..7efba2d3b2a 100644 --- a/cmake/gen/avx2_microkernels.cmake +++ b/cmake/gen/avx2_microkernels.cmake @@ -74,9 +74,11 @@ SET(PROD_AVX2_MICROKERNEL_SRCS src/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-u16.c src/qu8-vcvt/gen/qu8-vcvt-avx2-u32.c src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c + src/s8-vclamp/s8-vclamp-avx2-u128.c src/s32-f32-vcvt/gen/s32-f32-vcvt-avx2.c src/s32-vmul/gen/s32-vmul-avx2.c src/s32-vmul/gen/s32-vmulc-avx2.c + src/u8-vclamp/u8-vclamp-avx2-u128.c src/u32-f32-vcvt/gen/u32-f32-vcvt-avx2.c src/x8-lut/gen/x8-lut-avx2-u128.c src/x8-transposec/gen/x8-transposec-32x32-reuse-switch-avx2.c diff --git a/cmake/gen/avx512f_microkernels.cmake b/cmake/gen/avx512f_microkernels.cmake index c871a1bce06..32879d7660c 100644 --- a/cmake/gen/avx512f_microkernels.cmake +++ b/cmake/gen/avx512f_microkernels.cmake @@ -19,7 +19,6 @@ SET(PROD_AVX512F_MICROKERNEL_SRCS src/f32-gemm/gen/f32-gemm-7x16-minmax-avx512f-broadcast.c src/f32-igemm/gen/f32-igemm-1x16-minmax-avx512f-broadcast.c src/f32-igemm/gen/f32-igemm-7x16-minmax-avx512f-broadcast.c - src/f32-prelu/gen/f32-prelu-avx512f-2x16.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc2.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c64.c src/f32-rminmax/gen/f32-rmax-avx512f-u64-acc4.c @@ -35,7 +34,10 @@ SET(PROD_AVX512F_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-avx512f-u32.c src/f32-vbinary/gen/f32-vmul-avx512f-u32.c src/f32-vbinary/gen/f32-vmulc-avx512f-u32.c + src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c + src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c src/f32-vbinary/gen/f32-vrdivc-avx512f-u32.c + src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c src/f32-vbinary/gen/f32-vrsubc-avx512f-u32.c src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c @@ -98,7 +100,6 @@ SET(NON_PROD_AVX512F_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-5x16-minmax-avx512f-broadcast.c src/f32-igemm/gen/f32-igemm-6x16-minmax-avx512f-broadcast.c src/f32-igemm/gen/f32-igemm-8x16-minmax-avx512f-broadcast.c - src/f32-prelu/gen/f32-prelu-avx512f-2x32.c src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64-acc2.c src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64-acc4.c src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64.c @@ -167,12 +168,9 @@ SET(NON_PROD_AVX512F_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vmul-avx512f-u16.c src/f32-vbinary/gen/f32-vmulc-avx512f-u16.c src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c - src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c - src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c src/f32-vbinary/gen/f32-vrdivc-avx512f-u16.c src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c - src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c src/f32-vbinary/gen/f32-vrsubc-avx512f-u16.c src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c diff --git a/cmake/gen/avx512fp16_microkernels.cmake b/cmake/gen/avx512fp16_microkernels.cmake index addfb1edaf1..7b4246ce191 100644 --- a/cmake/gen/avx512fp16_microkernels.cmake +++ b/cmake/gen/avx512fp16_microkernels.cmake @@ -26,7 +26,10 @@ SET(PROD_AVX512FP16_MICROKERNEL_SRCS 
src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vmul-avx512fp16-u64.c src/f16-vbinary/gen/f16-vmulc-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u64.c + src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c @@ -83,12 +86,9 @@ SET(NON_PROD_AVX512FP16_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c src/f16-vbinary/gen/f16-vmulc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c - src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u32.c src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c diff --git a/cmake/gen/avx512skx_microkernels.cmake b/cmake/gen/avx512skx_microkernels.cmake index f589f669149..ce5910e3a3c 100644 --- a/cmake/gen/avx512skx_microkernels.cmake +++ b/cmake/gen/avx512skx_microkernels.cmake @@ -51,6 +51,8 @@ SET(PROD_AVX512SKX_MICROKERNEL_SRCS src/qu8-igemm/gen/qu8-igemm-7x16c8-minmax-fp32-avx512skx-prfm.c src/qu8-vadd/gen/qu8-vadd-minmax-avx512skx-mul32-ld128-u16.c src/qu8-vaddc/gen/qu8-vaddc-minmax-avx512skx-mul32-ld128-u16.c + src/s8-vclamp/s8-vclamp-avx512skx-u256.c + src/u8-vclamp/u8-vclamp-avx512skx-u256.c src/x8-lut/gen/x8-lut-avx512skx-vpshufb-u64.c) SET(NON_PROD_AVX512SKX_MICROKERNEL_SRCS diff --git a/cmake/gen/avx_microkernels.cmake b/cmake/gen/avx_microkernels.cmake index eb28687e6c5..900094b180a 100644 --- a/cmake/gen/avx_microkernels.cmake +++ b/cmake/gen/avx_microkernels.cmake @@ -21,7 +21,6 @@ SET(PROD_AVX_MICROKERNEL_SRCS src/f32-gemm/gen/f32-gemm-5x16-minmax-avx-broadcast.c src/f32-igemm/gen/f32-igemm-1x16-minmax-avx-broadcast.c src/f32-igemm/gen/f32-igemm-5x16-minmax-avx-broadcast.c - src/f32-prelu/gen/f32-prelu-avx-2x16.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x16-minmax-avx-broadcast.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-3x16-minmax-avx-broadcast.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x16-minmax-avx-broadcast.c @@ -42,7 +41,10 @@ SET(PROD_AVX_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-avx-u16.c src/f32-vbinary/gen/f32-vmul-avx-u16.c src/f32-vbinary/gen/f32-vmulc-avx-u16.c + src/f32-vbinary/gen/f32-vprelu-avx-u16.c + src/f32-vbinary/gen/f32-vpreluc-avx-u16.c src/f32-vbinary/gen/f32-vrdivc-avx-u16.c + src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c src/f32-vbinary/gen/f32-vrsubc-avx-u16.c src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c @@ -165,7 +167,7 @@ SET(NON_PROD_AVX_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-6x8-minmax-avx-broadcast.c src/f32-igemm/gen/f32-igemm-6x16-minmax-avx-broadcast.c src/f32-igemm/gen/f32-igemm-7x8-minmax-avx-broadcast.c - src/f32-prelu/gen/f32-prelu-avx-2x8.c + src/f32-prelu/gen/f32-prelu-avx-2x16.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x16-minmax-avx-broadcast.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x16-minmax-avx-broadcast.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x16-minmax-avx-broadcast.c @@ -214,12 +216,9 @@ SET(NON_PROD_AVX_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vmul-avx-u8.c src/f32-vbinary/gen/f32-vmulc-avx-u8.c src/f32-vbinary/gen/f32-vprelu-avx-u8.c 
- src/f32-vbinary/gen/f32-vprelu-avx-u16.c src/f32-vbinary/gen/f32-vpreluc-avx-u8.c - src/f32-vbinary/gen/f32-vpreluc-avx-u16.c src/f32-vbinary/gen/f32-vrdivc-avx-u8.c src/f32-vbinary/gen/f32-vrpreluc-avx-u8.c - src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c src/f32-vbinary/gen/f32-vrsubc-avx-u8.c src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c src/f32-vbinary/gen/f32-vsqrdiffc-avx-u8.c diff --git a/cmake/gen/f16c_microkernels.cmake b/cmake/gen/f16c_microkernels.cmake index e478a8abc1d..1d48cc491de 100644 --- a/cmake/gen/f16c_microkernels.cmake +++ b/cmake/gen/f16c_microkernels.cmake @@ -18,7 +18,6 @@ SET(PROD_F16C_MICROKERNEL_SRCS src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c src/f16-maxpool/f16-maxpool-9p8x-minmax-f16c-c8.c - src/f16-prelu/gen/f16-prelu-f16c-2x16.c src/f16-rminmax/f16-rmax-f16c-u32.c src/f16-vbinary/gen/f16-vadd-f16c-u16.c src/f16-vbinary/gen/f16-vaddc-f16c-u16.c @@ -30,7 +29,10 @@ SET(PROD_F16C_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vminc-f16c-u16.c src/f16-vbinary/gen/f16-vmul-f16c-u16.c src/f16-vbinary/gen/f16-vmulc-f16c-u16.c + src/f16-vbinary/gen/f16-vprelu-f16c-u16.c + src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c src/f16-vbinary/gen/f16-vrdivc-f16c-u8.c + src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c src/f16-vbinary/gen/f16-vrsubc-f16c-u16.c src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c @@ -64,7 +66,6 @@ SET(NON_PROD_F16C_MICROKERNEL_SRCS src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c - src/f16-prelu/gen/f16-prelu-f16c-2x8.c src/f16-vbinary/gen/f16-vadd-f16c-u8.c src/f16-vbinary/gen/f16-vaddc-f16c-u8.c src/f16-vbinary/gen/f16-vdiv-f16c-u16.c @@ -76,12 +77,9 @@ SET(NON_PROD_F16C_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vmul-f16c-u8.c src/f16-vbinary/gen/f16-vmulc-f16c-u8.c src/f16-vbinary/gen/f16-vprelu-f16c-u8.c - src/f16-vbinary/gen/f16-vprelu-f16c-u16.c src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c - src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c src/f16-vbinary/gen/f16-vrdivc-f16c-u16.c src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c - src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c src/f16-vbinary/gen/f16-vrsubc-f16c-u8.c src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c diff --git a/cmake/gen/neon_microkernels.cmake b/cmake/gen/neon_microkernels.cmake index 775480a2655..ff1f9efe773 100644 --- a/cmake/gen/neon_microkernels.cmake +++ b/cmake/gen/neon_microkernels.cmake @@ -41,7 +41,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS src/f32-maxpool/f32-maxpool-9p8x-minmax-neon-c4.c src/f32-pavgpool/f32-pavgpool-9p8x-minmax-neon-c4.c src/f32-pavgpool/f32-pavgpool-9x-minmax-neon-c4.c - src/f32-prelu/gen/f32-prelu-neon-2x8.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-neon-lane-ld64.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-neon-lane-ld64.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-neon-lane-ld64.c @@ -61,6 +60,9 @@ SET(PROD_NEON_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-neon-u8.c src/f32-vbinary/gen/f32-vmul-neon-u8.c src/f32-vbinary/gen/f32-vmulc-neon-u8.c + src/f32-vbinary/gen/f32-vprelu-neon-u8.c + src/f32-vbinary/gen/f32-vpreluc-neon-u8.c + src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c src/f32-vbinary/gen/f32-vrsubc-neon-u8.c src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c @@ -335,14 +337,6 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/f32-ppmm/gen/f32-ppmm-4x16-minmax-neon.c 
src/f32-ppmm/gen/f32-ppmm-8x8-minmax-neon-prfm.c src/f32-ppmm/gen/f32-ppmm-8x8-minmax-neon.c - src/f32-prelu/gen/f32-prelu-neon-1x4.c - src/f32-prelu/gen/f32-prelu-neon-1x8.c - src/f32-prelu/gen/f32-prelu-neon-1x16.c - src/f32-prelu/gen/f32-prelu-neon-2x4.c - src/f32-prelu/gen/f32-prelu-neon-2x16.c - src/f32-prelu/gen/f32-prelu-neon-4x4.c - src/f32-prelu/gen/f32-prelu-neon-4x8.c - src/f32-prelu/gen/f32-prelu-neon-4x16.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-neon-dup-ld64.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-neon-dup-ld64.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x8-minmax-neon-lane-ld64.c @@ -409,11 +403,8 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vmul-neon-u4.c src/f32-vbinary/gen/f32-vmulc-neon-u4.c src/f32-vbinary/gen/f32-vprelu-neon-u4.c - src/f32-vbinary/gen/f32-vprelu-neon-u8.c src/f32-vbinary/gen/f32-vpreluc-neon-u4.c - src/f32-vbinary/gen/f32-vpreluc-neon-u8.c src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c - src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c src/f32-vbinary/gen/f32-vrsubc-neon-u4.c src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c @@ -721,7 +712,6 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c64.c src/qs8-requantization/qs8-requantization-fp32-neon.c src/qs8-requantization/qs8-requantization-gemmlowp-neon.c - src/qs8-requantization/qs8-requantization-rndna-neon.c src/qs8-requantization/qs8-requantization-rndnu-neon-mull.c src/qs8-requantization/qs8-requantization-rndnu-neon-qdmulh.c src/qs8-rsum/gen/qs8-rsum-neon-u16.c @@ -837,7 +827,6 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u64.c src/qu8-requantization/qu8-requantization-fp32-neon.c src/qu8-requantization/qu8-requantization-gemmlowp-neon.c - src/qu8-requantization/qu8-requantization-rndna-neon.c src/qu8-rsum/gen/qu8-rsum-neon-u16.c src/qu8-rsum/gen/qu8-rsum-neon-u64-acc2.c src/qu8-rsum/gen/qu8-rsum-neon-u64-acc4.c diff --git a/cmake/gen/neonfp16arith_microkernels.cmake b/cmake/gen/neonfp16arith_microkernels.cmake index 4cc50373bae..aa8ca503c40 100644 --- a/cmake/gen/neonfp16arith_microkernels.cmake +++ b/cmake/gen/neonfp16arith_microkernels.cmake @@ -40,7 +40,6 @@ SET(PROD_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-maxpool/f16-maxpool-9p8x-minmax-neonfp16arith-c8.c src/f16-pavgpool/f16-pavgpool-9p8x-minmax-neonfp16arith-c8.c src/f16-pavgpool/f16-pavgpool-9x-minmax-neonfp16arith-c8.c - src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32.c src/f16-rminmax/gen/f16-rmax-neonfp16arith-u32-acc4.c @@ -54,6 +53,9 @@ SET(PROD_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vmul-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c + src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c @@ -194,7 +196,6 @@ SET(NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-igemm/gen/f16-igemm-4x16-minmax-neonfp16arith-ld64.c src/f16-igemm/gen/f16-igemm-8x8-minmax-neonfp16arith-ld64.c src/f16-igemm/gen/f16-igemm-8x16-minmax-neonfp16arith-ld64.c - src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c 
src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c @@ -271,11 +272,8 @@ SET(NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS src/f16-vbinary/gen/f16-vmul-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c - src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index 75b8e94b6c6..5d13ebb9fcd 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -62,6 +62,8 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c + src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c + src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c src/x32-transposec/gen/x32-transposec-4x4-rvv.c src/x32-transposec/gen/x32-transposec-8x8-rvv.c @@ -190,6 +192,12 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c + src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c + src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c + src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c + src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c + src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c + src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u4.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u8.c diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index b4e3b340933..d7f3e8aafe0 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -70,7 +70,6 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/f32-maxpool/f32-maxpool-9p8x-minmax-scalar-c1.c src/f32-pavgpool/f32-pavgpool-9p8x-minmax-scalar-c1.c src/f32-pavgpool/f32-pavgpool-9x-minmax-scalar-c1.c - src/f32-prelu/gen/f32-prelu-scalar-2x4.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x4-minmax-scalar.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x4-minmax-scalar.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-minmax-scalar.c @@ -99,7 +98,10 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-scalar-u8.c src/f32-vbinary/gen/f32-vmul-scalar-u8.c src/f32-vbinary/gen/f32-vmulc-scalar-u8.c + src/f32-vbinary/gen/f32-vprelu-scalar-u8.c + src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c + src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c @@ -383,7 +385,6 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/f32-ppmm/gen/f32-ppmm-3x3-minmax-scalar.c src/f32-ppmm/gen/f32-ppmm-4x2-minmax-scalar.c src/f32-ppmm/gen/f32-ppmm-4x4-minmax-scalar.c - src/f32-prelu/gen/f32-prelu-scalar-2x1.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x4-minmax-scalar.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-scalar.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-relu-scalar.c @@ -484,18 +485,15 @@ 
SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vprelu-scalar-u1.c src/f32-vbinary/gen/f32-vprelu-scalar-u2.c src/f32-vbinary/gen/f32-vprelu-scalar-u4.c - src/f32-vbinary/gen/f32-vprelu-scalar-u8.c src/f32-vbinary/gen/f32-vpreluc-scalar-u1.c src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c - src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c src/f32-vbinary/gen/f32-vrdivc-scalar-u1.c src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c src/f32-vbinary/gen/f32-vrpreluc-scalar-u1.c src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c - src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c src/f32-vbinary/gen/f32-vrsubc-scalar-u1.c src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c @@ -724,9 +722,6 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/qs8-requantization/qs8-requantization-fp32-scalar-fmagic.c src/qs8-requantization/qs8-requantization-fp32-scalar-lrintf.c src/qs8-requantization/qs8-requantization-gemmlowp-scalar.c - src/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c - src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c - src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c src/qs8-requantization/qs8-requantization-rndnu-scalar.c src/qs8-rsum/gen/qs8-rsum-scalar-u1.c src/qs8-rsum/gen/qs8-rsum-scalar-u2.c @@ -863,9 +858,6 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/qu8-requantization/qu8-requantization-fp32-scalar-fmagic.c src/qu8-requantization/qu8-requantization-fp32-scalar-lrintf.c src/qu8-requantization/qu8-requantization-gemmlowp-scalar.c - src/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c - src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c - src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c src/qu8-rsum/gen/qu8-rsum-scalar-u1.c src/qu8-rsum/gen/qu8-rsum-scalar-u2.c src/qu8-vadd/gen/qu8-vadd-minmax-scalar-u2.c diff --git a/cmake/gen/sse2_microkernels.cmake b/cmake/gen/sse2_microkernels.cmake index 0868611bc1f..64623e83e6a 100644 --- a/cmake/gen/sse2_microkernels.cmake +++ b/cmake/gen/sse2_microkernels.cmake @@ -17,10 +17,12 @@ SET(PROD_SSE2_MICROKERNEL_SRCS src/f32-argmaxpool/f32-argmaxpool-9p8x-sse2-c4.c src/f32-argmaxpool/f32-argmaxpool-9x-sse2-c4.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u16.c - src/f32-prelu/gen/f32-prelu-sse2-2x8.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c + src/f32-vbinary/gen/f32-vprelu-sse2-u8.c + src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c + src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c src/f32-vcopysign/gen/f32-vcopysign-sse2.c src/f32-vcopysign/gen/f32-vcopysignc-sse2.c src/f32-vcopysign/gen/f32-vrcopysignc-sse2.c @@ -118,7 +120,6 @@ SET(NON_PROD_SSE2_MICROKERNEL_SRCS src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u8.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u24.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u32.c - src/f32-prelu/gen/f32-prelu-sse2-2x4.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c @@ -129,11 +130,8 @@ SET(NON_PROD_SSE2_MICROKERNEL_SRCS src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c src/f32-vbinary/gen/f32-vprelu-sse2-u4.c - src/f32-vbinary/gen/f32-vprelu-sse2-u8.c 
src/f32-vbinary/gen/f32-vpreluc-sse2-u4.c - src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c src/f32-vbinary/gen/f32-vrpreluc-sse2-u4.c - src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u4.c src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u8.c src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u16.c @@ -272,7 +270,6 @@ SET(NON_PROD_SSE2_MICROKERNEL_SRCS src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c src/qs8-requantization/qs8-requantization-fp32-sse2.c src/qs8-requantization/qs8-requantization-gemmlowp-sse2.c - src/qs8-requantization/qs8-requantization-rndna-sse2.c src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u16.c src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u24.c src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u32.c @@ -344,7 +341,6 @@ SET(NON_PROD_SSE2_MICROKERNEL_SRCS src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c src/qu8-requantization/qu8-requantization-fp32-sse2.c src/qu8-requantization/qu8-requantization-gemmlowp-sse2.c - src/qu8-requantization/qu8-requantization-rndna-sse2.c src/qu8-rsum/gen/qu8-rsum-sse2-u16.c src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc2.c src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc4.c diff --git a/cmake/gen/sse41_microkernels.cmake b/cmake/gen/sse41_microkernels.cmake index 0ea18c62396..6bf14b87fbb 100644 --- a/cmake/gen/sse41_microkernels.cmake +++ b/cmake/gen/sse41_microkernels.cmake @@ -12,7 +12,6 @@ SET(PROD_SSE41_MICROKERNEL_SRCS src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u8.c - src/f32-prelu/gen/f32-prelu-sse41-2x8.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-sse41-dup.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-sse41-dup.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-sse41-dup.c @@ -83,7 +82,6 @@ SET(NON_PROD_SSE41_MICROKERNEL_SRCS src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u16.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u24.c src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u32.c - src/f32-prelu/gen/f32-prelu-sse41-2x4.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-3x8-minmax-sse41-dup.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x8-minmax-sse41-dup.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-sse41-dup.c @@ -280,7 +278,6 @@ SET(NON_PROD_SSE41_MICROKERNEL_SRCS src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c32.c src/qs8-requantization/qs8-requantization-fp32-sse41.c src/qs8-requantization/qs8-requantization-gemmlowp-sse41.c - src/qs8-requantization/qs8-requantization-rndna-sse41.c src/qs8-requantization/qs8-requantization-rndnu-sse41-sra.c src/qs8-requantization/qs8-requantization-rndnu-sse41-srl.c src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-u16.c @@ -374,7 +371,6 @@ SET(NON_PROD_SSE41_MICROKERNEL_SRCS src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld64.c src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld128.c src/qu8-requantization/qu8-requantization-gemmlowp-sse41.c - src/qu8-requantization/qu8-requantization-rndna-sse41.c src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul16-ld64-u16.c src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-u8.c src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-u16.c diff --git a/cmake/gen/sse_microkernels.cmake b/cmake/gen/sse_microkernels.cmake index befa0bb3246..a682a18f490 100644 --- a/cmake/gen/sse_microkernels.cmake +++ b/cmake/gen/sse_microkernels.cmake @@ -180,8 +180,6 @@ SET(NON_PROD_SSE_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-6x8-minmax-sse-load1.c src/f32-igemm/gen/f32-igemm-6x8s4-minmax-sse.c src/f32-ppmm/gen/f32-ppmm-4x8-minmax-sse.c - src/f32-prelu/gen/f32-prelu-sse-2x4.c - 
src/f32-prelu/gen/f32-prelu-sse-2x8.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c32.c src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c64.c src/f32-rminmax/gen/f32-rmax-sse-u4.c diff --git a/cmake/gen/ssse3_microkernels.cmake b/cmake/gen/ssse3_microkernels.cmake index 7cb398b68eb..6309ebf281e 100644 --- a/cmake/gen/ssse3_microkernels.cmake +++ b/cmake/gen/ssse3_microkernels.cmake @@ -39,7 +39,6 @@ SET(NON_PROD_SSSE3_MICROKERNEL_SRCS src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd.c src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c src/qs8-requantization/qs8-requantization-gemmlowp-ssse3.c - src/qs8-requantization/qs8-requantization-rndna-ssse3.c src/qs8-rsum/gen/qs8-rsum-ssse3-u16.c src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc2.c src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc4.c @@ -52,7 +51,6 @@ SET(NON_PROD_SSSE3_MICROKERNEL_SRCS src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c src/qu8-requantization/qu8-requantization-gemmlowp-ssse3.c - src/qu8-requantization/qu8-requantization-rndna-ssse3.c src/qu8-vcvt/gen/qu8-vcvt-ssse3-u16.c src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c diff --git a/cmake/gen/wasm_microkernels.cmake b/cmake/gen/wasm_microkernels.cmake index de89cd83440..f60129cee2c 100644 --- a/cmake/gen/wasm_microkernels.cmake +++ b/cmake/gen/wasm_microkernels.cmake @@ -32,7 +32,6 @@ SET(PROD_WASM_MICROKERNEL_SRCS src/f32-maxpool/f32-maxpool-9p8x-minmax-wasm-c1.c src/f32-pavgpool/f32-pavgpool-9p8x-minmax-wasm-c1.c src/f32-pavgpool/f32-pavgpool-9x-minmax-wasm-c1.c - src/f32-prelu/gen/f32-prelu-wasm-2x4.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x4-minmax-wasm.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x4-minmax-wasm.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-minmax-wasm.c @@ -50,7 +49,10 @@ SET(PROD_WASM_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-wasm-u8.c src/f32-vbinary/gen/f32-vmul-wasm-u8.c src/f32-vbinary/gen/f32-vmulc-wasm-u8.c + src/f32-vbinary/gen/f32-vprelu-wasm-u8.c + src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c src/f32-vbinary/gen/f32-vrdivc-wasm-u8.c + src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c src/f32-vbinary/gen/f32-vrsubc-wasm-u8.c src/f32-vbinary/gen/f32-vsub-wasm-u8.c src/f32-vbinary/gen/f32-vsubc-wasm-u8.c @@ -110,7 +112,6 @@ SET(NON_PROD_WASM_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-2x4-minmax-wasm.c src/f32-igemm/gen/f32-igemm-2x4-relu-wasm.c src/f32-igemm/gen/f32-igemm-4x2-relu-wasm.c - src/f32-prelu/gen/f32-prelu-wasm-2x1.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x4-minmax-wasm.c src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-wasm.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-relu-wasm.c @@ -172,18 +173,15 @@ SET(NON_PROD_WASM_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vprelu-wasm-u1.c src/f32-vbinary/gen/f32-vprelu-wasm-u2.c src/f32-vbinary/gen/f32-vprelu-wasm-u4.c - src/f32-vbinary/gen/f32-vprelu-wasm-u8.c src/f32-vbinary/gen/f32-vpreluc-wasm-u1.c src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c - src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c src/f32-vbinary/gen/f32-vrdivc-wasm-u1.c src/f32-vbinary/gen/f32-vrdivc-wasm-u2.c src/f32-vbinary/gen/f32-vrdivc-wasm-u4.c src/f32-vbinary/gen/f32-vrpreluc-wasm-u1.c src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c - src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c src/f32-vbinary/gen/f32-vrsubc-wasm-u1.c src/f32-vbinary/gen/f32-vrsubc-wasm-u2.c src/f32-vbinary/gen/f32-vrsubc-wasm-u4.c diff --git a/cmake/gen/wasmrelaxedsimd_microkernels.cmake 
b/cmake/gen/wasmrelaxedsimd_microkernels.cmake index 09596cbf072..509ffa887b5 100644 --- a/cmake/gen/wasmrelaxedsimd_microkernels.cmake +++ b/cmake/gen/wasmrelaxedsimd_microkernels.cmake @@ -51,8 +51,6 @@ SET(PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmrelaxedsimd-fma-splat.c src/f32-igemm/gen/f32-igemm-6x8-relu-wasmrelaxedsimd-fma-splat.c src/f32-igemm/gen/f32-igemm-6x8-wasmrelaxedsimd-fma-splat.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-fma-loadsplat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-fma-splat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmrelaxedsimd-fma-loadsplat.c @@ -284,22 +282,6 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-6x8s4-minmax-wasmrelaxedsimd.c src/f32-igemm/gen/f32-igemm-6x8s4-relu-wasmrelaxedsimd-fma.c src/f32-igemm/gen/f32-igemm-6x8s4-wasmrelaxedsimd-fma.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c - src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-loadsplat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-splat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8s4-minmax-wasmrelaxedsimd-fma.c @@ -513,6 +495,7 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c8-minmax-wasmusdot.c src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot-u2.c src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot.c + src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmusdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot.c @@ -601,6 +584,7 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot.c + src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u16.c src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u32-acc2.c src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u64-acc2.c diff --git a/cmake/gen/wasmsimd_microkernels.cmake b/cmake/gen/wasmsimd_microkernels.cmake index 103c2e67711..ba54c415f58 100644 --- a/cmake/gen/wasmsimd_microkernels.cmake +++ b/cmake/gen/wasmsimd_microkernels.cmake @@ -100,8 +100,6 @@ 
SET(PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-pavgpool/f32-pavgpool-9p8x-minmax-wasmsimd-x86-c4.c src/f32-pavgpool/f32-pavgpool-9x-minmax-wasmsimd-arm-c4.c src/f32-pavgpool/f32-pavgpool-9x-minmax-wasmsimd-x86-c4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-arm-splat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-x86-splat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmsimd-splat.c @@ -135,7 +133,10 @@ SET(PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c + src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c + src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c + src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c @@ -568,22 +569,6 @@ SET(NON_PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-6x8s4-wasmsimd.c src/f32-ppmm/gen/f32-ppmm-4x8-minmax-wasmsimd-arm-splat.c src/f32-ppmm/gen/f32-ppmm-4x8-minmax-wasmsimd-x86-splat.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c - src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c - src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-arm-loadsplat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-x86-loadsplat.c src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmsimd-loadsplat.c @@ -757,15 +742,12 @@ SET(NON_PROD_WASMSIMD_MICROKERNEL_SRCS src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c src/f32-vbinary/gen/f32-vprelu-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c - src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c diff --git a/gen/avx2_microkernels.bzl b/gen/avx2_microkernels.bzl index fc68f74797d..f0bb53976ff 100644 --- a/gen/avx2_microkernels.bzl +++ b/gen/avx2_microkernels.bzl @@ -70,9 +70,11 @@ PROD_AVX2_MICROKERNEL_SRCS = [ "src/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-u16.c", "src/qu8-vcvt/gen/qu8-vcvt-avx2-u32.c", "src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c", + "src/s8-vclamp/s8-vclamp-avx2-u128.c", 
"src/s32-f32-vcvt/gen/s32-f32-vcvt-avx2.c", "src/s32-vmul/gen/s32-vmul-avx2.c", "src/s32-vmul/gen/s32-vmulc-avx2.c", + "src/u8-vclamp/u8-vclamp-avx2-u128.c", "src/u32-f32-vcvt/gen/u32-f32-vcvt-avx2.c", "src/x8-lut/gen/x8-lut-avx2-u128.c", "src/x8-transposec/gen/x8-transposec-32x32-reuse-switch-avx2.c", diff --git a/gen/avx512f_microkernels.bzl b/gen/avx512f_microkernels.bzl index 38bb3ba21c5..f12f928fb23 100644 --- a/gen/avx512f_microkernels.bzl +++ b/gen/avx512f_microkernels.bzl @@ -15,7 +15,6 @@ PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-gemm/gen/f32-gemm-7x16-minmax-avx512f-broadcast.c", "src/f32-igemm/gen/f32-igemm-1x16-minmax-avx512f-broadcast.c", "src/f32-igemm/gen/f32-igemm-7x16-minmax-avx512f-broadcast.c", - "src/f32-prelu/gen/f32-prelu-avx512f-2x16.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr2-p5-u64-acc2.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c64.c", "src/f32-rminmax/gen/f32-rmax-avx512f-u64-acc4.c", @@ -31,7 +30,10 @@ PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vmul-avx512f-u32.c", "src/f32-vbinary/gen/f32-vmulc-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vrdivc-avx512f-u32.c", + "src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vrsubc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u32.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u32.c", @@ -95,7 +97,6 @@ NON_PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-5x16-minmax-avx512f-broadcast.c", "src/f32-igemm/gen/f32-igemm-6x16-minmax-avx512f-broadcast.c", "src/f32-igemm/gen/f32-igemm-8x16-minmax-avx512f-broadcast.c", - "src/f32-prelu/gen/f32-prelu-avx512f-2x32.c", "src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64-acc2.c", "src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64-acc4.c", "src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-u64.c", @@ -164,12 +165,9 @@ NON_PROD_AVX512F_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vmul-avx512f-u16.c", "src/f32-vbinary/gen/f32-vmulc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vprelu-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vprelu-avx512f-u32.c", "src/f32-vbinary/gen/f32-vpreluc-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vpreluc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vrdivc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vrpreluc-avx512f-u16.c", - "src/f32-vbinary/gen/f32-vrpreluc-avx512f-u32.c", "src/f32-vbinary/gen/f32-vrsubc-avx512f-u16.c", "src/f32-vbinary/gen/f32-vsqrdiff-avx512f-u16.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-u16.c", diff --git a/gen/avx512fp16_microkernels.bzl b/gen/avx512fp16_microkernels.bzl index aeb7b028a3f..f1c8d65af13 100644 --- a/gen/avx512fp16_microkernels.bzl +++ b/gen/avx512fp16_microkernels.bzl @@ -22,7 +22,10 @@ PROD_AVX512FP16_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vminc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vmul-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vmulc-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u64.c", + "src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u64.c", @@ -80,12 +83,9 @@ NON_PROD_AVX512FP16_MICROKERNEL_SRCS = [ 
"src/f16-vbinary/gen/f16-vmul-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vmulc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vprelu-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vprelu-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vpreluc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vrdivc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u32.c", - "src/f16-vbinary/gen/f16-vrpreluc-avx512fp16-u64.c", "src/f16-vbinary/gen/f16-vrsubc-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vsqrdiff-avx512fp16-u32.c", "src/f16-vbinary/gen/f16-vsqrdiffc-avx512fp16-u32.c", diff --git a/gen/avx512skx_microkernels.bzl b/gen/avx512skx_microkernels.bzl index fe29a4c806c..b1a653ae98a 100644 --- a/gen/avx512skx_microkernels.bzl +++ b/gen/avx512skx_microkernels.bzl @@ -47,6 +47,8 @@ PROD_AVX512SKX_MICROKERNEL_SRCS = [ "src/qu8-igemm/gen/qu8-igemm-7x16c8-minmax-fp32-avx512skx-prfm.c", "src/qu8-vadd/gen/qu8-vadd-minmax-avx512skx-mul32-ld128-u16.c", "src/qu8-vaddc/gen/qu8-vaddc-minmax-avx512skx-mul32-ld128-u16.c", + "src/s8-vclamp/s8-vclamp-avx512skx-u256.c", + "src/u8-vclamp/u8-vclamp-avx512skx-u256.c", "src/x8-lut/gen/x8-lut-avx512skx-vpshufb-u64.c", ] diff --git a/gen/avx_microkernels.bzl b/gen/avx_microkernels.bzl index 2794ad08fa7..04d649297cf 100644 --- a/gen/avx_microkernels.bzl +++ b/gen/avx_microkernels.bzl @@ -17,7 +17,6 @@ PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-gemm/gen/f32-gemm-5x16-minmax-avx-broadcast.c", "src/f32-igemm/gen/f32-igemm-1x16-minmax-avx-broadcast.c", "src/f32-igemm/gen/f32-igemm-5x16-minmax-avx-broadcast.c", - "src/f32-prelu/gen/f32-prelu-avx-2x16.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x16-minmax-avx-broadcast.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-3x16-minmax-avx-broadcast.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x16-minmax-avx-broadcast.c", @@ -38,7 +37,10 @@ PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-avx-u16.c", "src/f32-vbinary/gen/f32-vmul-avx-u16.c", "src/f32-vbinary/gen/f32-vmulc-avx-u16.c", + "src/f32-vbinary/gen/f32-vprelu-avx-u16.c", + "src/f32-vbinary/gen/f32-vpreluc-avx-u16.c", "src/f32-vbinary/gen/f32-vrdivc-avx-u16.c", + "src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c", "src/f32-vbinary/gen/f32-vrsubc-avx-u16.c", "src/f32-vbinary/gen/f32-vsqrdiff-avx-u16.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx-u16.c", @@ -162,7 +164,7 @@ NON_PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-6x8-minmax-avx-broadcast.c", "src/f32-igemm/gen/f32-igemm-6x16-minmax-avx-broadcast.c", "src/f32-igemm/gen/f32-igemm-7x8-minmax-avx-broadcast.c", - "src/f32-prelu/gen/f32-prelu-avx-2x8.c", + "src/f32-prelu/gen/f32-prelu-avx-2x16.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x16-minmax-avx-broadcast.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x16-minmax-avx-broadcast.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x16-minmax-avx-broadcast.c", @@ -211,12 +213,9 @@ NON_PROD_AVX_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vmul-avx-u8.c", "src/f32-vbinary/gen/f32-vmulc-avx-u8.c", "src/f32-vbinary/gen/f32-vprelu-avx-u8.c", - "src/f32-vbinary/gen/f32-vprelu-avx-u16.c", "src/f32-vbinary/gen/f32-vpreluc-avx-u8.c", - "src/f32-vbinary/gen/f32-vpreluc-avx-u16.c", "src/f32-vbinary/gen/f32-vrdivc-avx-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-avx-u8.c", - "src/f32-vbinary/gen/f32-vrpreluc-avx-u16.c", "src/f32-vbinary/gen/f32-vrsubc-avx-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-avx-u8.c", "src/f32-vbinary/gen/f32-vsqrdiffc-avx-u8.c", diff --git a/gen/f16c_microkernels.bzl b/gen/f16c_microkernels.bzl index 383bdde3cd5..5e2a145eb6c 100644 --- 
a/gen/f16c_microkernels.bzl +++ b/gen/f16c_microkernels.bzl @@ -14,7 +14,6 @@ PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c", "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c", "src/f16-maxpool/f16-maxpool-9p8x-minmax-f16c-c8.c", - "src/f16-prelu/gen/f16-prelu-f16c-2x16.c", "src/f16-rminmax/f16-rmax-f16c-u32.c", "src/f16-vbinary/gen/f16-vadd-f16c-u16.c", "src/f16-vbinary/gen/f16-vaddc-f16c-u16.c", @@ -26,7 +25,10 @@ PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vminc-f16c-u16.c", "src/f16-vbinary/gen/f16-vmul-f16c-u16.c", "src/f16-vbinary/gen/f16-vmulc-f16c-u16.c", + "src/f16-vbinary/gen/f16-vprelu-f16c-u16.c", + "src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c", "src/f16-vbinary/gen/f16-vrdivc-f16c-u8.c", + "src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c", "src/f16-vbinary/gen/f16-vrsubc-f16c-u16.c", "src/f16-vbinary/gen/f16-vsqrdiff-f16c-u16.c", "src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u16.c", @@ -61,7 +63,6 @@ NON_PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c", "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c", "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c", - "src/f16-prelu/gen/f16-prelu-f16c-2x8.c", "src/f16-vbinary/gen/f16-vadd-f16c-u8.c", "src/f16-vbinary/gen/f16-vaddc-f16c-u8.c", "src/f16-vbinary/gen/f16-vdiv-f16c-u16.c", @@ -73,12 +74,9 @@ NON_PROD_F16C_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vmul-f16c-u8.c", "src/f16-vbinary/gen/f16-vmulc-f16c-u8.c", "src/f16-vbinary/gen/f16-vprelu-f16c-u8.c", - "src/f16-vbinary/gen/f16-vprelu-f16c-u16.c", "src/f16-vbinary/gen/f16-vpreluc-f16c-u8.c", - "src/f16-vbinary/gen/f16-vpreluc-f16c-u16.c", "src/f16-vbinary/gen/f16-vrdivc-f16c-u16.c", "src/f16-vbinary/gen/f16-vrpreluc-f16c-u8.c", - "src/f16-vbinary/gen/f16-vrpreluc-f16c-u16.c", "src/f16-vbinary/gen/f16-vrsubc-f16c-u8.c", "src/f16-vbinary/gen/f16-vsqrdiff-f16c-u8.c", "src/f16-vbinary/gen/f16-vsqrdiffc-f16c-u8.c", diff --git a/gen/neon_microkernels.bzl b/gen/neon_microkernels.bzl index ef111a9a5f7..66370894399 100644 --- a/gen/neon_microkernels.bzl +++ b/gen/neon_microkernels.bzl @@ -37,7 +37,6 @@ PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-maxpool/f32-maxpool-9p8x-minmax-neon-c4.c", "src/f32-pavgpool/f32-pavgpool-9p8x-minmax-neon-c4.c", "src/f32-pavgpool/f32-pavgpool-9x-minmax-neon-c4.c", - "src/f32-prelu/gen/f32-prelu-neon-2x8.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-neon-lane-ld64.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-neon-lane-ld64.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-neon-lane-ld64.c", @@ -57,6 +56,9 @@ PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-neon-u8.c", "src/f32-vbinary/gen/f32-vmul-neon-u8.c", "src/f32-vbinary/gen/f32-vmulc-neon-u8.c", + "src/f32-vbinary/gen/f32-vprelu-neon-u8.c", + "src/f32-vbinary/gen/f32-vpreluc-neon-u8.c", + "src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c", "src/f32-vbinary/gen/f32-vrsubc-neon-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-neon-u8.c", "src/f32-vbinary/gen/f32-vsqrdiffc-neon-u8.c", @@ -332,14 +334,6 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-ppmm/gen/f32-ppmm-4x16-minmax-neon.c", "src/f32-ppmm/gen/f32-ppmm-8x8-minmax-neon-prfm.c", "src/f32-ppmm/gen/f32-ppmm-8x8-minmax-neon.c", - "src/f32-prelu/gen/f32-prelu-neon-1x4.c", - "src/f32-prelu/gen/f32-prelu-neon-1x8.c", - "src/f32-prelu/gen/f32-prelu-neon-1x16.c", - "src/f32-prelu/gen/f32-prelu-neon-2x4.c", - "src/f32-prelu/gen/f32-prelu-neon-2x16.c", - "src/f32-prelu/gen/f32-prelu-neon-4x4.c", - "src/f32-prelu/gen/f32-prelu-neon-4x8.c", - 
"src/f32-prelu/gen/f32-prelu-neon-4x16.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-neon-dup-ld64.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-neon-dup-ld64.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x8-minmax-neon-lane-ld64.c", @@ -406,11 +400,8 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vmul-neon-u4.c", "src/f32-vbinary/gen/f32-vmulc-neon-u4.c", "src/f32-vbinary/gen/f32-vprelu-neon-u4.c", - "src/f32-vbinary/gen/f32-vprelu-neon-u8.c", "src/f32-vbinary/gen/f32-vpreluc-neon-u4.c", - "src/f32-vbinary/gen/f32-vpreluc-neon-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-neon-u4.c", - "src/f32-vbinary/gen/f32-vrpreluc-neon-u8.c", "src/f32-vbinary/gen/f32-vrsubc-neon-u4.c", "src/f32-vbinary/gen/f32-vsqrdiff-neon-u4.c", "src/f32-vbinary/gen/f32-vsqrdiffc-neon-u4.c", @@ -718,7 +709,6 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-neon-c64.c", "src/qs8-requantization/qs8-requantization-fp32-neon.c", "src/qs8-requantization/qs8-requantization-gemmlowp-neon.c", - "src/qs8-requantization/qs8-requantization-rndna-neon.c", "src/qs8-requantization/qs8-requantization-rndnu-neon-mull.c", "src/qs8-requantization/qs8-requantization-rndnu-neon-qdmulh.c", "src/qs8-rsum/gen/qs8-rsum-neon-u16.c", @@ -834,7 +824,6 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/qu8-rdsum/gen/qu8-rdsum-7p7x-neon-u64.c", "src/qu8-requantization/qu8-requantization-fp32-neon.c", "src/qu8-requantization/qu8-requantization-gemmlowp-neon.c", - "src/qu8-requantization/qu8-requantization-rndna-neon.c", "src/qu8-rsum/gen/qu8-rsum-neon-u16.c", "src/qu8-rsum/gen/qu8-rsum-neon-u64-acc2.c", "src/qu8-rsum/gen/qu8-rsum-neon-u64-acc4.c", diff --git a/gen/neonfp16arith_microkernels.bzl b/gen/neonfp16arith_microkernels.bzl index 7e0c07c3ca3..018ca23cdfd 100644 --- a/gen/neonfp16arith_microkernels.bzl +++ b/gen/neonfp16arith_microkernels.bzl @@ -36,7 +36,6 @@ PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-maxpool/f16-maxpool-9p8x-minmax-neonfp16arith-c8.c", "src/f16-pavgpool/f16-pavgpool-9p8x-minmax-neonfp16arith-c8.c", "src/f16-pavgpool/f16-pavgpool-9x-minmax-neonfp16arith-c8.c", - "src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c", "src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c", "src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-u32.c", "src/f16-rminmax/gen/f16-rmax-neonfp16arith-u32-acc4.c", @@ -50,6 +49,9 @@ PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vminc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vmul-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c", + "src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u16.c", @@ -191,7 +193,6 @@ NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-igemm/gen/f16-igemm-4x16-minmax-neonfp16arith-ld64.c", "src/f16-igemm/gen/f16-igemm-8x8-minmax-neonfp16arith-ld64.c", "src/f16-igemm/gen/f16-igemm-8x16-minmax-neonfp16arith-ld64.c", - "src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c", "src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c", "src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c", "src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c", @@ -268,11 +269,8 @@ NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-vbinary/gen/f16-vmul-neonfp16arith-u8.c", 
"src/f16-vbinary/gen/f16-vmulc-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vprelu-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vpreluc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u8.c", - "src/f16-vbinary/gen/f16-vrpreluc-neonfp16arith-u16.c", "src/f16-vbinary/gen/f16-vrsubc-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-u8.c", "src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-u8.c", diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index 949c9f387eb..8790b58a8e4 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -58,6 +58,8 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c", + "src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c", + "src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c", "src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c", "src/x32-transposec/gen/x32-transposec-4x4-rvv.c", "src/x32-transposec/gen/x32-transposec-8x8-rvv.c", @@ -187,6 +189,12 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c", + "src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c", + "src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c", + "src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c", + "src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c", + "src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c", + "src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u4.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u8.c", diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index a7c2bba6c7c..b89b83c3211 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -66,7 +66,6 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-maxpool/f32-maxpool-9p8x-minmax-scalar-c1.c", "src/f32-pavgpool/f32-pavgpool-9p8x-minmax-scalar-c1.c", "src/f32-pavgpool/f32-pavgpool-9x-minmax-scalar-c1.c", - "src/f32-prelu/gen/f32-prelu-scalar-2x4.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x4-minmax-scalar.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x4-minmax-scalar.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-minmax-scalar.c", @@ -95,7 +94,10 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-scalar-u8.c", "src/f32-vbinary/gen/f32-vmul-scalar-u8.c", "src/f32-vbinary/gen/f32-vmulc-scalar-u8.c", + "src/f32-vbinary/gen/f32-vprelu-scalar-u8.c", + "src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c", "src/f32-vbinary/gen/f32-vrdivc-scalar-u2.c", + "src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c", "src/f32-vbinary/gen/f32-vrsubc-scalar-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-scalar-u8.c", "src/f32-vbinary/gen/f32-vsqrdiffc-scalar-u8.c", @@ -380,7 +382,6 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-ppmm/gen/f32-ppmm-3x3-minmax-scalar.c", "src/f32-ppmm/gen/f32-ppmm-4x2-minmax-scalar.c", "src/f32-ppmm/gen/f32-ppmm-4x4-minmax-scalar.c", - "src/f32-prelu/gen/f32-prelu-scalar-2x1.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x4-minmax-scalar.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-scalar.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-relu-scalar.c", @@ -481,18 +482,15 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vprelu-scalar-u1.c", "src/f32-vbinary/gen/f32-vprelu-scalar-u2.c", "src/f32-vbinary/gen/f32-vprelu-scalar-u4.c", - 
"src/f32-vbinary/gen/f32-vprelu-scalar-u8.c", "src/f32-vbinary/gen/f32-vpreluc-scalar-u1.c", "src/f32-vbinary/gen/f32-vpreluc-scalar-u2.c", "src/f32-vbinary/gen/f32-vpreluc-scalar-u4.c", - "src/f32-vbinary/gen/f32-vpreluc-scalar-u8.c", "src/f32-vbinary/gen/f32-vrdivc-scalar-u1.c", "src/f32-vbinary/gen/f32-vrdivc-scalar-u4.c", "src/f32-vbinary/gen/f32-vrdivc-scalar-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-scalar-u1.c", "src/f32-vbinary/gen/f32-vrpreluc-scalar-u2.c", "src/f32-vbinary/gen/f32-vrpreluc-scalar-u4.c", - "src/f32-vbinary/gen/f32-vrpreluc-scalar-u8.c", "src/f32-vbinary/gen/f32-vrsubc-scalar-u1.c", "src/f32-vbinary/gen/f32-vrsubc-scalar-u2.c", "src/f32-vbinary/gen/f32-vrsubc-scalar-u4.c", @@ -721,9 +719,6 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qs8-requantization/qs8-requantization-fp32-scalar-fmagic.c", "src/qs8-requantization/qs8-requantization-fp32-scalar-lrintf.c", "src/qs8-requantization/qs8-requantization-gemmlowp-scalar.c", - "src/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c", - "src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c", - "src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c", "src/qs8-requantization/qs8-requantization-rndnu-scalar.c", "src/qs8-rsum/gen/qs8-rsum-scalar-u1.c", "src/qs8-rsum/gen/qs8-rsum-scalar-u2.c", @@ -860,9 +855,6 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qu8-requantization/qu8-requantization-fp32-scalar-fmagic.c", "src/qu8-requantization/qu8-requantization-fp32-scalar-lrintf.c", "src/qu8-requantization/qu8-requantization-gemmlowp-scalar.c", - "src/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c", - "src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c", - "src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c", "src/qu8-rsum/gen/qu8-rsum-scalar-u1.c", "src/qu8-rsum/gen/qu8-rsum-scalar-u2.c", "src/qu8-vadd/gen/qu8-vadd-minmax-scalar-u2.c", diff --git a/gen/sse2_microkernels.bzl b/gen/sse2_microkernels.bzl index d53103095ce..86cbc15edeb 100644 --- a/gen/sse2_microkernels.bzl +++ b/gen/sse2_microkernels.bzl @@ -13,10 +13,12 @@ PROD_SSE2_MICROKERNEL_SRCS = [ "src/f32-argmaxpool/f32-argmaxpool-9p8x-sse2-c4.c", "src/f32-argmaxpool/f32-argmaxpool-9x-sse2-c4.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u16.c", - "src/f32-prelu/gen/f32-prelu-sse2-2x8.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c", "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc2.c", + "src/f32-vbinary/gen/f32-vprelu-sse2-u8.c", + "src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c", + "src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c", "src/f32-vcopysign/gen/f32-vcopysign-sse2.c", "src/f32-vcopysign/gen/f32-vcopysignc-sse2.c", "src/f32-vcopysign/gen/f32-vrcopysignc-sse2.c", @@ -115,7 +117,6 @@ NON_PROD_SSE2_MICROKERNEL_SRCS = [ "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u8.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u24.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-u32.c", - "src/f32-prelu/gen/f32-prelu-sse2-2x4.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c", @@ -126,11 +127,8 @@ NON_PROD_SSE2_MICROKERNEL_SRCS = [ "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u8-acc2.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-u16-acc4.c", "src/f32-vbinary/gen/f32-vprelu-sse2-u4.c", - "src/f32-vbinary/gen/f32-vprelu-sse2-u8.c", "src/f32-vbinary/gen/f32-vpreluc-sse2-u4.c", - 
"src/f32-vbinary/gen/f32-vpreluc-sse2-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-sse2-u4.c", - "src/f32-vbinary/gen/f32-vrpreluc-sse2-u8.c", "src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u4.c", "src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u8.c", "src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-u16.c", @@ -269,7 +267,6 @@ NON_PROD_SSE2_MICROKERNEL_SRCS = [ "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c", "src/qs8-requantization/qs8-requantization-fp32-sse2.c", "src/qs8-requantization/qs8-requantization-gemmlowp-sse2.c", - "src/qs8-requantization/qs8-requantization-rndna-sse2.c", "src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u16.c", "src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u24.c", "src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-u32.c", @@ -341,7 +338,6 @@ NON_PROD_SSE2_MICROKERNEL_SRCS = [ "src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c", "src/qu8-requantization/qu8-requantization-fp32-sse2.c", "src/qu8-requantization/qu8-requantization-gemmlowp-sse2.c", - "src/qu8-requantization/qu8-requantization-rndna-sse2.c", "src/qu8-rsum/gen/qu8-rsum-sse2-u16.c", "src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc2.c", "src/qu8-rsum/gen/qu8-rsum-sse2-u64-acc4.c", diff --git a/gen/sse41_microkernels.bzl b/gen/sse41_microkernels.bzl index 50a2cbe1c7d..8ee156de2e2 100644 --- a/gen/sse41_microkernels.bzl +++ b/gen/sse41_microkernels.bzl @@ -8,7 +8,6 @@ Auto-generated file. Do not edit! PROD_SSE41_MICROKERNEL_SRCS = [ "src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u8.c", - "src/f32-prelu/gen/f32-prelu-sse41-2x8.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x8-minmax-sse41-dup.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x8-minmax-sse41-dup.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-sse41-dup.c", @@ -80,7 +79,6 @@ NON_PROD_SSE41_MICROKERNEL_SRCS = [ "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u16.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u24.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-u32.c", - "src/f32-prelu/gen/f32-prelu-sse41-2x4.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-3x8-minmax-sse41-dup.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-5x8-minmax-sse41-dup.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-6x8-minmax-sse41-dup.c", @@ -277,7 +275,6 @@ NON_PROD_SSE41_MICROKERNEL_SRCS = [ "src/qs8-rdsum/gen/qs8-rdsum-7p7x-minmax-fp32-sse41-c32.c", "src/qs8-requantization/qs8-requantization-fp32-sse41.c", "src/qs8-requantization/qs8-requantization-gemmlowp-sse41.c", - "src/qs8-requantization/qs8-requantization-rndna-sse41.c", "src/qs8-requantization/qs8-requantization-rndnu-sse41-sra.c", "src/qs8-requantization/qs8-requantization-rndnu-sse41-srl.c", "src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-u16.c", @@ -371,7 +368,6 @@ NON_PROD_SSE41_MICROKERNEL_SRCS = [ "src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld64.c", "src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld128.c", "src/qu8-requantization/qu8-requantization-gemmlowp-sse41.c", - "src/qu8-requantization/qu8-requantization-rndna-sse41.c", "src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul16-ld64-u16.c", "src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-u8.c", "src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-u16.c", diff --git a/gen/sse_microkernels.bzl b/gen/sse_microkernels.bzl index 9364c3c4623..e912651df93 100644 --- a/gen/sse_microkernels.bzl +++ b/gen/sse_microkernels.bzl @@ -177,8 +177,6 @@ NON_PROD_SSE_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-6x8-minmax-sse-load1.c", "src/f32-igemm/gen/f32-igemm-6x8s4-minmax-sse.c", 
"src/f32-ppmm/gen/f32-ppmm-4x8-minmax-sse.c", - "src/f32-prelu/gen/f32-prelu-sse-2x4.c", - "src/f32-prelu/gen/f32-prelu-sse-2x8.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c32.c", "src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c64.c", "src/f32-rminmax/gen/f32-rmax-sse-u4.c", diff --git a/gen/ssse3_microkernels.bzl b/gen/ssse3_microkernels.bzl index 3175ba4c9ea..6e756912d54 100644 --- a/gen/ssse3_microkernels.bzl +++ b/gen/ssse3_microkernels.bzl @@ -36,7 +36,6 @@ NON_PROD_SSSE3_MICROKERNEL_SRCS = [ "src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x4c8-minmax-ssse3-madd.c", "src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x4c8-minmax-ssse3-madd-prfm.c", "src/qs8-requantization/qs8-requantization-gemmlowp-ssse3.c", - "src/qs8-requantization/qs8-requantization-rndna-ssse3.c", "src/qs8-rsum/gen/qs8-rsum-ssse3-u16.c", "src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc2.c", "src/qs8-rsum/gen/qs8-rsum-ssse3-u64-acc4.c", @@ -49,7 +48,6 @@ NON_PROD_SSSE3_MICROKERNEL_SRCS = [ "src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c16.c", "src/qu8-rdsum/gen/qu8-rdsum-7p7x-ssse3-c32.c", "src/qu8-requantization/qu8-requantization-gemmlowp-ssse3.c", - "src/qu8-requantization/qu8-requantization-rndna-ssse3.c", "src/qu8-vcvt/gen/qu8-vcvt-ssse3-u16.c", "src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c", "src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c", diff --git a/gen/wasm_microkernels.bzl b/gen/wasm_microkernels.bzl index 79f5d620c30..a5f1b10aeae 100644 --- a/gen/wasm_microkernels.bzl +++ b/gen/wasm_microkernels.bzl @@ -28,7 +28,6 @@ PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-maxpool/f32-maxpool-9p8x-minmax-wasm-c1.c", "src/f32-pavgpool/f32-pavgpool-9p8x-minmax-wasm-c1.c", "src/f32-pavgpool/f32-pavgpool-9x-minmax-wasm-c1.c", - "src/f32-prelu/gen/f32-prelu-wasm-2x4.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-1x4-minmax-wasm.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x4-minmax-wasm.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-minmax-wasm.c", @@ -46,7 +45,10 @@ PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-wasm-u8.c", "src/f32-vbinary/gen/f32-vmul-wasm-u8.c", "src/f32-vbinary/gen/f32-vmulc-wasm-u8.c", + "src/f32-vbinary/gen/f32-vprelu-wasm-u8.c", + "src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c", "src/f32-vbinary/gen/f32-vrdivc-wasm-u8.c", + "src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c", "src/f32-vbinary/gen/f32-vrsubc-wasm-u8.c", "src/f32-vbinary/gen/f32-vsub-wasm-u8.c", "src/f32-vbinary/gen/f32-vsubc-wasm-u8.c", @@ -107,7 +109,6 @@ NON_PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-2x4-minmax-wasm.c", "src/f32-igemm/gen/f32-igemm-2x4-relu-wasm.c", "src/f32-igemm/gen/f32-igemm-4x2-relu-wasm.c", - "src/f32-prelu/gen/f32-prelu-wasm-2x1.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-2x4-minmax-wasm.c", "src/f32-qc4w-gemm/gen/f32-qc4w-gemm-4x2-minmax-wasm.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x4-relu-wasm.c", @@ -169,18 +170,15 @@ NON_PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vprelu-wasm-u1.c", "src/f32-vbinary/gen/f32-vprelu-wasm-u2.c", "src/f32-vbinary/gen/f32-vprelu-wasm-u4.c", - "src/f32-vbinary/gen/f32-vprelu-wasm-u8.c", "src/f32-vbinary/gen/f32-vpreluc-wasm-u1.c", "src/f32-vbinary/gen/f32-vpreluc-wasm-u2.c", "src/f32-vbinary/gen/f32-vpreluc-wasm-u4.c", - "src/f32-vbinary/gen/f32-vpreluc-wasm-u8.c", "src/f32-vbinary/gen/f32-vrdivc-wasm-u1.c", "src/f32-vbinary/gen/f32-vrdivc-wasm-u2.c", "src/f32-vbinary/gen/f32-vrdivc-wasm-u4.c", "src/f32-vbinary/gen/f32-vrpreluc-wasm-u1.c", "src/f32-vbinary/gen/f32-vrpreluc-wasm-u2.c", "src/f32-vbinary/gen/f32-vrpreluc-wasm-u4.c", - "src/f32-vbinary/gen/f32-vrpreluc-wasm-u8.c", 
"src/f32-vbinary/gen/f32-vrsubc-wasm-u1.c", "src/f32-vbinary/gen/f32-vrsubc-wasm-u2.c", "src/f32-vbinary/gen/f32-vrsubc-wasm-u4.c", diff --git a/gen/wasmrelaxedsimd_microkernels.bzl b/gen/wasmrelaxedsimd_microkernels.bzl index b2705786dcb..e4a4a042d83 100644 --- a/gen/wasmrelaxedsimd_microkernels.bzl +++ b/gen/wasmrelaxedsimd_microkernels.bzl @@ -47,8 +47,6 @@ PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmrelaxedsimd-fma-splat.c", "src/f32-igemm/gen/f32-igemm-6x8-relu-wasmrelaxedsimd-fma-splat.c", "src/f32-igemm/gen/f32-igemm-6x8-wasmrelaxedsimd-fma-splat.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-fma-loadsplat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-fma-splat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmrelaxedsimd-fma-loadsplat.c", @@ -281,22 +279,6 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-6x8s4-minmax-wasmrelaxedsimd.c", "src/f32-igemm/gen/f32-igemm-6x8s4-relu-wasmrelaxedsimd-fma.c", "src/f32-igemm/gen/f32-igemm-6x8s4-wasmrelaxedsimd-fma.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c", - "src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-loadsplat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmrelaxedsimd-splat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8s4-minmax-wasmrelaxedsimd-fma.c", @@ -510,6 +492,7 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c8-minmax-wasmusdot.c", "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot-u2.c", "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot.c", + "src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot.c", @@ -598,6 +581,7 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot.c", + "src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c", "src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u16.c", "src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u32-acc2.c", 
"src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u64-acc2.c", diff --git a/gen/wasmsimd_microkernels.bzl b/gen/wasmsimd_microkernels.bzl index b84cc5ec87f..1146bcac755 100644 --- a/gen/wasmsimd_microkernels.bzl +++ b/gen/wasmsimd_microkernels.bzl @@ -96,8 +96,6 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-pavgpool/f32-pavgpool-9p8x-minmax-wasmsimd-x86-c4.c", "src/f32-pavgpool/f32-pavgpool-9x-minmax-wasmsimd-arm-c4.c", "src/f32-pavgpool/f32-pavgpool-9x-minmax-wasmsimd-x86-c4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-arm-splat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-x86-splat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmsimd-splat.c", @@ -131,7 +129,10 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-u16.c", "src/f32-vbinary/gen/f32-vmul-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vmulc-wasmsimd-u16.c", + "src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c", + "src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u16.c", + "src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-u16.c", @@ -565,22 +566,6 @@ NON_PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-6x8s4-wasmsimd.c", "src/f32-ppmm/gen/f32-ppmm-4x8-minmax-wasmsimd-arm-splat.c", "src/f32-ppmm/gen/f32-ppmm-4x8-minmax-wasmsimd-x86-splat.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c", - "src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-arm-loadsplat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-minmax-wasmsimd-x86-loadsplat.c", "src/f32-qc8w-gemm/gen/f32-qc8w-gemm-1x8-relu-wasmsimd-loadsplat.c", @@ -754,15 +739,12 @@ NON_PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-vbinary/gen/f32-vmulc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vprelu-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vprelu-wasmsimd-u8.c", - "src/f32-vbinary/gen/f32-vprelu-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u8.c", - "src/f32-vbinary/gen/f32-vpreluc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vrdivc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u8.c", - "src/f32-vbinary/gen/f32-vrpreluc-wasmsimd-u16.c", "src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u4.c", "src/f32-vbinary/gen/f32-vrsubc-wasmsimd-u8.c", "src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-u4.c", diff --git 
a/include/xnnpack.h b/include/xnnpack.h index f23338ac8c0..801cbcea64c 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -997,6 +997,7 @@ enum xnn_binary_operator { xnn_binary_minimum, xnn_binary_copysign, xnn_binary_squared_difference, + xnn_binary_prelu, }; struct xnn_binary_params { @@ -1650,7 +1651,7 @@ enum xnn_status xnn_define_static_resize_bilinear_2d( /// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph /// with [N, H, W, channels] dimensions. /// @param flags - binary features of the PReLU Node. No supported flags are currently defined. -enum xnn_status xnn_define_prelu( +XNN_DEPRECATED enum xnn_status xnn_define_prelu( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t slope_id, @@ -1660,7 +1661,7 @@ enum xnn_status xnn_define_prelu( /// Define a RoPE (Rotary Positional Embeddings) Node and add it to a Subgraph. /// /// @param subgraph - a Subgraph object that will own the created Node. -/// @param max_tokens - maximum possible number of tokens (maximum sequence length) of the input/output tensors. +/// @param max_tokens - deprecated. /// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph /// with [batch, tokens, heads, channels] dimensions. /// @param weights_id - Value ID for the weights tensor. The weights tensor must be a 2D tensor defined in the @@ -2948,8 +2949,6 @@ enum xnn_status xnn_run_convert_nc_f32_f16( enum xnn_status xnn_create_convert_nc_f32_qs8( float output_scale, int8_t output_zero_point, - int8_t output_min, - int8_t output_max, uint32_t flags, xnn_operator_t* convert_op_out); @@ -2981,8 +2980,6 @@ enum xnn_status xnn_run_convert_nc_f32_qs8( enum xnn_status xnn_create_convert_nc_f32_qu8( float output_scale, uint8_t output_zero_point, - uint8_t output_min, - uint8_t output_max, uint32_t flags, xnn_operator_t* convert_op_out); @@ -5094,48 +5091,6 @@ enum xnn_status xnn_run_negate_nc_f32( uint32_t flags, pthreadpool_t threadpool); -enum xnn_status xnn_create_prelu_nc_f16( - size_t input_channels, - size_t slope_channels, - size_t input_stride, - size_t output_stride, - const void* negative_slope, - uint32_t flags, - xnn_code_cache_t code_cache, - xnn_weights_cache_t weights_cache, - xnn_operator_t* prelu_op_out); - -enum xnn_status xnn_reshape_prelu_nc_f16( - xnn_operator_t prelu_op, - size_t batch_size, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_prelu_nc_f16( - xnn_operator_t prelu_op, - const void* input, - void* output); - -enum xnn_status xnn_create_prelu_nc_f32( - size_t input_channels, - size_t slope_channels, - size_t input_stride, - size_t output_stride, - const float* negative_slope, - uint32_t flags, - xnn_code_cache_t code_cache, - xnn_weights_cache_t weights_cache, - xnn_operator_t* prelu_op_out); - -enum xnn_status xnn_reshape_prelu_nc_f32( - xnn_operator_t prelu_op, - size_t batch_size, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_prelu_nc_f32( - xnn_operator_t prelu_op, - const float* input, - float* output); - enum xnn_status xnn_create_resize_bilinear2d_nchw_f32( size_t output_height, size_t output_width, @@ -5275,7 +5230,6 @@ enum xnn_status xnn_setup_resize_bilinear2d_nhwc_u8( uint8_t* output); enum xnn_status xnn_create_rope_nthc_f16( - size_t max_tokens, uint32_t flags, xnn_operator_t* rope_op_out); @@ -5294,7 +5248,6 @@ enum xnn_status xnn_setup_rope_nthc_f16( void* output); enum xnn_status xnn_create_rope_nthc_f32( - size_t max_tokens, uint32_t flags, xnn_operator_t* 
rope_op_out); diff --git a/scripts/generate-f16-prelu.sh b/scripts/generate-f16-prelu.sh deleted file mode 100755 index b7213bb7836..00000000000 --- a/scripts/generate-f16-prelu.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh -# Copyright 2020 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -################################### ARM NEON ################################## -tools/xngen src/f16-prelu/neonfp16arith.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c & -tools/xngen src/f16-prelu/neonfp16arith.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -o src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c & - -################################### x86 F16C ################################## -tools/xngen src/f16-prelu/f16c.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f16-prelu/gen/f16-prelu-f16c-2x8.c & -tools/xngen src/f16-prelu/f16c.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -o src/f16-prelu/gen/f16-prelu-f16c-2x16.c & - -wait diff --git a/scripts/generate-f32-prelu.sh b/scripts/generate-f32-prelu.sh deleted file mode 100755 index f36b7ec405c..00000000000 --- a/scripts/generate-f32-prelu.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/sh -# Copyright 2019 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -#################################### Scalar ################################### -tools/xngen src/f32-prelu/scalar.c.in -D CHANNEL_TILE=1 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-scalar-2x1.c & -tools/xngen src/f32-prelu/scalar.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-scalar-2x4.c & - -##################################### WAsm #################################### -tools/xngen src/f32-prelu/wasm.c.in -D CHANNEL_TILE=1 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-wasm-2x1.c & -tools/xngen src/f32-prelu/wasm.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-wasm-2x4.c & - -################################### ARM NEON ################################## -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=1 -o src/f32-prelu/gen/f32-prelu-neon-1x4.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=1 -o src/f32-prelu/gen/f32-prelu-neon-1x8.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=16 -D ROW_TILE=1 -o src/f32-prelu/gen/f32-prelu-neon-1x16.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-neon-2x4.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-neon-2x8.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-neon-2x16.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=4 -o src/f32-prelu/gen/f32-prelu-neon-4x4.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=4 -o src/f32-prelu/gen/f32-prelu-neon-4x8.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=16 -D ROW_TILE=4 -o src/f32-prelu/gen/f32-prelu-neon-4x16.c & - -################################## WAsm SIMD ################################## -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c & 
-tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c & - -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=4 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=8 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c & -tools/xngen src/f32-prelu/wasmsimd-laneselect.c.in -D CHANNEL_TILE=16 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c & - -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=1 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D 
ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=4 -D RELAXED=0 -o src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c & - -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=1 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=4 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=8 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c & -tools/xngen src/f32-prelu/wasmsimd-iminmax.c.in -D CHANNEL_TILE=16 -D ROW_TILE=4 -D RELAXED=1 -o src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c & - -################################### ARM NEON ################################## -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-neon-2x4.c & -tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-neon-2x8.c & - -############################# x86 SSE/SSE2/SSE4.1 ############################# -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D SSE=1 -o src/f32-prelu/gen/f32-prelu-sse-2x4.c & -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D SSE=1 -o src/f32-prelu/gen/f32-prelu-sse-2x8.c & -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D SSE=2 -o src/f32-prelu/gen/f32-prelu-sse2-2x4.c & -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D SSE=2 -o src/f32-prelu/gen/f32-prelu-sse2-2x8.c & -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D SSE=4 -o src/f32-prelu/gen/f32-prelu-sse41-2x4.c & -tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D SSE=4 -o src/f32-prelu/gen/f32-prelu-sse41-2x8.c & - -################################### x86 AVX ################################### -tools/xngen src/f32-prelu/avx.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-avx-2x8.c & -tools/xngen src/f32-prelu/avx.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-avx-2x16.c & - -################################## x86 AVX512 ################################# -tools/xngen src/f32-prelu/avx512f.c.in -D CHANNEL_TILE=16 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-avx512f-2x16.c & -tools/xngen src/f32-prelu/avx512f.c.in -D CHANNEL_TILE=32 -D ROW_TILE=2 -o src/f32-prelu/gen/f32-prelu-avx512f-2x32.c & - 
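Migration note for the PReLU removals above and the xnn_define_prelu deprecation in include/xnnpack.h: PReLU is now expressed through the binary-elementwise path using the new xnn_binary_prelu operator. The following is a minimal, hypothetical sketch of the replacement call; it assumes the generic xnn_define_binary entry point declared elsewhere in include/xnnpack.h, that input_id, slope_id, and output_id are value IDs already defined in the subgraph, and that NULL binary params are acceptable for f32.

#include "xnnpack.h"

// Hypothetical helper showing the replacement for the deprecated xnn_define_prelu.
static enum xnn_status define_prelu_node(xnn_subgraph_t subgraph,
                                         uint32_t input_id,
                                         uint32_t slope_id,
                                         uint32_t output_id) {
  // Old (deprecated): xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0);
  // New: PReLU as a broadcasting binary op, with the slope tensor as the second input.
  return xnn_define_binary(subgraph, xnn_binary_prelu, /*params=*/NULL,
                           input_id, slope_id, output_id, /*flags=*/0);
}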
-wait diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index ef6cb064603..75fc425003e 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -249,8 +249,8 @@ tools/generate-dwconv2d-chw-test.py --spec test/f32-dwconv2d-chw.yaml --output t ### Tests for VHSwish micro-kernels tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vhswish --output test/f16-vhswish.cc & tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vhswish --output test/f32-vhswish.cc & -tools/generate-vhswish-test.py --spec test/qs8-vhswish.yaml --output test/qs8-vhswish.cc & -tools/generate-vhswish-test.py --spec test/qu8-vhswish.yaml --output test/qu8-vhswish.cc & +tools/generate-vunary-test.py --tester VHSwishMicrokernelTester --ukernel qs8-vhswish --output test/qs8-vhswish.cc & +tools/generate-vunary-test.py --tester VHSwishMicrokernelTester --ukernel qu8-vhswish --output test/qu8-vhswish.cc & ### Tests for IBilinear micro-kernels tools/generate-ibilinear-test.py --spec test/f16-ibilinear.yaml --output test/f16-ibilinear.cc & @@ -262,10 +262,6 @@ tools/generate-ibilinear-test.py --spec test/u8-ibilinear.yaml --output test/u8- tools/generate-ibilinear-chw-test.py --spec test/f16-ibilinear-chw.yaml --output test/f16-ibilinear-chw.cc & tools/generate-ibilinear-chw-test.py --spec test/f32-ibilinear-chw.yaml --output test/f32-ibilinear-chw.cc & -### Tests for PRelu micro-kernels -tools/generate-prelu-test.py --spec test/f16-prelu.yaml --output test/f16-prelu.cc & -tools/generate-prelu-test.py --spec test/f32-prelu.yaml --output test/f32-prelu.cc & - ### Tests for RAddExpMinusMax micro-kernels tools/generate-raddexpminusmax-test.py --spec test/f32-raddexpminusmax.yaml --output test/f32-raddexpminusmax.cc & diff --git a/scripts/generate-x8-packw.sh b/scripts/generate-x8-packw.sh index 58bb3d2f991..d16e1214b47 100755 --- a/scripts/generate-x8-packw.sh +++ b/scripts/generate-x8-packw.sh @@ -53,4 +53,9 @@ tools/xngen src/x8-packw/kr-avxvnni.c.in -D NR=16 -D KR=8 -D TYPE=int8_t -D IZP= tools/xngen src/x8-packw/kr-avxvnni.c.in -D NR=16 -D KR=8 -D TYPE=int8_t -D IZP=128 -D AVX=10 -D PREFETCH=0 -o src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni.c & tools/xngen src/x8-packw/kr-avxvnni.c.in -D NR=16 -D KR=8 -D TYPE=int8_t -D IZP=128 -D AVX=10 -D PREFETCH=1 -o src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni-prfm.c & +### WAsm Relaxed SIMD +### C8 packing +tools/xngen src/x8-packw/kr-wasmdot.c.in -D NR=8 -D KR=8 -D TYPE=int8_t -D IZP=0 -o src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c & +tools/xngen src/x8-packw/kr-wasmdot.c.in -D NR=8 -D KR=8 -D TYPE=int8_t -D IZP=128 -o src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c & + wait diff --git a/scripts/generate-x8-vclamp.sh b/scripts/generate-x8-vclamp.sh new file mode 100755 index 00000000000..952691773bc --- /dev/null +++ b/scripts/generate-x8-vclamp.sh @@ -0,0 +1,18 @@ +#!/bin/sh +# Copyright 2024 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +################################ RISC-V Vector ################################ +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=1 -D DATATYPE=S8 -o src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=2 -D DATATYPE=S8 -o src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=4 -D DATATYPE=S8 -o src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=8 -D DATATYPE=S8 -o src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c & + +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=1 -D DATATYPE=U8 -o src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=2 -D DATATYPE=U8 -o src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=4 -D DATATYPE=U8 -o src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c & +tools/xngen src/s8-vclamp/rvv.c.in -D LMUL=8 -D DATATYPE=U8 -o src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c & + +wait diff --git a/src/configs/binary-elementwise-config.c b/src/configs/binary-elementwise-config.c index 3dd38743a99..73888eb564a 100644 --- a/src/configs/binary-elementwise-config.c +++ b/src/configs/binary-elementwise-config.c @@ -18,6 +18,7 @@ static struct xnn_binary_elementwise_config f16_vdiv_config = {0}; static struct xnn_binary_elementwise_config f16_vmax_config = {0}; static struct xnn_binary_elementwise_config f16_vmin_config = {0}; static struct xnn_binary_elementwise_config f16_vmul_config = {0}; +static struct xnn_binary_elementwise_config f16_vprelu_config = {0}; static struct xnn_binary_elementwise_config f16_vsub_config = {0}; static struct xnn_binary_elementwise_config f16_vsqrdiff_config = {0}; @@ -27,6 +28,7 @@ static struct xnn_binary_elementwise_config f32_vdiv_config = {0}; static struct xnn_binary_elementwise_config f32_vmax_config = {0}; static struct xnn_binary_elementwise_config f32_vmin_config = {0}; static struct xnn_binary_elementwise_config f32_vmul_config = {0}; +static struct xnn_binary_elementwise_config f32_vprelu_config = {0}; static struct xnn_binary_elementwise_config f32_vsub_config = {0}; static struct xnn_binary_elementwise_config f32_vsqrdiff_config = {0}; @@ -44,6 +46,7 @@ XNN_INIT_ONCE_GUARD(f16_vdiv); XNN_INIT_ONCE_GUARD(f16_vmax); XNN_INIT_ONCE_GUARD(f16_vmin); XNN_INIT_ONCE_GUARD(f16_vmul); +XNN_INIT_ONCE_GUARD(f16_vprelu); XNN_INIT_ONCE_GUARD(f16_vsub); XNN_INIT_ONCE_GUARD(f16_vsqrdiff); XNN_INIT_ONCE_GUARD(f32_vadd); @@ -52,6 +55,7 @@ XNN_INIT_ONCE_GUARD(f32_vdiv); XNN_INIT_ONCE_GUARD(f32_vmax); XNN_INIT_ONCE_GUARD(f32_vmin); XNN_INIT_ONCE_GUARD(f32_vmul); +XNN_INIT_ONCE_GUARD(f32_vprelu); XNN_INIT_ONCE_GUARD(f32_vsub); XNN_INIT_ONCE_GUARD(f32_vsqrdiff); XNN_INIT_ONCE_GUARD(s32_vmul); @@ -256,6 +260,45 @@ static void init_f16_vmul_config(void) { #endif } +static void init_f16_vprelu_config(void) { + #if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->use_arm_neon_fp16_arith) { + f16_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vprelu_ukernel__neonfp16arith_u16; + f16_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vpreluc_ukernel__neonfp16arith_u16; + f16_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrpreluc_ukernel__neonfp16arith_u16; + f16_vprelu_config.element_tile = 16; + } + #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + 
assert(hardware_config != NULL); + if (hardware_config->use_arm_neon_fp16_arith) { + f16_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vprelu_ukernel__neonfp16arith_u16; + f16_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vpreluc_ukernel__neonfp16arith_u16; + f16_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrpreluc_ukernel__neonfp16arith_u16; + f16_vprelu_config.element_tile = 16; + } + #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + #if XNN_ENABLE_AVX512FP16 + if (hardware_config->use_x86_avx512fp16) { + f16_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vprelu_ukernel__avx512fp16_u64; + f16_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vpreluc_ukernel__avx512fp16_u64; + f16_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrpreluc_ukernel__avx512fp16_u64; + f16_vprelu_config.element_tile = 64; + } else + #endif + if (hardware_config->use_x86_f16c) { + f16_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vprelu_ukernel__f16c_u16; + f16_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vpreluc_ukernel__f16c_u16; + f16_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f16_vrpreluc_ukernel__f16c_u16; + f16_vprelu_config.element_tile = 16; + } + #endif +} + static void init_f16_vsub_config(void) { #if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -796,6 +839,66 @@ static void init_f32_vmul_config(void) { #endif } +static void init_f32_vprelu_config(void) { + #if XNN_ARCH_ARM + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->use_arm_neon){ + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__neon_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__neon_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__neon_u8; + f32_vprelu_config.element_tile = 8; + } else if (!XNN_PLATFORM_MOBILE) { + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__scalar_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__scalar_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__scalar_u8; + f32_vprelu_config.element_tile = 8; + } + #elif XNN_ARCH_ARM64 + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__neon_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__neon_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__neon_u8; + f32_vprelu_config.element_tile = 8; + #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + #if XNN_ENABLE_AVX512F + if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__avx512f_u32; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__avx512f_u32; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__avx512f_u32; + f32_vprelu_config.element_tile = 32; + } else + #endif + if 
(hardware_config->use_x86_avx) { + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__avx_u16; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__avx_u16; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__avx_u16; + f32_vprelu_config.element_tile = 16; + } else { + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__sse2_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__sse2_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__sse2_u8; + f32_vprelu_config.element_tile = 8; + } + #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__wasmsimd_u16; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__wasmsimd_u16; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__wasmsimd_u16; + f32_vprelu_config.element_tile = 16; + #elif XNN_ARCH_WASM + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__wasm_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__wasm_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__wasm_u8; + f32_vprelu_config.element_tile = 8; + #else + f32_vprelu_config.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vprelu_ukernel__scalar_u8; + f32_vprelu_config.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vpreluc_ukernel__scalar_u8; + f32_vprelu_config.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrpreluc_ukernel__scalar_u8; + f32_vprelu_config.element_tile = 8; + #endif +} + static void init_f32_vsub_config(void) { #if XNN_ARCH_ARM const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1250,6 +1353,15 @@ const struct xnn_binary_elementwise_config* xnn_init_f16_vmul_config() { return &f16_vmul_config; } +const struct xnn_binary_elementwise_config* xnn_init_f16_vprelu_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { + return NULL; + } + XNN_INIT_ONCE(f16_vprelu); + return &f16_vprelu_config; +} + const struct xnn_binary_elementwise_config* xnn_init_f16_vsub_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { @@ -1331,6 +1443,15 @@ const struct xnn_binary_elementwise_config* xnn_init_f32_vmul_config() { return &f32_vmul_config; } +const struct xnn_binary_elementwise_config* xnn_init_f32_vprelu_config() { + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + if (hardware_config == NULL) { + return NULL; + } + XNN_INIT_ONCE(f32_vprelu); + return &f32_vprelu_config; +} + const struct xnn_binary_elementwise_config* xnn_init_f32_vsub_config() { const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); if (hardware_config == NULL) { diff --git a/src/configs/hardware-config.c b/src/configs/hardware-config.c index ca3f4f6e281..100186c2adf 100644 --- a/src/configs/hardware-config.c +++ b/src/configs/hardware-config.c @@ -126,7 +126,11 @@ static void init_hardware_config(void) { #else hardware_config.use_x86_avx512skx = 0; #endif +#if XNN_ENABLE_AVX512VBMI hardware_config.use_x86_avx512vbmi = hardware_config.use_x86_avx512skx && 
cpuinfo_has_x86_avx512vbmi(); +#else + hardware_config.use_x86_avx512vbmi = 0; +#endif #if XNN_ENABLE_AVX512VNNI hardware_config.use_x86_avx512vnni = hardware_config.use_x86_avx512skx && cpuinfo_has_x86_avx512vnni(); #else diff --git a/src/configs/prelu-config.c b/src/configs/prelu-config.c deleted file mode 100644 index e3508004459..00000000000 --- a/src/configs/prelu-config.c +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include - -#include "xnnpack/common.h" -#include "xnnpack/config.h" -#include "xnnpack/init-once.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/prelu.h" - -static struct xnn_prelu_config f16_prelu_config = {0}; -static struct xnn_prelu_config f32_prelu_config = {0}; - -XNN_INIT_ONCE_GUARD(f16_prelu); -XNN_INIT_ONCE_GUARD(f32_prelu); - -static void init_f16_prelu_config(void) { - #if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->use_arm_neon_fp16_arith) { - f16_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f16_prelu_ukernel__neonfp16arith_2x16; - f16_prelu_config.row_tile = 2; - f16_prelu_config.channel_tile = 16; - } - #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->use_arm_neon_fp16_arith) { - f16_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f16_prelu_ukernel__neonfp16arith_2x16; - f16_prelu_config.row_tile = 2; - f16_prelu_config.channel_tile = 16; - } - #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->use_x86_f16c) { - f16_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f16_prelu_ukernel__f16c_2x16; - f16_prelu_config.row_tile = 2; - f16_prelu_config.channel_tile = 16; - } - #endif -} - -static void init_f32_prelu_config(void) { - #if XNN_ARCH_ARM - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->use_arm_neon) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__neon_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - } else if (!XNN_PLATFORM_MOBILE) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__scalar_2x4; - f32_prelu_config.row_tile = 4; - f32_prelu_config.channel_tile = 4; - } - #elif XNN_ARCH_ARM64 - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__neon_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - #if XNN_ENABLE_AVX512F - if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx512f_2x16; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 16; - } else - #endif - if (hardware_config->use_x86_avx) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx_2x16; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 16; 
- } else if (hardware_config->use_x86_sse4_1) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__sse41_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - } else { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__sse2_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - } - #elif XNN_ARCH_WASMRELAXEDSIMD - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->is_x86) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 4; - } else { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 4; - } - #elif XNN_ARCH_WASMSIMD - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->is_x86) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - } else { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 8; - } - #elif XNN_ARCH_WASM - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->is_x86) { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__scalar_2x4; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 4; - } else { - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__wasm_2x4; - f32_prelu_config.row_tile = 2; - f32_prelu_config.channel_tile = 4; - } - #else - f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__scalar_2x4; - f32_prelu_config.row_tile = 4; - f32_prelu_config.channel_tile = 4; - #endif -} - -const struct xnn_prelu_config* xnn_init_f16_prelu_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { - return NULL; - } - XNN_INIT_ONCE(f16_prelu); - return &f16_prelu_config; -} - -const struct xnn_prelu_config* xnn_init_f32_prelu_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - if (hardware_config == NULL) { - return NULL; - } - XNN_INIT_ONCE(f32_prelu); - return &f32_prelu_config; -} diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c index 15f83474cdc..ff36ce67e75 100644 --- a/src/configs/unary-elementwise-config.c +++ b/src/configs/unary-elementwise-config.c @@ -15,7 +15,6 @@ #include "xnnpack/microparams-init.h" #include "xnnpack/packq.h" #include "xnnpack/vcvt.h" -#include "xnnpack/vlrelu.h" #include "xnnpack/vunary.h" static struct xnn_unary_elementwise_config f16_abs_config = {0}; @@ -1980,10 +1979,23 @@ static void init_s8_clamp_config(void) { #elif XNN_ARCH_ARM64 s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__neon_u64; s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__rvv_u4v; + s8_clamp_config.init.s8_minmax = 
xnn_init_s8_minmax_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); - if (hardware_config->use_x86_sse4_1) { + #if XNN_ENABLE_AVX512SKX + if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { + s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__avx512skx_u256; + s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + } else + #endif + if (hardware_config->use_x86_avx2) { + s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__avx2_u128; + s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + } else if (hardware_config->use_x86_sse4_1) { s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__sse41_u64; s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; } else { @@ -1993,12 +2005,6 @@ static void init_s8_clamp_config(void) { #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__wasmsimd_u64; s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; - #elif XNN_ARCH_WASM - s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__scalar_u4; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; - #elif XNN_ARCH_RISCV - s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__scalar_u4; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; #else s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__scalar_u4; s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; @@ -2019,9 +2025,25 @@ static void init_u8_clamp_config(void) { #elif XNN_ARCH_ARM64 u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__neon_u64; u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; - #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 - u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__sse2_u64; + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__rvv_u4v; u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + #if XNN_ENABLE_AVX512SKX + if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { + u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__avx512skx_u256; + u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + } else + #endif + if (hardware_config->use_x86_avx2) { + u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__avx2_u128; + u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + } else { + u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__sse2_u64; + u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__wasmsimd_u64; u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; diff --git a/src/configs/x8-lut-config.c b/src/configs/x8-lut-config.c index 59a39926987..5e1e548246f 100644 --- a/src/configs/x8-lut-config.c +++ b/src/configs/x8-lut-config.c @@ -25,13 +25,14 @@ static void init_x8_lut_config(void) { const struct xnn_hardware_config* hardware_config = 
xnn_init_hardware_config(); assert(hardware_config != NULL); + #if XNN_ENABLE_AVX256VBMI + if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vbmi) { + x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u128; + } else + #endif #if XNN_ENABLE_AVX512SKX if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { - if (hardware_config->use_x86_avx512vbmi) { - x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u128; - } else { - x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512skx_vpshufb_u64; - } + x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512skx_vpshufb_u64; } else #endif if (hardware_config->use_x86_avx2) { diff --git a/src/f16-f32-vcvt/f16-f32-vcvt.h b/src/f16-f32-vcvt/f16-f32-vcvt.h index f8e5c42db91..0d685c2e399 100644 --- a/src/f16-f32-vcvt/f16-f32-vcvt.h +++ b/src/f16-f32-vcvt/f16-f32-vcvt.h @@ -56,9 +56,12 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f16_f32_vcvt_ukernel__avx_int3 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f16_f32_vcvt_ukernel__avx_int32_u32, 32, false, xnn_float16, float, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_f32_vcvt_ukernel__f16c_u8, 8, false, xnn_float16, float, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_f32_vcvt_ukernel__f16c_u16, 16, false, xnn_float16, float, void, NULL) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_f32_vcvt_ukernel__avx512skx_u16, 16, false, xnn_float16, float, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_f32_vcvt_ukernel__avx512skx_u32, 32, false, xnn_float16, float, void, NULL) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8, 8, false, xnn_float16, float, void, NULL) diff --git a/src/f16-prelu/f16c.c.in b/src/f16-prelu/f16c.c.in deleted file mode 100644 index b187fa60a21..00000000000 --- a/src/f16-prelu/f16c.c.in +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__f16c_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - $for M in range(1, ROW_TILE): - const uint16_t* i${M} = (const uint16_t*) ((uintptr_t) i${M-1} + input_stride); - uint16_t* o${M} = (uint16_t*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - $if CHANNEL_TILE > 8: - for (; c >= ${CHANNEL_TILE} * sizeof(uint16_t); c -= ${CHANNEL_TILE} * sizeof(uint16_t)) { - const __m256 vw${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - $for C in range(8, CHANNEL_TILE, 8): - const __m256 vw${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + ${C}))); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - const __m256 vi${M}x0${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M})); - $for C in range(8, CHANNEL_TILE, 8): - const __m256 vi${M}x0${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i${M} + ${C}))); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - __m256 vacc${M}x0${ABC[C:C+8]} = _mm256_mul_ps(vi${M}x0${ABC[C:C+8]}, vw${ABC[C:C+8]}); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - vacc${M}x0${ABC[C:C+8]} = _mm256_blendv_ps(vi${M}x0${ABC[C:C+8]}, vacc${M}x0${ABC[C:C+8]}, vi${M}x0${ABC[C:C+8]}); - - $for M in range(ROW_TILE): - _mm_storeu_si128((__m128i*) o${M}, _mm256_cvtps_ph(vacc${M}x0${ABC[C:C+8]}, _MM_FROUND_TO_NEAREST_INT)); - $for C in range(0, CHANNEL_TILE, 8): - _mm_storeu_si128((__m128i*) (o${M} + ${C}), _mm256_cvtps_ph(vacc${M}x0${ABC[C:C+8]}, _MM_FROUND_TO_NEAREST_INT)); - o${M} += ${CHANNEL_TILE}; - } - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - w += 8; - - $for M in range(ROW_TILE): - const __m256 vi${M}x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M})); - i${M} += 8; - - $for M in range(ROW_TILE): - __m256 vacc${M}x01234567 = _mm256_mul_ps(vi${M}x01234567, vw01234567); - - $for M in range(ROW_TILE): - vacc${M}x01234567 = _mm256_blendv_ps(vi${M}x01234567, vacc${M}x01234567, vi${M}x01234567); - - $for M in range(ROW_TILE): - _mm_storeu_si128((__m128i*) o${M}, _mm256_cvtps_ph(vacc${M}x01234567, _MM_FROUND_TO_NEAREST_INT)); - o${M} += 8; - } - if XNN_UNLIKELY(c != 0) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - - $for M in range(ROW_TILE): - const __m256 vi${M}x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const 
__m128i*) i${M})); - i${M} = (const uint16_t*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - __m256 vacc${M}x01234567 = _mm256_mul_ps(vi${M}x01234567, vw01234567); - - $for M in range(ROW_TILE): - vacc${M}x01234567 = _mm256_blendv_ps(vi${M}x01234567, vacc${M}x01234567, vi${M}x01234567); - - $for M in range(ROW_TILE): - __m128i vh${M}x01234567 = _mm256_cvtps_ph(vacc${M}x01234567, _MM_FROUND_TO_NEAREST_INT); - if (c & (4 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - _mm_storel_epi64((__m128i*) o${M}, vh${M}x01234567); - - $for M in range(ROW_TILE): - vh${M}x01234567 = _mm_unpackhi_epi64(vh${M}x01234567, vh${M}x01234567); - - $for M in range(ROW_TILE): - o${M} += 4; - } - if (c & (2 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - _mm_storeu_si32(o${M}, vh${M}x01234567); - - $for M in range(ROW_TILE): - vh${M}x01234567 = _mm_srli_epi64(vh${M}x01234567, 32); - - $for M in range(ROW_TILE): - o${M} += 2; - } - if (c & (1 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - *o${M} = (uint16_t) _mm_extract_epi16(vh${M}x01234567, 0); - - $for M in range(ROW_TILE): - o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const uint16_t*) ((uintptr_t) i${M} + input_increment); - o${M} = (uint16_t*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f16-prelu/gen/f16-prelu-f16c-2x16.c b/src/f16-prelu/gen/f16-prelu-f16c-2x16.c deleted file mode 100644 index f145aabf22f..00000000000 --- a/src/f16-prelu/gen/f16-prelu-f16c-2x16.c +++ /dev/null @@ -1,149 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-prelu/f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
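For reference, the f16 PReLU microkernels deleted in this patch all implement the same contract: each output element equals the input where the input is non-negative, and the input times a per-channel weight where it is negative, applied row by row with independent input and output strides. A plain scalar sketch of that contract (float instead of fp16, channels as an element count rather than bytes, illustrative names):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// Scalar reference for the per-channel PReLU computed by the deleted
// kernels: out[r][c] = in[r][c] >= 0 ? in[r][c] : in[r][c] * w[c].
// Strides are in bytes, matching the microkernel convention above.
static void prelu_reference(size_t rows, size_t channels,
                            const float* input, size_t input_stride,
                            const float* weights,
                            float* output, size_t output_stride) {
  assert(rows != 0 && channels != 0);
  do {
    for (size_t c = 0; c < channels; c++) {
      const float x = input[c];
      output[c] = x >= 0.0f ? x : x * weights[c];
    }
    input = (const float*) ((uintptr_t) input + input_stride);
    output = (float*) ((uintptr_t) output + output_stride);
  } while (--rows != 0);
}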
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__f16c_2x16( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - for (; c >= 16 * sizeof(uint16_t); c -= 16 * sizeof(uint16_t)) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - const __m256 vw89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8))); - w += 16; - - const __m256 vi0x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - const __m256 vi0x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8))); - i0 += 16; - const __m256 vi1x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - const __m256 vi1x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8))); - i1 += 16; - - __m256 vacc0x001234567 = _mm256_mul_ps(vi0x001234567, vw01234567); - __m256 vacc0x089ABCDEF = _mm256_mul_ps(vi0x089ABCDEF, vw89ABCDEF); - __m256 vacc1x001234567 = _mm256_mul_ps(vi1x001234567, vw01234567); - __m256 vacc1x089ABCDEF = _mm256_mul_ps(vi1x089ABCDEF, vw89ABCDEF); - - vacc0x001234567 = _mm256_blendv_ps(vi0x001234567, vacc0x001234567, vi0x001234567); - vacc0x089ABCDEF = _mm256_blendv_ps(vi0x089ABCDEF, vacc0x089ABCDEF, vi0x089ABCDEF); - vacc1x001234567 = _mm256_blendv_ps(vi1x001234567, vacc1x001234567, vi1x001234567); - vacc1x089ABCDEF = _mm256_blendv_ps(vi1x089ABCDEF, vacc1x089ABCDEF, vi1x089ABCDEF); - - _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o0 + 0), _mm256_cvtps_ph(vacc0x001234567, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o0 + 8), _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - o0 += 16; - _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o1 + 0), _mm256_cvtps_ph(vacc1x001234567, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o1 + 8), _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - o1 += 16; - } - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - w += 8; - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 += 8; - - __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567); - __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567); - - vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567); - - _mm_storeu_si128((__m128i*) o0, 
_mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_TO_NEAREST_INT)); - o0 += 8; - _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_TO_NEAREST_INT)); - o1 += 8; - } - if XNN_UNLIKELY(c != 0) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 = (const uint16_t*) ((uintptr_t) i0 + c); - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 = (const uint16_t*) ((uintptr_t) i1 + c); - - __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567); - __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567); - - vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567); - - __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_TO_NEAREST_INT); - __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_TO_NEAREST_INT); - if (c & (4 * sizeof(uint16_t))) { - _mm_storel_epi64((__m128i*) o0, vh0x01234567); - _mm_storel_epi64((__m128i*) o1, vh1x01234567); - - vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567); - vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567); - - o0 += 4; - o1 += 4; - } - if (c & (2 * sizeof(uint16_t))) { - _mm_storeu_si32(o0, vh0x01234567); - _mm_storeu_si32(o1, vh1x01234567); - - vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32); - vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(uint16_t))) { - *o0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0); - *o1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - o0 = (uint16_t*) ((uintptr_t) o0 + output_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - o1 = (uint16_t*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f16-prelu/gen/f16-prelu-f16c-2x8.c b/src/f16-prelu/gen/f16-prelu-f16c-2x8.c deleted file mode 100644 index 9a062951c33..00000000000 --- a/src/f16-prelu/gen/f16-prelu-f16c-2x8.c +++ /dev/null @@ -1,118 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-prelu/f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
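The AVX/F16C kernels above avoid an explicit comparison by using the input itself as the blend mask: _mm256_blendv_ps selects per lane based only on the sign bit of the mask, so negative inputs take the x*w product and all other lanes keep x. A self-contained sketch of that trick on plain AVX floats (compile with AVX enabled; values are illustrative):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  // PReLU select as used above: blendv picks from `prod` in lanes where the
  // sign bit of `x` is set (x < 0, including -0.0f) and from `x` otherwise,
  // with no compare instruction.
  const __m256 x = _mm256_setr_ps(-2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, -4.0f);
  const __m256 w = _mm256_set1_ps(0.25f);          // per-channel slope (one value here)
  const __m256 prod = _mm256_mul_ps(x, w);
  const __m256 y = _mm256_blendv_ps(x, prod, x);   // mask = x -> sign-bit select

  float out[8];
  _mm256_storeu_ps(out, y);
  for (int i = 0; i < 8; i++) {
    printf("%g ", out[i]);                         // -0.5 -0.25 -0.125 0 0.5 1 2 -1
  }
  printf("\n");
  return 0;
}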
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__f16c_2x8( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - w += 8; - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 += 8; - - __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567); - __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567); - - vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567); - - _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_TO_NEAREST_INT)); - o0 += 8; - _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_TO_NEAREST_INT)); - o1 += 8; - } - if XNN_UNLIKELY(c != 0) { - const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w)); - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 = (const uint16_t*) ((uintptr_t) i0 + c); - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 = (const uint16_t*) ((uintptr_t) i1 + c); - - __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567); - __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567); - - vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567); - - __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_TO_NEAREST_INT); - __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_TO_NEAREST_INT); - if (c & (4 * sizeof(uint16_t))) { - _mm_storel_epi64((__m128i*) o0, vh0x01234567); - _mm_storel_epi64((__m128i*) o1, vh1x01234567); - - vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567); - vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567); - - o0 += 4; - o1 += 4; - } - if (c & (2 * sizeof(uint16_t))) { - _mm_storeu_si32(o0, vh0x01234567); - _mm_storeu_si32(o1, vh1x01234567); - - vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32); - vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(uint16_t))) { - *o0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0); - *o1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - o0 = (uint16_t*) ((uintptr_t) o0 + output_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + 
input_increment); - o1 = (uint16_t*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c b/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c deleted file mode 100644 index 064c541c100..00000000000 --- a/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c +++ /dev/null @@ -1,136 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-prelu/neonfp16arith.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__neonfp16arith_2x16( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - for (; c >= 16 * sizeof(uint16_t); c -= 16 * sizeof(uint16_t)) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; - const float16x8_t vw89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; - - const float16x8_t vi0x001234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0x089ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x001234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi1x089ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - float16x8_t vacc0x001234567 = vmulq_f16(vi0x001234567, vw01234567); - const uint16x8_t vm0x001234567 = vcltq_s16(vreinterpretq_s16_f16(vi0x001234567), vmovq_n_s16(0)); - float16x8_t vacc0x089ABCDEF = vmulq_f16(vi0x089ABCDEF, vw89ABCDEF); - const uint16x8_t vm0x089ABCDEF = vcltq_s16(vreinterpretq_s16_f16(vi0x089ABCDEF), vmovq_n_s16(0)); - float16x8_t vacc1x001234567 = vmulq_f16(vi1x001234567, vw01234567); - const uint16x8_t vm1x001234567 = vcltq_s16(vreinterpretq_s16_f16(vi1x001234567), vmovq_n_s16(0)); - float16x8_t vacc1x089ABCDEF = vmulq_f16(vi1x089ABCDEF, vw89ABCDEF); - const uint16x8_t vm1x089ABCDEF = vcltq_s16(vreinterpretq_s16_f16(vi1x089ABCDEF), vmovq_n_s16(0)); - - vacc0x001234567 = vbslq_f16(vm0x001234567, vacc0x001234567, vi0x001234567); - vacc0x089ABCDEF = vbslq_f16(vm0x089ABCDEF, vacc0x089ABCDEF, vi0x089ABCDEF); - vacc1x001234567 = vbslq_f16(vm1x001234567, vacc1x001234567, vi1x001234567); - vacc1x089ABCDEF = vbslq_f16(vm1x089ABCDEF, vacc1x089ABCDEF, vi1x089ABCDEF); - - vst1q_u16(o0, vreinterpretq_u16_f16(vacc0x001234567)); o0 += 8; - vst1q_u16(o0, vreinterpretq_u16_f16(vacc0x089ABCDEF)); o0 += 8; - vst1q_u16(o1, vreinterpretq_u16_f16(vacc1x001234567)); o1 += 8; - vst1q_u16(o1, vreinterpretq_u16_f16(vacc1x089ABCDEF)); o1 += 8; - } - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; 
- - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); - i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); - i1 += 8; - - float16x8_t vacc0x01234567 = vmulq_f16(vi0x01234567, vw01234567); - const uint16x8_t vm0x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi0x01234567), vmovq_n_s16(0)); - float16x8_t vacc1x01234567 = vmulq_f16(vi1x01234567, vw01234567); - const uint16x8_t vm1x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi1x01234567), vmovq_n_s16(0)); - - vacc0x01234567 = vbslq_f16(vm0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = vbslq_f16(vm1x01234567, vacc1x01234567, vi1x01234567); - - vst1q_u16(o0, vreinterpretq_u16_f16(vacc0x01234567)); o0 += 8; - vst1q_u16(o1, vreinterpretq_u16_f16(vacc1x01234567)); o1 += 8; - } - if XNN_UNLIKELY(c != 0) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); - i0 = (const uint16_t*) ((uintptr_t) i0 + c); - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); - i1 = (const uint16_t*) ((uintptr_t) i1 + c); - - float16x8_t vacc0x01234567 = vmulq_f16(vi0x01234567, vw01234567); - const uint16x8_t vm0x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi0x01234567), vmovq_n_s16(0)); - float16x8_t vacc1x01234567 = vmulq_f16(vi1x01234567, vw01234567); - const uint16x8_t vm1x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi1x01234567), vmovq_n_s16(0)); - - vacc0x01234567 = vbslq_f16(vm0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = vbslq_f16(vm1x01234567, vacc1x01234567, vi1x01234567); - - float16x4_t vacc0x0123 = vget_low_f16(vacc0x01234567); - float16x4_t vacc1x0123 = vget_low_f16(vacc1x01234567); - if (c & (4 * sizeof(uint16_t))) { - vst1_u16(o0, vreinterpret_u16_f16(vacc0x0123)); o0 += 4; - vst1_u16(o1, vreinterpret_u16_f16(vacc1x0123)); o1 += 4; - - vacc0x0123 = vget_high_f16(vacc0x01234567); - vacc1x0123 = vget_high_f16(vacc1x01234567); - } - if (c & (2 * sizeof(uint16_t))) { - vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vacc0x0123), 0); o0 += 2; - vacc0x0123 = vext_f16(vacc0x0123, vacc0x0123, 2); - vst1_lane_u32((void*) o1, vreinterpret_u32_f16(vacc1x0123), 0); o1 += 2; - vacc1x0123 = vext_f16(vacc1x0123, vacc1x0123, 2); - } - if (c & (1 * sizeof(uint16_t))) { - vst1_lane_u16(o0, vreinterpret_u16_f16(vacc0x0123), 0); o0 += 1; - vst1_lane_u16(o1, vreinterpret_u16_f16(vacc1x0123), 0); o1 += 1; - } - } - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - o0 = (uint16_t*) ((uintptr_t) o0 + output_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - o1 = (uint16_t*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c b/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c deleted file mode 100644 index 7169917364b..00000000000 --- a/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c +++ /dev/null @@ -1,108 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-prelu/neonfp16arith.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__neonfp16arith_2x8( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); - i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); - i1 += 8; - - float16x8_t vacc0x01234567 = vmulq_f16(vi0x01234567, vw01234567); - const uint16x8_t vm0x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi0x01234567), vmovq_n_s16(0)); - float16x8_t vacc1x01234567 = vmulq_f16(vi1x01234567, vw01234567); - const uint16x8_t vm1x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi1x01234567), vmovq_n_s16(0)); - - vacc0x01234567 = vbslq_f16(vm0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = vbslq_f16(vm1x01234567, vacc1x01234567, vi1x01234567); - - vst1q_u16(o0, vreinterpretq_u16_f16(vacc0x01234567)); o0 += 8; - vst1q_u16(o1, vreinterpretq_u16_f16(vacc1x01234567)); o1 += 8; - } - if XNN_UNLIKELY(c != 0) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); - i0 = (const uint16_t*) ((uintptr_t) i0 + c); - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); - i1 = (const uint16_t*) ((uintptr_t) i1 + c); - - float16x8_t vacc0x01234567 = vmulq_f16(vi0x01234567, vw01234567); - const uint16x8_t vm0x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi0x01234567), vmovq_n_s16(0)); - float16x8_t vacc1x01234567 = vmulq_f16(vi1x01234567, vw01234567); - const uint16x8_t vm1x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi1x01234567), vmovq_n_s16(0)); - - vacc0x01234567 = vbslq_f16(vm0x01234567, vacc0x01234567, vi0x01234567); - vacc1x01234567 = vbslq_f16(vm1x01234567, vacc1x01234567, vi1x01234567); - - float16x4_t vacc0x0123 = vget_low_f16(vacc0x01234567); - float16x4_t vacc1x0123 = vget_low_f16(vacc1x01234567); - if (c & (4 * sizeof(uint16_t))) { - vst1_u16(o0, vreinterpret_u16_f16(vacc0x0123)); o0 += 4; - vst1_u16(o1, vreinterpret_u16_f16(vacc1x0123)); o1 += 4; - - vacc0x0123 = vget_high_f16(vacc0x01234567); - vacc1x0123 = vget_high_f16(vacc1x01234567); - } - if (c & (2 * sizeof(uint16_t))) { - vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vacc0x0123), 0); o0 += 2; - vacc0x0123 = vext_f16(vacc0x0123, vacc0x0123, 2); - vst1_lane_u32((void*) o1, vreinterpret_u32_f16(vacc1x0123), 0); o1 += 2; - vacc1x0123 = vext_f16(vacc1x0123, vacc1x0123, 2); - } - if (c & (1 * sizeof(uint16_t))) { - vst1_lane_u16(o0, vreinterpret_u16_f16(vacc0x0123), 0); o0 += 1; - vst1_lane_u16(o1, vreinterpret_u16_f16(vacc1x0123), 0); o1 += 1; - } - } - i0 = (const uint16_t*) 
((uintptr_t) i0 + input_increment); - o0 = (uint16_t*) ((uintptr_t) o0 + output_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - o1 = (uint16_t*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f16-prelu/neonfp16arith.c.in b/src/f16-prelu/neonfp16arith.c.in deleted file mode 100644 index f65f953a349..00000000000 --- a/src/f16-prelu/neonfp16arith.c.in +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f16_prelu_ukernel__neonfp16arith_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const xnn_float16* restrict input, - size_t input_stride, - const xnn_float16* restrict weights, - xnn_float16* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(uint16_t) == 0); - - const uint16_t* i0 = (const uint16_t*) input; - uint16_t* o0 = (uint16_t*) output; - $for M in range(1, ROW_TILE): - const uint16_t* i${M} = (const uint16_t*) ((uintptr_t) i${M-1} + input_stride); - uint16_t* o${M} = (uint16_t*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const uint16_t* w = (const uint16_t*) weights; - size_t c = channels; - $if CHANNEL_TILE > 8: - for (; c >= ${CHANNEL_TILE} * sizeof(uint16_t); c -= ${CHANNEL_TILE} * sizeof(uint16_t)) { - $for C in range(0, CHANNEL_TILE, 8): - const float16x8_t vw${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - const float16x8_t vi${M}x0${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i${M})); i${M} += 8; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - float16x8_t vacc${M}x0${ABC[C:C+8]} = vmulq_f16(vi${M}x0${ABC[C:C+8]}, vw${ABC[C:C+8]}); - const uint16x8_t vm${M}x0${ABC[C:C+8]} = vcltq_s16(vreinterpretq_s16_f16(vi${M}x0${ABC[C:C+8]}), vmovq_n_s16(0)); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - vacc${M}x0${ABC[C:C+8]} = vbslq_f16(vm${M}x0${ABC[C:C+8]}, vacc${M}x0${ABC[C:C+8]}, vi${M}x0${ABC[C:C+8]}); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - vst1q_u16(o${M}, vreinterpretq_u16_f16(vacc${M}x0${ABC[C:C+8]})); o${M} += 8; - } - for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); w += 8; - - $for M in range(ROW_TILE): - const float16x8_t vi${M}x01234567 = vreinterpretq_f16_u16(vld1q_u16(i${M})); - i${M} += 8; - - $for M in range(ROW_TILE): - float16x8_t vacc${M}x01234567 = vmulq_f16(vi${M}x01234567, vw01234567); - const uint16x8_t vm${M}x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi${M}x01234567), vmovq_n_s16(0)); - - $for M in range(ROW_TILE): - vacc${M}x01234567 = vbslq_f16(vm${M}x01234567, vacc${M}x01234567, vi${M}x01234567); - - 
$for M in range(ROW_TILE): - vst1q_u16(o${M}, vreinterpretq_u16_f16(vacc${M}x01234567)); o${M} += 8; - } - if XNN_UNLIKELY(c != 0) { - const float16x8_t vw01234567 = vreinterpretq_f16_u16(vld1q_u16(w)); - - $for M in range(ROW_TILE): - const float16x8_t vi${M}x01234567 = vreinterpretq_f16_u16(vld1q_u16(i${M})); - i${M} = (const uint16_t*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - float16x8_t vacc${M}x01234567 = vmulq_f16(vi${M}x01234567, vw01234567); - const uint16x8_t vm${M}x01234567 = vcltq_s16(vreinterpretq_s16_f16(vi${M}x01234567), vmovq_n_s16(0)); - - $for M in range(ROW_TILE): - vacc${M}x01234567 = vbslq_f16(vm${M}x01234567, vacc${M}x01234567, vi${M}x01234567); - - $for M in range(ROW_TILE): - float16x4_t vacc${M}x0123 = vget_low_f16(vacc${M}x01234567); - if (c & (4 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - vst1_u16(o${M}, vreinterpret_u16_f16(vacc${M}x0123)); o${M} += 4; - - $for M in range(ROW_TILE): - vacc${M}x0123 = vget_high_f16(vacc${M}x01234567); - } - if (c & (2 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - vst1_lane_u32((void*) o${M}, vreinterpret_u32_f16(vacc${M}x0123), 0); o${M} += 2; - vacc${M}x0123 = vext_f16(vacc${M}x0123, vacc${M}x0123, 2); - } - if (c & (1 * sizeof(uint16_t))) { - $for M in range(ROW_TILE): - vst1_lane_u16(o${M}, vreinterpret_u16_f16(vacc${M}x0123), 0); o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const uint16_t*) ((uintptr_t) i${M} + input_increment); - o${M} = (uint16_t*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c index fc2e2015708..b039b2c5552 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u16.c @@ -31,8 +31,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u16( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->scalar.output_max); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { float16x8_t vx0 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; float16x8_t vx8 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; @@ -48,10 +46,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u16( int8x16_t vy0 = vcombine_s8(vqmovn_s16(vacc0), vqmovn_s16(vacc8)); - vy0 = vmaxq_s8(vy0, voutput_min); - - vy0 = vminq_s8(vy0, voutput_max); - vst1q_s8(output, vy0); output += 16; } for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { @@ -64,8 +58,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -79,8 +71,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c index 
7e02a33f646..d3eb2ca8e4a 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u24.c @@ -31,8 +31,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u24( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->scalar.output_max); for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { float16x8_t vx0 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; float16x8_t vx8 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; @@ -53,12 +51,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u24( int8x16_t vy0 = vcombine_s8(vqmovn_s16(vacc0), vqmovn_s16(vacc8)); int8x8_t vy16 = vqmovn_s16(vacc16); - vy0 = vmaxq_s8(vy0, voutput_min); - vy16 = vmax_s8(vy16, vget_low_s8(voutput_min)); - - vy0 = vminq_s8(vy0, voutput_max); - vy16 = vmin_s8(vy16, vget_low_s8(voutput_max)); - vst1q_s8(output, vy0); output += 16; vst1_s8(output, vy16); output += 8; } @@ -72,8 +64,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -87,8 +77,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c index 9645bdb569c..8e035ffe87b 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u32.c @@ -31,8 +31,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u32( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->scalar.output_max); for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { float16x8_t vx0 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; float16x8_t vx8 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; @@ -57,12 +55,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u32( int8x16_t vy0 = vcombine_s8(vqmovn_s16(vacc0), vqmovn_s16(vacc8)); int8x16_t vy16 = vcombine_s8(vqmovn_s16(vacc16), vqmovn_s16(vacc24)); - vy0 = vmaxq_s8(vy0, voutput_min); - vy16 = vmaxq_s8(vy16, voutput_min); - - vy0 = vminq_s8(vy0, voutput_max); - vy16 = vminq_s8(vy16, voutput_max); - vst1q_s8(output, vy0); output += 16; vst1q_s8(output, vy16); output += 16; } @@ -76,8 +68,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -91,8 +81,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = 
vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u64.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u64.c index 6561ddf428f..ebc4fb5f661 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u64.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u64.c @@ -31,8 +31,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u64( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->scalar.output_max); for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { float16x8_t vx0 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; float16x8_t vx8 = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; @@ -75,16 +73,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u64( int8x16_t vy32 = vcombine_s8(vqmovn_s16(vacc32), vqmovn_s16(vacc40)); int8x16_t vy48 = vcombine_s8(vqmovn_s16(vacc48), vqmovn_s16(vacc56)); - vy0 = vmaxq_s8(vy0, voutput_min); - vy16 = vmaxq_s8(vy16, voutput_min); - vy32 = vmaxq_s8(vy32, voutput_min); - vy48 = vmaxq_s8(vy48, voutput_min); - - vy0 = vminq_s8(vy0, voutput_max); - vy16 = vminq_s8(vy16, voutput_max); - vy32 = vminq_s8(vy32, voutput_max); - vy48 = vminq_s8(vy48, voutput_max); - vst1q_s8(output, vy0); output += 16; vst1q_s8(output, vy16); output += 16; vst1q_s8(output, vy32); output += 16; @@ -100,8 +88,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u64( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -115,8 +101,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u64( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c index be0a15e9a7c..81839e9e305 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-neonfp16arith-u8.c @@ -31,8 +31,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u8( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const int8x8_t voutput_min = vld1_dup_s8(¶ms->scalar.output_min); - const int8x8_t voutput_max = vld1_dup_s8(¶ms->scalar.output_max); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { float16x8_t vx = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; @@ -43,8 +41,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -58,8 +54,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, 
voutput_min); - vy = vmin_s8(vy, voutput_max); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u1.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u1.c index fb9e4dc17b6..e635693c36a 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u1.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u1.c @@ -26,8 +26,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_fmagic_u1( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u2.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u2.c index e3561a3ccb3..b46d911e5c6 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u2.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u2.c @@ -26,8 +26,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_fmagic_u2( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u3.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u3.c index f088ddf5684..e45cd24818a 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u3.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u3.c @@ -26,8 +26,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_fmagic_u3( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u4.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u4.c index 
c2cb7a3594e..27682f5af68 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u4.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-fmagic-u4.c @@ -26,8 +26,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_fmagic_u4( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u1.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u1.c index c995a3df11a..08f1f168b88 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u1.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u1.c @@ -27,8 +27,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_imagic_u1( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u2.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u2.c index 00620ce80b3..0f854a094a1 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u2.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u2.c @@ -27,8 +27,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_imagic_u2( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git 
a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u3.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u3.c index c4f849e687e..d6dae2b9be8 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u3.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u3.c @@ -27,8 +27,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_imagic_u3( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u4.c b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u4.c index 86e42c4ca57..2dcafe7e9a0 100644 --- a/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u4.c +++ b/src/f16-qs8-vcvt/gen/f16-qs8-vcvt-scalar-imagic-u4.c @@ -27,8 +27,8 @@ void xnn_f16_qs8_vcvt_ukernel__scalar_imagic_u4( const xnn_float16* i = input; const float vscale = xnn_float16_to_float(params->scalar.scale); const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f16-qs8-vcvt/neonfp16arith.c.in b/src/f16-qs8-vcvt/neonfp16arith.c.in index 7738146d92b..67797f2808c 100644 --- a/src/f16-qs8-vcvt/neonfp16arith.c.in +++ b/src/f16-qs8-vcvt/neonfp16arith.c.in @@ -29,12 +29,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u${BATCH_TILE}( const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->scalar.scale)); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - $if BATCH_TILE > 8: - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->scalar.output_max); - $else: - const int8x8_t voutput_min = vld1_dup_s8(¶ms->scalar.output_min); - const int8x8_t voutput_max = vld1_dup_s8(¶ms->scalar.output_max); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(uint16_t); batch -= ${BATCH_TILE} * sizeof(uint16_t)) { $for N in range(0, BATCH_TILE, 8): @@ -55,18 +49,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u${BATCH_TILE}( $else: int8x8_t 
vy${N} = vqmovn_s16(vacc${N}); - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${N} = vmaxq_s8(vy${N}, voutput_min); - $else: - vy${N} = vmax_s8(vy${N}, vget_low_s8(voutput_min)); - - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${N} = vminq_s8(vy${N}, voutput_max); - $else: - vy${N} = vmin_s8(vy${N}, vget_low_s8(voutput_max)); - $for N in range(0, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: vst1q_s8(output, vy${N}); output += 16; @@ -83,12 +65,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u${BATCH_TILE}( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - $if BATCH_TILE > 8: - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); - $else: - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -102,12 +78,6 @@ void xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u${BATCH_TILE}( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - $if BATCH_TILE > 8: - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); - $else: - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); if (batch & (4 * sizeof(uint16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f16-vsqrt/f16-vsqrt.h b/src/f16-vsqrt/f16-vsqrt.h index f5001c3fe03..7bb5773a693 100644 --- a/src/f16-vsqrt/f16-vsqrt.h +++ b/src/f16-vsqrt/f16-vsqrt.h @@ -41,9 +41,6 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsqrt_ukernel__avx512fp #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u64, 64, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u8, 8, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) @@ -52,6 +49,12 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_sqrt_u16, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) 
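On the f16-qs8-vcvt changes above: the removed vmax_s8/vmin_s8 pair clamped the narrowed result to output_min/output_max, but vqmovn_s16 already saturates the 16-bit accumulator into [-128, 127], so when the bounds cover the full int8 range (as the scalar kernels now hard-code with -128 and 127) the extra clamp cannot change anything. A scalar model of that argument, under the assumption output_min == -128 and output_max == 127:

#include <assert.h>
#include <stdint.h>

// Scalar model of the NEON sequence: saturating narrow to int8, then clamp.
// Because the narrow already saturates to [-128, 127], a follow-up clamp to
// that same range is a no-op. Assumes output_min = -128, output_max = 127.
static int8_t saturate_to_s8(int16_t v) {
  if (v < INT8_MIN) return INT8_MIN;
  if (v > INT8_MAX) return INT8_MAX;
  return (int8_t) v;
}

static void check_clamp_is_redundant(void) {
  for (int32_t v = INT16_MIN; v <= INT16_MAX; v++) {
    const int8_t narrowed = saturate_to_s8((int16_t) v);
    int8_t clamped = narrowed < -128 ? (int8_t) -128 : narrowed;  // output_min
    clamped = clamped > 127 ? (int8_t) 127 : clamped;             // output_max
    assert(clamped == narrowed);
  }
}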
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u64, 64, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-dwconv/f32-dwconv-minmax-multipass.h b/src/f32-dwconv/f32-dwconv-minmax-multipass.h index 659f4aa6f7c..197547ca656 100644 --- a/src/f32-dwconv/f32-dwconv-minmax-multipass.h +++ b/src/f32-dwconv/f32-dwconv-minmax-multipass.h @@ -76,6 +76,9 @@ XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8 XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l16c8s4r__fma3_acc2, 7, 6, 6, 16, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3, 7, 6, 6, 32, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_7f6m6l32c8s4r__fma3_acc2, 7, 6, 6, 32, 8, 4, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f, 5, 5, 5, 16, 16, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_5f5m5l16c16s1r__avx512f_acc2, 5, 5, 5, 16, 16, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f, 5, 5, 5, 32, 16, 1, float, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) diff --git a/src/f32-dwconv/f32-dwconv-minmax-unipass.h b/src/f32-dwconv/f32-dwconv-minmax-unipass.h index 9a5630b1b1c..56b94f8b195 100644 --- a/src/f32-dwconv/f32-dwconv-minmax-unipass.h +++ b/src/f32-dwconv/f32-dwconv-minmax-unipass.h @@ -111,6 +111,9 @@ XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p8c__fma3, XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p8c__fma3_acc2, 8, false, 8, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p16c__fma3, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_fma3, xnn_f32_dwconv_minmax_ukernel_25p16c__fma3_acc2, 16, false, 16, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f_acc2, 16, false, 16, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512f, xnn_f32_dwconv_minmax_ukernel_3p32c__avx512f, 32, false, 32, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) diff --git 
a/src/f32-f16-vcvt/f32-f16-vcvt.h b/src/f32-f16-vcvt/f32-f16-vcvt.h index 8abb72ebbd7..89acb5523cf 100644 --- a/src/f32-f16-vcvt/f32-f16-vcvt.h +++ b/src/f32-f16-vcvt/f32-f16-vcvt.h @@ -40,9 +40,12 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_f16_vcvt_ukernel__avx_u24, XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_f16_vcvt_ukernel__avx_u32, 32, false, float, xnn_float16, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f32_f16_vcvt_ukernel__f16c_u8, 8, false, float, xnn_float16, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f32_f16_vcvt_ukernel__f16c_u16, 16, false, float, xnn_float16, void, NULL) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_f16_vcvt_ukernel__avx512skx_u16, 16, false, float, xnn_float16, void, NULL) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_f16_vcvt_ukernel__avx512skx_u32, 32, false, float, xnn_float16, void, NULL) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f32_f16_vcvt_ukernel__wasmsimd_u8, 8, false, float, xnn_float16, void, NULL) diff --git a/src/f32-prelu/avx.c.in b/src/f32-prelu/avx.c.in deleted file mode 100644 index 752f85f8817..00000000000 --- a/src/f32-prelu/avx.c.in +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__avx_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - const __m256 vw${ABC[0:8]} = _mm256_load_ps(w); - $for C in range(8, CHANNEL_TILE, 8): - const __m256 vw${ABC[C:C+8]} = _mm256_load_ps(w + ${C}); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - const __m256 vi${M}x${ABC[0:8]} = _mm256_loadu_ps(i${M}); - $for C in range(8, CHANNEL_TILE, 8): - const __m256 vi${M}x${ABC[C:C+8]} = _mm256_loadu_ps(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - const __m256 vprod${M}x${ABC[C:C+8]} = 
_mm256_mul_ps(vi${M}x${ABC[C:C+8]}, vw${ABC[C:C+8]}); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - const __m256 vacc${M}x${ABC[C:C+8]} = _mm256_blendv_ps(vi${M}x${ABC[C:C+8]}, vprod${M}x${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]}); - - $for M in range(ROW_TILE): - _mm256_storeu_ps(o${M}, vacc${M}x${ABC[0:8]}); - $for C in range(8, CHANNEL_TILE, 8): - _mm256_storeu_ps(o${M} + ${C}, vacc${M}x${ABC[C:C+8]}); - o${M} += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 8: - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const __m256 vw = _mm256_load_ps(w); - w += 8; - - $for M in range(ROW_TILE): - const __m256 vi${M} = _mm256_loadu_ps(i${M}); - i${M} += 8; - - $for M in range(ROW_TILE): - const __m256 vprod${M} = _mm256_mul_ps(vi${M}, vw); - - $for M in range(ROW_TILE): - const __m256 vacc${M} = _mm256_blendv_ps(vi${M}, vprod${M}, vi${M}); - - $for M in range(ROW_TILE): - _mm256_storeu_ps(o${M}, vacc${M}); - o${M} += 8; - } - if XNN_UNLIKELY(c != 0) { - assert(c >= 1 * sizeof(float)); - assert(c <= 7 * sizeof(float)); - __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - c)); - - const __m256 vw = _mm256_maskload_ps(w, vmask); - - $for M in range(ROW_TILE): - const __m256 vi${M} = _mm256_maskload_ps(i${M}, vmask); - i${M} = (const float*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - const __m256 vprod${M} = _mm256_mul_ps(vi${M}, vw); - - $for M in range(ROW_TILE): - __m256 vacc${M} = _mm256_blendv_ps(vi${M}, vprod${M}, vi${M}); - - $for M in range(ROW_TILE): - __m128 vacc${M}_lo = _mm256_castps256_ps128(vacc${M}); - if (c & (4 * sizeof(float))) { - $for M in range(ROW_TILE): - _mm_storeu_ps(o${M}, vacc${M}_lo); - - $for M in range(ROW_TILE): - vacc${M}_lo = _mm256_extractf128_ps(vacc${M}, 1); - - $for M in range(ROW_TILE): - o${M} += 4; - } - if (c & (2 * sizeof(float))) { - $for M in range(ROW_TILE): - _mm_storel_pi((__m64*) o${M}, vacc${M}_lo); - - $for M in range(ROW_TILE): - vacc${M}_lo = _mm_movehl_ps(vacc${M}_lo, vacc${M}_lo); - - $for M in range(ROW_TILE): - o${M} += 2; - } - if (c & (1 * sizeof(float))) { - $for M in range(ROW_TILE): - _mm_store_ss(o${M}, vacc${M}_lo); - - $for M in range(ROW_TILE): - o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/avx512f.c.in b/src/f32-prelu/avx512f.c.in deleted file mode 100644 index 859f59cd2ce..00000000000 --- a/src/f32-prelu/avx512f.c.in +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert CHANNEL_TILE % 16 == 0 -$assert CHANNEL_TILE >= 16 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__avx512f_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - const __m512 vzero = _mm512_setzero_ps(); - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - const __m512 vw${ABC[0:16]} = _mm512_load_ps(w); - $for C in range(16, CHANNEL_TILE, 16): - const __m512 vw${ABC[C:C+16]} = _mm512_load_ps(w + ${C}); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - const __m512 vi${M}x${ABC[0:16]} = _mm512_loadu_ps(i${M}); - $for C in range(16, CHANNEL_TILE, 16): - const __m512 vi${M}x${ABC[C:C+16]} = _mm512_loadu_ps(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 16): - const __mmask16 vsign${M}x${ABC[C:C+16]} = _mm512_cmp_ps_mask(vi${M}x${ABC[C:C+16]}, vzero, _CMP_LT_OQ); - const __m512 vacc${M}x${ABC[C:C+16]} = _mm512_mask_mul_ps(vi${M}x${ABC[C:C+16]}, vsign${M}x${ABC[C:C+16]}, vi${M}x${ABC[C:C+16]}, vw${ABC[C:C+16]}); - - $for M in range(ROW_TILE): - _mm512_storeu_ps(o${M}, vacc${M}x${ABC[0:16]}); - $for C in range(16, CHANNEL_TILE, 16): - _mm512_storeu_ps(o${M} + ${C}, vacc${M}x${ABC[C:C+16]}); - o${M} += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 16: - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const __m512 vw = _mm512_load_ps(w); - w += 16; - - $for M in range(ROW_TILE): - const __m512 vi${M} = _mm512_loadu_ps(i${M}); - i${M} += 16; - - $for M in range(ROW_TILE): - const __mmask16 vsign${M} = _mm512_cmp_ps_mask(vi${M}, vzero, _CMP_LT_OQ); - const __m512 vacc${M} = _mm512_mask_mul_ps(vi${M}, vsign${M}, vi${M}, vw); - - $for M in range(ROW_TILE): - _mm512_storeu_ps(o${M}, vacc${M}); - o${M} += 16; - } - if XNN_UNLIKELY(c != 0) { - assert(c >= 1 * sizeof(float)); - assert(c <= 15 * sizeof(float)); - // Prepare mask for valid 32-bit elements (depends on c). 
- const __mmask16 vmask = _cvtu32_mask16((uint32_t) (UINT32_C(1) << (c >> XNN_LOG2_SIZEOF_FLOAT)) - UINT32_C(1)); - - const __m512 vw = _mm512_maskz_loadu_ps(vmask, w); - - $for M in range(ROW_TILE): - const __m512 vi${M} = _mm512_maskz_loadu_ps(vmask, i${M}); - i${M} = (const float*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - const __mmask16 vsign${M} = _mm512_cmp_ps_mask(vi${M}, vzero, _CMP_LT_OQ); - const __m512 vacc${M} = _mm512_mask_mul_ps(vi${M}, vsign${M}, vi${M}, vw); - - $for M in range(ROW_TILE): - _mm512_mask_storeu_ps(o${M}, vmask, vacc${M}); - o${M} = (float*) ((uintptr_t) o${M} + c); - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-avx-2x16.c b/src/f32-prelu/gen/f32-prelu-avx-2x16.c index 70cde0fd797..6ef21bd98a8 100644 --- a/src/f32-prelu/gen/f32-prelu-avx-2x16.c +++ b/src/f32-prelu/gen/f32-prelu-avx-2x16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/prelu.h" +#include "xnnpack/vbinary.h" void xnn_f32_prelu_ukernel__avx_2x16( diff --git a/src/f32-prelu/gen/f32-prelu-avx-2x8.c b/src/f32-prelu/gen/f32-prelu-avx-2x8.c deleted file mode 100644 index bdd6a780fad..00000000000 --- a/src/f32-prelu/gen/f32-prelu-avx-2x8.c +++ /dev/null @@ -1,123 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/avx.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__avx_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const __m256 vw01234567 = _mm256_load_ps(w); - w += 8; - - const __m256 vi0x01234567 = _mm256_loadu_ps(i0); - i0 += 8; - const __m256 vi1x01234567 = _mm256_loadu_ps(i1); - i1 += 8; - - const __m256 vprod0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567); - const __m256 vprod1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567); - - const __m256 vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vprod0x01234567, vi0x01234567); - const __m256 vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vprod1x01234567, vi1x01234567); - - _mm256_storeu_ps(o0, vacc0x01234567); - o0 += 8; - _mm256_storeu_ps(o1, vacc1x01234567); - o1 += 8; - } - if XNN_UNLIKELY(c != 0) { - assert(c >= 1 * sizeof(float)); - assert(c <= 7 * sizeof(float)); - __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - c)); - - const __m256 vw = _mm256_maskload_ps(w, vmask); - - const __m256 
vi0 = _mm256_maskload_ps(i0, vmask); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m256 vi1 = _mm256_maskload_ps(i1, vmask); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __m256 vprod0 = _mm256_mul_ps(vi0, vw); - const __m256 vprod1 = _mm256_mul_ps(vi1, vw); - - __m256 vacc0 = _mm256_blendv_ps(vi0, vprod0, vi0); - __m256 vacc1 = _mm256_blendv_ps(vi1, vprod1, vi1); - - __m128 vacc0_lo = _mm256_castps256_ps128(vacc0); - __m128 vacc1_lo = _mm256_castps256_ps128(vacc1); - if (c & (4 * sizeof(float))) { - _mm_storeu_ps(o0, vacc0_lo); - _mm_storeu_ps(o1, vacc1_lo); - - vacc0_lo = _mm256_extractf128_ps(vacc0, 1); - vacc1_lo = _mm256_extractf128_ps(vacc1, 1); - - o0 += 4; - o1 += 4; - } - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0_lo); - _mm_storel_pi((__m64*) o1, vacc1_lo); - - vacc0_lo = _mm_movehl_ps(vacc0_lo, vacc0_lo); - vacc1_lo = _mm_movehl_ps(vacc1_lo, vacc1_lo); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0_lo); - _mm_store_ss(o1, vacc1_lo); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-avx512f-2x16.c b/src/f32-prelu/gen/f32-prelu-avx512f-2x16.c deleted file mode 100644 index 4897ecbf064..00000000000 --- a/src/f32-prelu/gen/f32-prelu-avx512f-2x16.c +++ /dev/null @@ -1,97 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/avx512f.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__avx512f_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const __m512 vzero = _mm512_setzero_ps(); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const __m512 vw0123456789ABCDEF = _mm512_load_ps(w); - w += 16; - - const __m512 vi0x0123456789ABCDEF = _mm512_loadu_ps(i0); - i0 += 16; - const __m512 vi1x0123456789ABCDEF = _mm512_loadu_ps(i1); - i1 += 16; - - const __mmask16 vsign0x0123456789ABCDEF = _mm512_cmp_ps_mask(vi0x0123456789ABCDEF, vzero, _CMP_LT_OQ); - const __m512 vacc0x0123456789ABCDEF = _mm512_mask_mul_ps(vi0x0123456789ABCDEF, vsign0x0123456789ABCDEF, vi0x0123456789ABCDEF, vw0123456789ABCDEF); - const __mmask16 vsign1x0123456789ABCDEF = _mm512_cmp_ps_mask(vi1x0123456789ABCDEF, vzero, _CMP_LT_OQ); - const __m512 vacc1x0123456789ABCDEF = _mm512_mask_mul_ps(vi1x0123456789ABCDEF, vsign1x0123456789ABCDEF, vi1x0123456789ABCDEF, vw0123456789ABCDEF); - - _mm512_storeu_ps(o0, vacc0x0123456789ABCDEF); - o0 += 16; - _mm512_storeu_ps(o1, vacc1x0123456789ABCDEF); - o1 += 16; - } - if XNN_UNLIKELY(c != 0) { - assert(c >= 1 * sizeof(float)); - assert(c <= 15 * sizeof(float)); - // Prepare mask for valid 32-bit elements (depends on c). - const __mmask16 vmask = _cvtu32_mask16((uint32_t) (UINT32_C(1) << (c >> XNN_LOG2_SIZEOF_FLOAT)) - UINT32_C(1)); - - const __m512 vw = _mm512_maskz_loadu_ps(vmask, w); - - const __m512 vi0 = _mm512_maskz_loadu_ps(vmask, i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m512 vi1 = _mm512_maskz_loadu_ps(vmask, i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __mmask16 vsign0 = _mm512_cmp_ps_mask(vi0, vzero, _CMP_LT_OQ); - const __m512 vacc0 = _mm512_mask_mul_ps(vi0, vsign0, vi0, vw); - const __mmask16 vsign1 = _mm512_cmp_ps_mask(vi1, vzero, _CMP_LT_OQ); - const __m512 vacc1 = _mm512_mask_mul_ps(vi1, vsign1, vi1, vw); - - _mm512_mask_storeu_ps(o0, vmask, vacc0); - o0 = (float*) ((uintptr_t) o0 + c); - _mm512_mask_storeu_ps(o1, vmask, vacc1); - o1 = (float*) ((uintptr_t) o1 + c); - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-avx512f-2x32.c b/src/f32-prelu/gen/f32-prelu-avx512f-2x32.c deleted file mode 100644 index a2f64e7c6c9..00000000000 --- a/src/f32-prelu/gen/f32-prelu-avx512f-2x32.c +++ /dev/null @@ -1,125 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-prelu/avx512f.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__avx512f_2x32( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const __m512 vzero = _mm512_setzero_ps(); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 32 * sizeof(float); c -= 32 * sizeof(float)) { - const __m512 vw0123456789ABCDEF = _mm512_load_ps(w); - const __m512 vwGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 16); - w += 32; - - const __m512 vi0x0123456789ABCDEF = _mm512_loadu_ps(i0); - const __m512 vi0xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i0 + 16); - i0 += 32; - const __m512 vi1x0123456789ABCDEF = _mm512_loadu_ps(i1); - const __m512 vi1xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i1 + 16); - i1 += 32; - - const __mmask16 vsign0x0123456789ABCDEF = _mm512_cmp_ps_mask(vi0x0123456789ABCDEF, vzero, _CMP_LT_OQ); - const __m512 vacc0x0123456789ABCDEF = _mm512_mask_mul_ps(vi0x0123456789ABCDEF, vsign0x0123456789ABCDEF, vi0x0123456789ABCDEF, vw0123456789ABCDEF); - const __mmask16 vsign0xGHIJKLMNOPQRSTUV = _mm512_cmp_ps_mask(vi0xGHIJKLMNOPQRSTUV, vzero, _CMP_LT_OQ); - const __m512 vacc0xGHIJKLMNOPQRSTUV = _mm512_mask_mul_ps(vi0xGHIJKLMNOPQRSTUV, vsign0xGHIJKLMNOPQRSTUV, vi0xGHIJKLMNOPQRSTUV, vwGHIJKLMNOPQRSTUV); - const __mmask16 vsign1x0123456789ABCDEF = _mm512_cmp_ps_mask(vi1x0123456789ABCDEF, vzero, _CMP_LT_OQ); - const __m512 vacc1x0123456789ABCDEF = _mm512_mask_mul_ps(vi1x0123456789ABCDEF, vsign1x0123456789ABCDEF, vi1x0123456789ABCDEF, vw0123456789ABCDEF); - const __mmask16 vsign1xGHIJKLMNOPQRSTUV = _mm512_cmp_ps_mask(vi1xGHIJKLMNOPQRSTUV, vzero, _CMP_LT_OQ); - const __m512 vacc1xGHIJKLMNOPQRSTUV = _mm512_mask_mul_ps(vi1xGHIJKLMNOPQRSTUV, vsign1xGHIJKLMNOPQRSTUV, vi1xGHIJKLMNOPQRSTUV, vwGHIJKLMNOPQRSTUV); - - _mm512_storeu_ps(o0, vacc0x0123456789ABCDEF); - _mm512_storeu_ps(o0 + 16, vacc0xGHIJKLMNOPQRSTUV); - o0 += 32; - _mm512_storeu_ps(o1, vacc1x0123456789ABCDEF); - _mm512_storeu_ps(o1 + 16, vacc1xGHIJKLMNOPQRSTUV); - o1 += 32; - } - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const __m512 vw = _mm512_load_ps(w); - w += 16; - - const __m512 vi0 = _mm512_loadu_ps(i0); - i0 += 16; - const __m512 vi1 = _mm512_loadu_ps(i1); - i1 += 16; - - const __mmask16 vsign0 = _mm512_cmp_ps_mask(vi0, vzero, _CMP_LT_OQ); - const __m512 vacc0 = _mm512_mask_mul_ps(vi0, vsign0, vi0, vw); - const __mmask16 vsign1 = _mm512_cmp_ps_mask(vi1, vzero, _CMP_LT_OQ); - const __m512 vacc1 = _mm512_mask_mul_ps(vi1, vsign1, vi1, vw); - - _mm512_storeu_ps(o0, vacc0); - o0 += 16; - _mm512_storeu_ps(o1, vacc1); - o1 += 16; - } - if XNN_UNLIKELY(c != 0) { - assert(c >= 1 * sizeof(float)); - assert(c <= 15 * sizeof(float)); - // Prepare mask 
for valid 32-bit elements (depends on c). - const __mmask16 vmask = _cvtu32_mask16((uint32_t) (UINT32_C(1) << (c >> XNN_LOG2_SIZEOF_FLOAT)) - UINT32_C(1)); - - const __m512 vw = _mm512_maskz_loadu_ps(vmask, w); - - const __m512 vi0 = _mm512_maskz_loadu_ps(vmask, i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m512 vi1 = _mm512_maskz_loadu_ps(vmask, i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __mmask16 vsign0 = _mm512_cmp_ps_mask(vi0, vzero, _CMP_LT_OQ); - const __m512 vacc0 = _mm512_mask_mul_ps(vi0, vsign0, vi0, vw); - const __mmask16 vsign1 = _mm512_cmp_ps_mask(vi1, vzero, _CMP_LT_OQ); - const __m512 vacc1 = _mm512_mask_mul_ps(vi1, vsign1, vi1, vw); - - _mm512_mask_storeu_ps(o0, vmask, vacc0); - o0 = (float*) ((uintptr_t) o0 + c); - _mm512_mask_storeu_ps(o1, vmask, vacc1); - o1 = (float*) ((uintptr_t) o1 + c); - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-1x16.c b/src/f32-prelu/gen/f32-prelu-neon-1x16.c deleted file mode 100644 index 8a43be75c30..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-1x16.c +++ /dev/null @@ -1,109 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_1x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - const float32x4_t vw89AB = vld1q_f32(w); w += 4; - const float32x4_t vwCDEF = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0xCDEF = vld1q_f32(i0); i0 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - float32x4_t vacc0x89AB = vmulq_f32(vi0x89AB, vw89AB); - const uint32x4_t vm0x89AB = vcltq_s32(vreinterpretq_s32_f32(vi0x89AB), vmovq_n_s32(0)); - float32x4_t vacc0xCDEF = vmulq_f32(vi0xCDEF, vwCDEF); - const uint32x4_t vm0xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi0xCDEF), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - vacc0x89AB = vbslq_f32(vm0x89AB, vacc0x89AB, vi0x89AB); - vacc0xCDEF = vbslq_f32(vm0xCDEF, 
vacc0xCDEF, vi0xCDEF); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - vst1q_f32(o0, vacc0x89AB); o0 += 4; - vst1q_f32(o0, vacc0xCDEF); o0 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-1x4.c b/src/f32-prelu/gen/f32-prelu-neon-1x4.c deleted file mode 100644 index cac72266b83..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-1x4.c +++ /dev/null @@ -1,78 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_1x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-1x8.c b/src/f32-prelu/gen/f32-prelu-neon-1x8.c deleted file mode 100644 index 0674b56d772..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-1x8.c +++ /dev/null @@ -1,97 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_1x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-2x16.c b/src/f32-prelu/gen/f32-prelu-neon-2x16.c deleted file mode 100644 index 68d7f95ef22..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-2x16.c +++ /dev/null @@ -1,152 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - const float32x4_t vw89AB = vld1q_f32(w); w += 4; - const float32x4_t vwCDEF = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0xCDEF = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1xCDEF = vld1q_f32(i1); i1 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - float32x4_t vacc0x89AB = vmulq_f32(vi0x89AB, vw89AB); - const uint32x4_t vm0x89AB = vcltq_s32(vreinterpretq_s32_f32(vi0x89AB), vmovq_n_s32(0)); - float32x4_t vacc0xCDEF = vmulq_f32(vi0xCDEF, vwCDEF); - const uint32x4_t vm0xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi0xCDEF), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc1x4567 = vmulq_f32(vi1x4567, vw4567); - const uint32x4_t vm1x4567 = vcltq_s32(vreinterpretq_s32_f32(vi1x4567), vmovq_n_s32(0)); - float32x4_t vacc1x89AB = vmulq_f32(vi1x89AB, vw89AB); - const uint32x4_t vm1x89AB = vcltq_s32(vreinterpretq_s32_f32(vi1x89AB), vmovq_n_s32(0)); - float32x4_t vacc1xCDEF = vmulq_f32(vi1xCDEF, vwCDEF); - const uint32x4_t vm1xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi1xCDEF), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - vacc0x89AB = vbslq_f32(vm0x89AB, vacc0x89AB, vi0x89AB); - vacc0xCDEF = vbslq_f32(vm0xCDEF, vacc0xCDEF, vi0xCDEF); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc1x4567 = vbslq_f32(vm1x4567, vacc1x4567, vi1x4567); - vacc1x89AB = vbslq_f32(vm1x89AB, vacc1x89AB, vi1x89AB); - vacc1xCDEF = vbslq_f32(vm1xCDEF, vacc1xCDEF, vi1xCDEF); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - vst1q_f32(o0, vacc0x89AB); o0 += 4; - vst1q_f32(o0, vacc0xCDEF); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - vst1q_f32(o1, vacc1x4567); o1 += 4; - vst1q_f32(o1, vacc1x89AB); o1 += 4; - vst1q_f32(o1, vacc1xCDEF); o1 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const 
float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-2x4.c b/src/f32-prelu/gen/f32-prelu-neon-2x4.c deleted file mode 100644 index 6a0d0d42a18..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-2x4.c +++ /dev/null @@ -1,100 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-2x8.c b/src/f32-prelu/gen/f32-prelu-neon-2x8.c deleted file mode 100644 index 2077ade9aaa..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-2x8.c +++ /dev/null @@ -1,130 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc1x4567 = vmulq_f32(vi1x4567, vw4567); - const uint32x4_t vm1x4567 = vcltq_s32(vreinterpretq_s32_f32(vi1x4567), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc1x4567 = vbslq_f32(vm1x4567, vacc1x4567, vi1x4567); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - vst1q_f32(o1, vacc1x4567); o1 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = 
vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-4x16.c b/src/f32-prelu/gen/f32-prelu-neon-4x16.c deleted file mode 100644 index 531c27c58d5..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-4x16.c +++ /dev/null @@ -1,238 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_4x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - const float32x4_t vw89AB = vld1q_f32(w); w += 4; - const float32x4_t vwCDEF = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0xCDEF = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1xCDEF = vld1q_f32(i1); i1 += 4; - const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4; - const float32x4_t vi2xCDEF = vld1q_f32(i2); i2 += 4; - const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; - const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; - const float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4; - const float32x4_t 
vi3xCDEF = vld1q_f32(i3); i3 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - float32x4_t vacc0x89AB = vmulq_f32(vi0x89AB, vw89AB); - const uint32x4_t vm0x89AB = vcltq_s32(vreinterpretq_s32_f32(vi0x89AB), vmovq_n_s32(0)); - float32x4_t vacc0xCDEF = vmulq_f32(vi0xCDEF, vwCDEF); - const uint32x4_t vm0xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi0xCDEF), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc1x4567 = vmulq_f32(vi1x4567, vw4567); - const uint32x4_t vm1x4567 = vcltq_s32(vreinterpretq_s32_f32(vi1x4567), vmovq_n_s32(0)); - float32x4_t vacc1x89AB = vmulq_f32(vi1x89AB, vw89AB); - const uint32x4_t vm1x89AB = vcltq_s32(vreinterpretq_s32_f32(vi1x89AB), vmovq_n_s32(0)); - float32x4_t vacc1xCDEF = vmulq_f32(vi1xCDEF, vwCDEF); - const uint32x4_t vm1xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi1xCDEF), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc2x4567 = vmulq_f32(vi2x4567, vw4567); - const uint32x4_t vm2x4567 = vcltq_s32(vreinterpretq_s32_f32(vi2x4567), vmovq_n_s32(0)); - float32x4_t vacc2x89AB = vmulq_f32(vi2x89AB, vw89AB); - const uint32x4_t vm2x89AB = vcltq_s32(vreinterpretq_s32_f32(vi2x89AB), vmovq_n_s32(0)); - float32x4_t vacc2xCDEF = vmulq_f32(vi2xCDEF, vwCDEF); - const uint32x4_t vm2xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi2xCDEF), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - float32x4_t vacc3x4567 = vmulq_f32(vi3x4567, vw4567); - const uint32x4_t vm3x4567 = vcltq_s32(vreinterpretq_s32_f32(vi3x4567), vmovq_n_s32(0)); - float32x4_t vacc3x89AB = vmulq_f32(vi3x89AB, vw89AB); - const uint32x4_t vm3x89AB = vcltq_s32(vreinterpretq_s32_f32(vi3x89AB), vmovq_n_s32(0)); - float32x4_t vacc3xCDEF = vmulq_f32(vi3xCDEF, vwCDEF); - const uint32x4_t vm3xCDEF = vcltq_s32(vreinterpretq_s32_f32(vi3xCDEF), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - vacc0x89AB = vbslq_f32(vm0x89AB, vacc0x89AB, vi0x89AB); - vacc0xCDEF = vbslq_f32(vm0xCDEF, vacc0xCDEF, vi0xCDEF); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc1x4567 = vbslq_f32(vm1x4567, vacc1x4567, vi1x4567); - vacc1x89AB = vbslq_f32(vm1x89AB, vacc1x89AB, vi1x89AB); - vacc1xCDEF = vbslq_f32(vm1xCDEF, vacc1xCDEF, vi1xCDEF); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc2x4567 = vbslq_f32(vm2x4567, vacc2x4567, vi2x4567); - vacc2x89AB = vbslq_f32(vm2x89AB, vacc2x89AB, vi2x89AB); - vacc2xCDEF = vbslq_f32(vm2xCDEF, vacc2xCDEF, vi2xCDEF); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - vacc3x4567 = vbslq_f32(vm3x4567, vacc3x4567, vi3x4567); - vacc3x89AB = vbslq_f32(vm3x89AB, vacc3x89AB, vi3x89AB); - vacc3xCDEF = vbslq_f32(vm3xCDEF, vacc3xCDEF, vi3xCDEF); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - vst1q_f32(o0, vacc0x89AB); o0 += 4; - vst1q_f32(o0, vacc0xCDEF); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - vst1q_f32(o1, vacc1x4567); o1 += 4; - vst1q_f32(o1, 
vacc1x89AB); o1 += 4; - vst1q_f32(o1, vacc1xCDEF); o1 += 4; - vst1q_f32(o2, vacc2x0123); o2 += 4; - vst1q_f32(o2, vacc2x4567); o2 += 4; - vst1q_f32(o2, vacc2x89AB); o2 += 4; - vst1q_f32(o2, vacc2xCDEF); o2 += 4; - vst1q_f32(o3, vacc3x0123); o3 += 4; - vst1q_f32(o3, vacc3x4567); o3 += 4; - vst1q_f32(o3, vacc3x89AB); o3 += 4; - vst1q_f32(o3, vacc3xCDEF); o3 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 += 4; - const float32x4_t vi2x0123 = vld1q_f32(i2); - i2 += 4; - const float32x4_t vi3x0123 = vld1q_f32(i3); - i3 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - vst1q_f32(o2, vacc2x0123); o2 += 4; - vst1q_f32(o3, vacc3x0123); o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const float32x4_t vi2x0123 = vld1q_f32(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const float32x4_t vi3x0123 = vld1q_f32(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); - float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - vst1_f32(o2, vacc2x01); o2 += 2; - vst1_f32(o3, vacc3x01); o3 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - vacc2x01 = vget_high_f32(vacc2x0123); - vacc3x01 = vget_high_f32(vacc3x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 
0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - vst1_lane_f32(o2, vacc2x01, 0); o2 += 1; - vst1_lane_f32(o3, vacc3x01, 0); o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-4x4.c b/src/f32-prelu/gen/f32-prelu-neon-4x4.c deleted file mode 100644 index 495bde3e955..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-4x4.c +++ /dev/null @@ -1,144 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_4x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, 
vacc1x0123); o1 += 4; - vst1q_f32(o2, vacc2x0123); o2 += 4; - vst1q_f32(o3, vacc3x0123); o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const float32x4_t vi2x0123 = vld1q_f32(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const float32x4_t vi3x0123 = vld1q_f32(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); - float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - vst1_f32(o2, vacc2x01); o2 += 2; - vst1_f32(o3, vacc3x01); o3 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - vacc2x01 = vget_high_f32(vacc2x0123); - vacc3x01 = vget_high_f32(vacc3x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - vst1_lane_f32(o2, vacc2x01, 0); o2 += 1; - vst1_lane_f32(o3, vacc3x01, 0); o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-neon-4x8.c b/src/f32-prelu/gen/f32-prelu-neon-4x8.c deleted file mode 100644 index 8f14872b6bb..00000000000 --- a/src/f32-prelu/gen/f32-prelu-neon-4x8.c +++ /dev/null @@ -1,196 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/neon.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_4x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - const float32x4_t vw4567 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; - const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); - const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc1x4567 = vmulq_f32(vi1x4567, vw4567); - const uint32x4_t vm1x4567 = vcltq_s32(vreinterpretq_s32_f32(vi1x4567), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc2x4567 = vmulq_f32(vi2x4567, vw4567); - const uint32x4_t vm2x4567 = vcltq_s32(vreinterpretq_s32_f32(vi2x4567), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - float32x4_t vacc3x4567 = vmulq_f32(vi3x4567, vw4567); - const uint32x4_t vm3x4567 = vcltq_s32(vreinterpretq_s32_f32(vi3x4567), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc1x4567 = vbslq_f32(vm1x4567, vacc1x4567, vi1x4567); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc2x4567 = vbslq_f32(vm2x4567, vacc2x4567, vi2x4567); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - vacc3x4567 = vbslq_f32(vm3x4567, vacc3x4567, vi3x4567); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o0, vacc0x4567); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; 
- vst1q_f32(o1, vacc1x4567); o1 += 4; - vst1q_f32(o2, vacc2x0123); o2 += 4; - vst1q_f32(o2, vacc2x4567); o2 += 4; - vst1q_f32(o3, vacc3x0123); o3 += 4; - vst1q_f32(o3, vacc3x4567); o3 += 4; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 += 4; - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 += 4; - const float32x4_t vi2x0123 = vld1q_f32(i2); - i2 += 4; - const float32x4_t vi3x0123 = vld1q_f32(i3); - i3 += 4; - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - vst1q_f32(o0, vacc0x0123); o0 += 4; - vst1q_f32(o1, vacc1x0123); o1 += 4; - vst1q_f32(o2, vacc2x0123); o2 += 4; - vst1q_f32(o3, vacc3x0123); o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - const float32x4_t vi0x0123 = vld1q_f32(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const float32x4_t vi1x0123 = vld1q_f32(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const float32x4_t vi2x0123 = vld1q_f32(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const float32x4_t vi3x0123 = vld1q_f32(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); - const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); - float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); - const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); - float32x4_t vacc2x0123 = vmulq_f32(vi2x0123, vw0123); - const uint32x4_t vm2x0123 = vcltq_s32(vreinterpretq_s32_f32(vi2x0123), vmovq_n_s32(0)); - float32x4_t vacc3x0123 = vmulq_f32(vi3x0123, vw0123); - const uint32x4_t vm3x0123 = vcltq_s32(vreinterpretq_s32_f32(vi3x0123), vmovq_n_s32(0)); - - vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); - vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); - vacc2x0123 = vbslq_f32(vm2x0123, vacc2x0123, vi2x0123); - vacc3x0123 = vbslq_f32(vm3x0123, vacc3x0123, vi3x0123); - - float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); - float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); - float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); - float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); - if (c & (2 * sizeof(float))) { - vst1_f32(o0, vacc0x01); o0 += 2; - vst1_f32(o1, vacc1x01); o1 += 2; - vst1_f32(o2, vacc2x01); o2 += 2; - vst1_f32(o3, vacc3x01); o3 += 2; - - vacc0x01 = vget_high_f32(vacc0x0123); - vacc1x01 = vget_high_f32(vacc1x0123); - vacc2x01 = vget_high_f32(vacc2x0123); - vacc3x01 = vget_high_f32(vacc3x0123); - } - if (c & (1 * sizeof(float))) { - vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; - vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; - vst1_lane_f32(o2, vacc2x01, 0); o2 += 1; - vst1_lane_f32(o3, vacc3x01, 0); o3 += 1; - } - } - i0 = (const float*) 
((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-scalar-2x1.c b/src/f32-prelu/gen/f32-prelu-scalar-2x1.c deleted file mode 100644 index 7d1b7c3fdf2..00000000000 --- a/src/f32-prelu/gen/f32-prelu-scalar-2x1.c +++ /dev/null @@ -1,65 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__scalar_2x1( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - do { - const float vw = *w++; - - const float vi0 = *i0++; - const float vi1 = *i1++; - - const float vacc0 = XNN_UNPREDICTABLE(vi0 < 0.0f) ? vi0 * vw : vi0; - const float vacc1 = XNN_UNPREDICTABLE(vi1 < 0.0f) ? vi1 * vw : vi1; - - *o0++ = vacc0; - *o1++ = vacc1; - - c -= sizeof(float); - } while (c != 0); - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-scalar-2x4.c b/src/f32-prelu/gen/f32-prelu-scalar-2x4.c deleted file mode 100644 index b1bd371446b..00000000000 --- a/src/f32-prelu/gen/f32-prelu-scalar-2x4.c +++ /dev/null @@ -1,102 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/scalar.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree.
- -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__scalar_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float vw0 = w[0]; - const float vw1 = w[1]; - const float vw2 = w[2]; - const float vw3 = w[3]; - - const float vi0x0 = i0[0]; - const float vi0x1 = i0[1]; - const float vi0x2 = i0[2]; - const float vi0x3 = i0[3]; - i0 += 4; - const float vi1x0 = i1[0]; - const float vi1x1 = i1[1]; - const float vi1x2 = i1[2]; - const float vi1x3 = i1[3]; - i1 += 4; - - const float vacc0x0 = XNN_UNPREDICTABLE(vi0x0 < 0.0f) ? vi0x0 * vw0 : vi0x0; - const float vacc0x1 = XNN_UNPREDICTABLE(vi0x1 < 0.0f) ? vi0x1 * vw1 : vi0x1; - const float vacc0x2 = XNN_UNPREDICTABLE(vi0x2 < 0.0f) ? vi0x2 * vw2 : vi0x2; - const float vacc0x3 = XNN_UNPREDICTABLE(vi0x3 < 0.0f) ? vi0x3 * vw3 : vi0x3; - const float vacc1x0 = XNN_UNPREDICTABLE(vi1x0 < 0.0f) ? vi1x0 * vw0 : vi1x0; - const float vacc1x1 = XNN_UNPREDICTABLE(vi1x1 < 0.0f) ? vi1x1 * vw1 : vi1x1; - const float vacc1x2 = XNN_UNPREDICTABLE(vi1x2 < 0.0f) ? vi1x2 * vw2 : vi1x2; - const float vacc1x3 = XNN_UNPREDICTABLE(vi1x3 < 0.0f) ? vi1x3 * vw3 : vi1x3; - - o0[0] = vacc0x0; - o0[1] = vacc0x1; - o0[2] = vacc0x2; - o0[3] = vacc0x3; - o0 += 4; - o1[0] = vacc1x0; - o1[1] = vacc1x1; - o1[2] = vacc1x2; - o1[3] = vacc1x3; - o1 += 4; - - w += 4; - } - for (; c != 0; c -= sizeof(float)) { - const float vw = *w++; - - const float vi0 = *i0++; - const float vi1 = *i1++; - - const float vacc0 = XNN_UNPREDICTABLE(vi0 < 0.0f) ? vi0 * vw : vi0; - const float vacc1 = XNN_UNPREDICTABLE(vi1 < 0.0f) ? vi1 * vw : vi1; - - *o0++ = vacc0; - *o1++ = vacc1; - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse-2x4.c b/src/f32-prelu/gen/f32-prelu-sse-2x4.c deleted file mode 100644 index f3a5063f78d..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse-2x4.c +++ /dev/null @@ -1,111 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const __m128 vzero = _mm_setzero_ps(); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - __m128 vacc0x0123 = _mm_max_ps(_mm_setzero_ps(), vi0x0123); - vi0x0123 = _mm_min_ps(vi0x0123, vzero); - __m128 vacc1x0123 = _mm_max_ps(_mm_setzero_ps(), vi1x0123); - vi1x0123 = _mm_min_ps(vi1x0123, vzero); - - vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(vi0x0123, vw0123)); - vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(vi1x0123, vw0123)); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - __m128 vacc0x0123 = _mm_max_ps(_mm_setzero_ps(), vi0x0123); - vi0x0123 = _mm_min_ps(vi0x0123, vzero); - __m128 vacc1x0123 = _mm_max_ps(_mm_setzero_ps(), vi1x0123); - vi1x0123 = _mm_min_ps(vi1x0123, vzero); - - vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(vi0x0123, vw0123)); - vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(vi1x0123, vw0123)); - - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse-2x8.c b/src/f32-prelu/gen/f32-prelu-sse-2x8.c deleted file mode 100644 index b361d5a293c..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse-2x8.c +++ /dev/null @@ -1,144 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const __m128 vzero = _mm_setzero_ps(); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - const __m128 vw4567 = _mm_load_ps(w + 4); - w += 8; - - __m128 vi0x0123 = _mm_loadu_ps(i0); - __m128 vi0x4567 = _mm_loadu_ps(i0 + 4); - i0 += 8; - __m128 vi1x0123 = _mm_loadu_ps(i1); - __m128 vi1x4567 = _mm_loadu_ps(i1 + 4); - i1 += 8; - - __m128 vacc0x0123 = _mm_max_ps(_mm_setzero_ps(), vi0x0123); - vi0x0123 = _mm_min_ps(vi0x0123, vzero); - __m128 vacc0x4567 = _mm_max_ps(_mm_setzero_ps(), vi0x4567); - vi0x4567 = _mm_min_ps(vi0x4567, vzero); - __m128 vacc1x0123 = _mm_max_ps(_mm_setzero_ps(), vi1x0123); - vi1x0123 = _mm_min_ps(vi1x0123, vzero); - __m128 vacc1x4567 = _mm_max_ps(_mm_setzero_ps(), vi1x4567); - vi1x4567 = _mm_min_ps(vi1x4567, vzero); - - vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(vi0x0123, vw0123)); - vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(vi0x4567, vw4567)); - vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(vi1x0123, vw0123)); - vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(vi1x4567, vw4567)); - - _mm_storeu_ps(o0, vacc0x0123); - _mm_storeu_ps(o0 + 4, vacc0x4567); - o0 += 8; - _mm_storeu_ps(o1, vacc1x0123); - _mm_storeu_ps(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - __m128 vacc0x0123 = _mm_max_ps(_mm_setzero_ps(), vi0x0123); - vi0x0123 = _mm_min_ps(vi0x0123, vzero); - __m128 vacc1x0123 = _mm_max_ps(_mm_setzero_ps(), vi1x0123); - vi1x0123 = _mm_min_ps(vi1x0123, vzero); - - vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(vi0x0123, vw0123)); - vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(vi1x0123, vw0123)); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - __m128 vacc0x0123 = _mm_max_ps(_mm_setzero_ps(), vi0x0123); - vi0x0123 = _mm_min_ps(vi0x0123, vzero); - __m128 vacc1x0123 = _mm_max_ps(_mm_setzero_ps(), vi1x0123); - vi1x0123 = _mm_min_ps(vi1x0123, vzero); - - vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(vi0x0123, vw0123)); - vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(vi1x0123, vw0123)); - - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = 
_mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse2-2x4.c b/src/f32-prelu/gen/f32-prelu-sse2-2x4.c deleted file mode 100644 index a4d3ca533da..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse2-2x4.c +++ /dev/null @@ -1,110 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse2_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123))); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123))); - - const __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123)); - const __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123)); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123))); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123))); - - __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123)); - __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123)); - - if (c & (2 
* sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse2-2x8.c b/src/f32-prelu/gen/f32-prelu-sse2-2x8.c deleted file mode 100644 index 87f3ebbd99f..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse2-2x8.c +++ /dev/null @@ -1,143 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse2_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - const __m128 vw4567 = _mm_load_ps(w + 4); - w += 8; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4); - i0 += 8; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4); - i1 += 8; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123))); - const __m128 vprod0x4567 = _mm_mul_ps(vi0x4567, vw4567); - const __m128 vmask0x4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x4567))); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123))); - const __m128 vprod1x4567 = _mm_mul_ps(vi1x4567, vw4567); - const __m128 vmask1x4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x4567))); - - const __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123)); - const __m128 vacc0x4567 = _mm_or_ps(_mm_and_ps(vprod0x4567, vmask0x4567), _mm_andnot_ps(vmask0x4567, vi0x4567)); - const __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123)); - const __m128 vacc1x4567 = _mm_or_ps(_mm_and_ps(vprod1x4567, vmask1x4567), _mm_andnot_ps(vmask1x4567, vi1x4567)); - - _mm_storeu_ps(o0, vacc0x0123); - _mm_storeu_ps(o0 + 4, vacc0x4567); - 
o0 += 8; - _mm_storeu_ps(o1, vacc1x0123); - _mm_storeu_ps(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123))); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123))); - - __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123)); - __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123)); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123))); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123))); - - __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123)); - __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123)); - - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse41-2x4.c b/src/f32-prelu/gen/f32-prelu-sse41-2x4.c deleted file mode 100644 index 1bcec873fe7..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse41-2x4.c +++ /dev/null @@ -1,106 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse41_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - - const __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123); - const __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - - __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123); - __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123); - - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-sse41-2x8.c b/src/f32-prelu/gen/f32-prelu-sse41-2x8.c deleted file mode 100644 index 21548d5c193..00000000000 --- a/src/f32-prelu/gen/f32-prelu-sse41-2x8.c +++ /dev/null @@ -1,135 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/sse.c.in -// Generator: tools/xngen -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__sse41_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - const __m128 vw4567 = _mm_load_ps(w + 4); - w += 8; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4); - i0 += 8; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4); - i1 += 8; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vprod0x4567 = _mm_mul_ps(vi0x4567, vw4567); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - const __m128 vprod1x4567 = _mm_mul_ps(vi1x4567, vw4567); - - const __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123); - const __m128 vacc0x4567 = _mm_blendv_ps(vi0x4567, vprod0x4567, vi0x4567); - const __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123); - const __m128 vacc1x4567 = _mm_blendv_ps(vi1x4567, vprod1x4567, vi1x4567); - - _mm_storeu_ps(o0, vacc0x0123); - _mm_storeu_ps(o0 + 4, vacc0x4567); - o0 += 8; - _mm_storeu_ps(o1, vacc1x0123); - _mm_storeu_ps(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 += 4; - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - - __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123); - __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123); - - _mm_storeu_ps(o0, vacc0x0123); - o0 += 4; - _mm_storeu_ps(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - const __m128 vi0x0123 = _mm_loadu_ps(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const __m128 vi1x0123 = _mm_loadu_ps(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123); - const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123); - - __m128 vacc0x0123 = _mm_blendv_ps(vi0x0123, vprod0x0123, vi0x0123); - __m128 vacc1x0123 = _mm_blendv_ps(vi1x0123, vprod1x0123, vi1x0123); - - if (c & (2 * sizeof(float))) { - _mm_storel_pi((__m64*) o0, vacc0x0123); - _mm_storel_pi((__m64*) o1, vacc1x0123); - - vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123); - vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - _mm_store_ss(o0, vacc0x0123); - _mm_store_ss(o1, vacc1x0123); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) 
((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasm-2x1.c b/src/f32-prelu/gen/f32-prelu-wasm-2x1.c deleted file mode 100644 index f4d21934848..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasm-2x1.c +++ /dev/null @@ -1,71 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasm.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasm_2x1( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const float vzero = 0.0f; - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - do { - const float vw = *w++; - - float vi0 = *i0++; - float vi1 = *i1++; - - float vacc0 = __builtin_wasm_max_f32(vi0, vzero); - vi0 = __builtin_wasm_min_f32(vi0, vzero); - float vacc1 = __builtin_wasm_max_f32(vi1, vzero); - vi1 = __builtin_wasm_min_f32(vi1, vzero); - - vacc0 += vi0 * vw; - vacc1 += vi1 * vw; - - *o0++ = vacc0; - *o1++ = vacc1; - - c -= sizeof(float); - } while (c != 0); - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasm-2x4.c b/src/f32-prelu/gen/f32-prelu-wasm-2x4.c deleted file mode 100644 index beee640db7f..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasm-2x4.c +++ /dev/null @@ -1,125 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasm.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasm_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const float vzero = 0.0f; - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float vw0 = w[0]; - const float vw1 = w[1]; - const float vw2 = w[2]; - const float vw3 = w[3]; - - float vi0x0 = i0[0]; - float vi0x1 = i0[1]; - float vi0x2 = i0[2]; - float vi0x3 = i0[3]; - i0 += 4; - float vi1x0 = i1[0]; - float vi1x1 = i1[1]; - float vi1x2 = i1[2]; - float vi1x3 = i1[3]; - i1 += 4; - - float vacc0x0 = __builtin_wasm_max_f32(vi0x0, vzero); - vi0x0 = __builtin_wasm_min_f32(vi0x0, vzero); - float vacc0x1 = __builtin_wasm_max_f32(vi0x1, vzero); - vi0x1 = __builtin_wasm_min_f32(vi0x1, vzero); - float vacc0x2 = __builtin_wasm_max_f32(vi0x2, vzero); - vi0x2 = __builtin_wasm_min_f32(vi0x2, vzero); - float vacc0x3 = __builtin_wasm_max_f32(vi0x3, vzero); - vi0x3 = __builtin_wasm_min_f32(vi0x3, vzero); - float vacc1x0 = __builtin_wasm_max_f32(vi1x0, vzero); - vi1x0 = __builtin_wasm_min_f32(vi1x0, vzero); - float vacc1x1 = __builtin_wasm_max_f32(vi1x1, vzero); - vi1x1 = __builtin_wasm_min_f32(vi1x1, vzero); - float vacc1x2 = __builtin_wasm_max_f32(vi1x2, vzero); - vi1x2 = __builtin_wasm_min_f32(vi1x2, vzero); - float vacc1x3 = __builtin_wasm_max_f32(vi1x3, vzero); - vi1x3 = __builtin_wasm_min_f32(vi1x3, vzero); - - vacc0x0 += vi0x0 * vw0; - vacc0x1 += vi0x1 * vw1; - vacc0x2 += vi0x2 * vw2; - vacc0x3 += vi0x3 * vw3; - vacc1x0 += vi1x0 * vw0; - vacc1x1 += vi1x1 * vw1; - vacc1x2 += vi1x2 * vw2; - vacc1x3 += vi1x3 * vw3; - - o0[0] = vacc0x0; - o0[1] = vacc0x1; - o0[2] = vacc0x2; - o0[3] = vacc0x3; - o0 += 4; - o1[0] = vacc1x0; - o1[1] = vacc1x1; - o1[2] = vacc1x2; - o1[3] = vacc1x3; - o1 += 4; - - w += 4; - } - for (; c != 0; c -= sizeof(float)) { - const float vw = *w++; - - float vi0 = *i0++; - float vi1 = *i1++; - - float vacc0 = __builtin_wasm_max_f32(vi0, vzero); - vi0 = __builtin_wasm_min_f32(vi0, vzero); - float vacc1 = __builtin_wasm_max_f32(vi1, vzero); - vi1 = __builtin_wasm_min_f32(vi1, vzero); - - vacc0 += vi0 * vw; - vacc1 += vi1 * vw; - - *o0++ = vacc0; - *o1++ = vacc1; - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c deleted file mode 100644 index c3f3c18d6a6..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c +++ /dev/null @@ -1,119 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - vacc0x89AB = wasm_f32x4_relaxed_madd(vi0x89AB, vw89AB, vacc0x89AB); - vacc0xCDEF = wasm_f32x4_relaxed_madd(vi0xCDEF, vwCDEF, vacc0xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c 
b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c deleted file mode 100644 index 36bb65e0b3c..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c +++ /dev/null @@ -1,86 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c deleted file mode 100644 index 5fd22f53efa..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c +++ /dev/null @@ -1,107 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c deleted file mode 100644 index 473d4b5273e..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c +++ /dev/null @@ -1,166 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - v128_t vi1x89AB = wasm_v128_load(i1 + 8); - v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc1x89AB = wasm_i32x4_max(vi1x89AB, vzero); - vi1x89AB = wasm_i32x4_min(vi1x89AB, vzero); - v128_t vacc1xCDEF = wasm_i32x4_max(vi1xCDEF, vzero); - vi1xCDEF = wasm_i32x4_min(vi1xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - vacc0x89AB = wasm_f32x4_relaxed_madd(vi0x89AB, vw89AB, vacc0x89AB); - vacc0xCDEF = wasm_f32x4_relaxed_madd(vi0xCDEF, vwCDEF, vacc0xCDEF); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc1x4567 = wasm_f32x4_relaxed_madd(vi1x4567, vw4567, vacc1x4567); - vacc1x89AB = wasm_f32x4_relaxed_madd(vi1x89AB, vw89AB, vacc1x89AB); - vacc1xCDEF = wasm_f32x4_relaxed_madd(vi1xCDEF, vwCDEF, vacc1xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - 
v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c deleted file mode 100644 index ddf7efb3685..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c +++ /dev/null @@ -1,111 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c deleted file mode 100644 index cd4ce1f33e7..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c +++ /dev/null @@ -1,144 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc1x4567 = wasm_f32x4_relaxed_madd(vi1x4567, vw4567, vacc1x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = 
wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c deleted file mode 100644 index 084aa6997e3..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c +++ /dev/null @@ -1,260 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - v128_t vi1x89AB = wasm_v128_load(i1 + 8); - v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - v128_t vi2x0123 = wasm_v128_load(i2); - v128_t vi2x4567 = wasm_v128_load(i2 + 4); - v128_t vi2x89AB = wasm_v128_load(i2 + 8); - v128_t vi2xCDEF = wasm_v128_load(i2 + 12); - i2 += 16; - v128_t vi3x0123 = wasm_v128_load(i3); - v128_t vi3x4567 = wasm_v128_load(i3 + 4); - v128_t vi3x89AB = wasm_v128_load(i3 + 8); - v128_t vi3xCDEF = wasm_v128_load(i3 + 12); - i3 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = 
wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc1x89AB = wasm_i32x4_max(vi1x89AB, vzero); - vi1x89AB = wasm_i32x4_min(vi1x89AB, vzero); - v128_t vacc1xCDEF = wasm_i32x4_max(vi1xCDEF, vzero); - vi1xCDEF = wasm_i32x4_min(vi1xCDEF, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc2x4567 = wasm_i32x4_max(vi2x4567, vzero); - vi2x4567 = wasm_i32x4_min(vi2x4567, vzero); - v128_t vacc2x89AB = wasm_i32x4_max(vi2x89AB, vzero); - vi2x89AB = wasm_i32x4_min(vi2x89AB, vzero); - v128_t vacc2xCDEF = wasm_i32x4_max(vi2xCDEF, vzero); - vi2xCDEF = wasm_i32x4_min(vi2xCDEF, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - v128_t vacc3x4567 = wasm_i32x4_max(vi3x4567, vzero); - vi3x4567 = wasm_i32x4_min(vi3x4567, vzero); - v128_t vacc3x89AB = wasm_i32x4_max(vi3x89AB, vzero); - vi3x89AB = wasm_i32x4_min(vi3x89AB, vzero); - v128_t vacc3xCDEF = wasm_i32x4_max(vi3xCDEF, vzero); - vi3xCDEF = wasm_i32x4_min(vi3xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - vacc0x89AB = wasm_f32x4_relaxed_madd(vi0x89AB, vw89AB, vacc0x89AB); - vacc0xCDEF = wasm_f32x4_relaxed_madd(vi0xCDEF, vwCDEF, vacc0xCDEF); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc1x4567 = wasm_f32x4_relaxed_madd(vi1x4567, vw4567, vacc1x4567); - vacc1x89AB = wasm_f32x4_relaxed_madd(vi1x89AB, vw89AB, vacc1x89AB); - vacc1xCDEF = wasm_f32x4_relaxed_madd(vi1xCDEF, vwCDEF, vacc1xCDEF); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc2x4567 = wasm_f32x4_relaxed_madd(vi2x4567, vw4567, vacc2x4567); - vacc2x89AB = wasm_f32x4_relaxed_madd(vi2x89AB, vw89AB, vacc2x89AB); - vacc2xCDEF = wasm_f32x4_relaxed_madd(vi2xCDEF, vwCDEF, vacc2xCDEF); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - vacc3x4567 = wasm_f32x4_relaxed_madd(vi3x4567, vw4567, vacc3x4567); - vacc3x89AB = wasm_f32x4_relaxed_madd(vi3x89AB, vw89AB, vacc3x89AB); - vacc3xCDEF = wasm_f32x4_relaxed_madd(vi3xCDEF, vwCDEF, vacc3xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - wasm_v128_store(o2 + 8, vacc2x89AB); - wasm_v128_store(o2 + 12, vacc2xCDEF); - o2 += 16; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - wasm_v128_store(o3 + 8, vacc3x89AB); - wasm_v128_store(o3 + 12, vacc3xCDEF); - o3 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - 
v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c deleted file mode 100644 index 2606faa27e4..00000000000 --- 
a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c +++ /dev/null @@ -1,161 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - 
vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c deleted file mode 100644 index 1a4595e0c98..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c +++ /dev/null @@ -1,218 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - v128_t vi2x0123 = wasm_v128_load(i2); - v128_t vi2x4567 = wasm_v128_load(i2 + 4); - i2 += 8; - v128_t vi3x0123 = wasm_v128_load(i3); - v128_t vi3x4567 = wasm_v128_load(i3 + 4); - i3 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc2x4567 = wasm_i32x4_max(vi2x4567, vzero); - vi2x4567 = wasm_i32x4_min(vi2x4567, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - v128_t vacc3x4567 = wasm_i32x4_max(vi3x4567, vzero); - vi3x4567 = wasm_i32x4_min(vi3x4567, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc0x4567 = wasm_f32x4_relaxed_madd(vi0x4567, vw4567, vacc0x4567); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc1x4567 = wasm_f32x4_relaxed_madd(vi1x4567, vw4567, vacc1x4567); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc2x4567 = wasm_f32x4_relaxed_madd(vi2x4567, vw4567, vacc2x4567); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - vacc3x4567 = wasm_f32x4_relaxed_madd(vi3x4567, vw4567, vacc3x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - o2 += 8; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - o3 += 8; - } - for (; c >= 4 * 
sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_relaxed_madd(vi0x0123, vw0123, vacc0x0123); - vacc1x0123 = wasm_f32x4_relaxed_madd(vi1x0123, vw0123, vacc1x0123); - vacc2x0123 = wasm_f32x4_relaxed_madd(vi2x0123, vw0123, vacc2x0123); - vacc3x0123 = wasm_f32x4_relaxed_madd(vi3x0123, vw0123, vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); 
- rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c deleted file mode 100644 index 974dab42c07..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c +++ /dev/null @@ -1,118 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_i32x4_relaxed_laneselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_i32x4_relaxed_laneselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - 
- if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c deleted file mode 100644 index 88c927a6504..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c deleted file mode 100644 index 5e0deffdbcb..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c +++ /dev/null @@ -1,106 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c deleted file mode 100644 index dad82d1178e..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c +++ /dev/null @@ -1,165 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - const v128_t vi1x89AB = wasm_v128_load(i1 + 8); - const v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc1x89AB = wasm_f32x4_mul(vi1x89AB, vw89AB); - const v128_t vmask1x89AB = wasm_i32x4_shr(vi1x89AB, 31); - v128_t vacc1xCDEF = wasm_f32x4_mul(vi1xCDEF, vwCDEF); - const v128_t vmask1xCDEF = wasm_i32x4_shr(vi1xCDEF, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_i32x4_relaxed_laneselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_i32x4_relaxed_laneselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_i32x4_relaxed_laneselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc1x89AB = wasm_i32x4_relaxed_laneselect(vacc1x89AB, vi1x89AB, vmask1x89AB); - vacc1xCDEF = wasm_i32x4_relaxed_laneselect(vacc1xCDEF, vi1xCDEF, vmask1xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - 
const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c deleted file mode 100644 index fc94f57f6ef..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c +++ /dev/null @@ -1,110 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c deleted file mode 100644 index 22ba498bae0..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c +++ /dev/null @@ -1,143 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_i32x4_relaxed_laneselect(vacc1x4567, vi1x4567, vmask1x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = 
wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c deleted file mode 100644 index 029d5b5aad1..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c +++ /dev/null @@ -1,259 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - const v128_t vi1x89AB = wasm_v128_load(i1 + 8); - const v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - const v128_t vi2x0123 = wasm_v128_load(i2); - const v128_t vi2x4567 = wasm_v128_load(i2 + 4); - const v128_t vi2x89AB = wasm_v128_load(i2 + 8); - const v128_t vi2xCDEF = wasm_v128_load(i2 + 12); - i2 += 16; - const v128_t vi3x0123 = wasm_v128_load(i3); - const v128_t vi3x4567 = 
wasm_v128_load(i3 + 4); - const v128_t vi3x89AB = wasm_v128_load(i3 + 8); - const v128_t vi3xCDEF = wasm_v128_load(i3 + 12); - i3 += 16; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc1x89AB = wasm_f32x4_mul(vi1x89AB, vw89AB); - const v128_t vmask1x89AB = wasm_i32x4_shr(vi1x89AB, 31); - v128_t vacc1xCDEF = wasm_f32x4_mul(vi1xCDEF, vwCDEF); - const v128_t vmask1xCDEF = wasm_i32x4_shr(vi1xCDEF, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc2x4567 = wasm_f32x4_mul(vi2x4567, vw4567); - const v128_t vmask2x4567 = wasm_i32x4_shr(vi2x4567, 31); - v128_t vacc2x89AB = wasm_f32x4_mul(vi2x89AB, vw89AB); - const v128_t vmask2x89AB = wasm_i32x4_shr(vi2x89AB, 31); - v128_t vacc2xCDEF = wasm_f32x4_mul(vi2xCDEF, vwCDEF); - const v128_t vmask2xCDEF = wasm_i32x4_shr(vi2xCDEF, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - v128_t vacc3x4567 = wasm_f32x4_mul(vi3x4567, vw4567); - const v128_t vmask3x4567 = wasm_i32x4_shr(vi3x4567, 31); - v128_t vacc3x89AB = wasm_f32x4_mul(vi3x89AB, vw89AB); - const v128_t vmask3x89AB = wasm_i32x4_shr(vi3x89AB, 31); - v128_t vacc3xCDEF = wasm_f32x4_mul(vi3xCDEF, vwCDEF); - const v128_t vmask3xCDEF = wasm_i32x4_shr(vi3xCDEF, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_i32x4_relaxed_laneselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_i32x4_relaxed_laneselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_i32x4_relaxed_laneselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc1x89AB = wasm_i32x4_relaxed_laneselect(vacc1x89AB, vi1x89AB, vmask1x89AB); - vacc1xCDEF = wasm_i32x4_relaxed_laneselect(vacc1xCDEF, vi1xCDEF, vmask1xCDEF); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc2x4567 = wasm_i32x4_relaxed_laneselect(vacc2x4567, vi2x4567, vmask2x4567); - vacc2x89AB = wasm_i32x4_relaxed_laneselect(vacc2x89AB, vi2x89AB, vmask2x89AB); - vacc2xCDEF = wasm_i32x4_relaxed_laneselect(vacc2xCDEF, vi2xCDEF, vmask2xCDEF); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - vacc3x4567 = wasm_i32x4_relaxed_laneselect(vacc3x4567, vi3x4567, vmask3x4567); - vacc3x89AB = wasm_i32x4_relaxed_laneselect(vacc3x89AB, vi3x89AB, vmask3x89AB); - vacc3xCDEF = wasm_i32x4_relaxed_laneselect(vacc3xCDEF, vi3xCDEF, vmask3xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, 
vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - wasm_v128_store(o2 + 8, vacc2x89AB); - wasm_v128_store(o2 + 12, vacc2xCDEF); - o2 += 16; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - wasm_v128_store(o3 + 8, vacc3x89AB); - wasm_v128_store(o3 + 12, vacc3xCDEF); - o3 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - 
wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c deleted file mode 100644 index c5a1998e223..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c +++ /dev/null @@ -1,160 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, 
vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c b/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c deleted file mode 100644 index 31a49e32df4..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c +++ /dev/null @@ -1,217 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
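// Reference sketch (not from the deleted sources; name is hypothetical): the
// f32-prelu *wasmrelaxedsimd laneselect* ukernels removed in this diff all
// compute the same per-element function. Each lane evaluates x * w, derives an
// all-ones mask from the sign bit of x (wasm_i32x4_shr(x, 31)), and uses
// wasm_i32x4_relaxed_laneselect to keep x where it is non-negative and x * w
// where it is negative. A minimal scalar equivalent, with `channels` as an
// element count (the ukernels take it in bytes) and byte strides as above:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void prelu_laneselect_reference(size_t rows, size_t channels,
                                       const float* input, size_t input_stride,
                                       const float* weights,
                                       float* output, size_t output_stride) {
  for (size_t r = 0; r < rows; r++) {
    const float* x = (const float*) ((const char*) input + r * input_stride);
    float* y = (float*) ((char*) output + r * output_stride);
    for (size_t c = 0; c < channels; c++) {
      uint32_t bits;
      memcpy(&bits, &x[c], sizeof(bits));
      const uint32_t sign = bits >> 31;  // 1 if x is negative (or -0.0f)
      // Negative lanes take x * w, non-negative lanes pass x through.
      y[c] = sign ? x[c] * weights[c] : x[c];
    }
  }
}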
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - const v128_t vi2x0123 = wasm_v128_load(i2); - const v128_t vi2x4567 = wasm_v128_load(i2 + 4); - i2 += 8; - const v128_t vi3x0123 = wasm_v128_load(i3); - const v128_t vi3x4567 = wasm_v128_load(i3 + 4); - i3 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc2x4567 = wasm_f32x4_mul(vi2x4567, vw4567); - const v128_t vmask2x4567 = wasm_i32x4_shr(vi2x4567, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - v128_t vacc3x4567 = wasm_f32x4_mul(vi3x4567, vw4567); - const v128_t vmask3x4567 = wasm_i32x4_shr(vi3x4567, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_i32x4_relaxed_laneselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_i32x4_relaxed_laneselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc2x4567 = wasm_i32x4_relaxed_laneselect(vacc2x4567, vi2x4567, vmask2x4567); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - vacc3x4567 = wasm_i32x4_relaxed_laneselect(vacc3x4567, vi3x4567, vmask3x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; 
- wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - o2 += 8; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - o3 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_i32x4_relaxed_laneselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_i32x4_relaxed_laneselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_i32x4_relaxed_laneselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_i32x4_relaxed_laneselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - 
i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c deleted file mode 100644 index 7aba563c465..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c +++ /dev/null @@ -1,119 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - vacc0x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi0x89AB, vw89AB), vacc0x89AB); - vacc0xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi0xCDEF, vwCDEF), vacc0xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = 
(const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c deleted file mode 100644 index 56c78fcc29a..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c +++ /dev/null @@ -1,86 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c deleted file mode 100644 index 221d1f4a79c..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c +++ /dev/null @@ -1,107 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c deleted file mode 100644 index 4980c8327cc..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c +++ /dev/null @@ -1,166 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
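// Reference sketch (not from the deleted sources; name is hypothetical): the
// f32-prelu *wasmsimd iminmax* ukernels removed in this diff avoid a lane
// select. They apply integer max/min against zero to the raw float bit
// patterns, which splits each lane by its sign: acc = wasm_i32x4_max(x, 0)
// keeps non-negative lanes, x = wasm_i32x4_min(x, 0) keeps negative lanes,
// and the result is acc + x * w. A scalar sketch of the same arithmetic for
// ordinary values (signed zeros and NaNs aside), with a per-row element count:

#include <stddef.h>

static void prelu_iminmax_reference(size_t n, const float* x,
                                    const float* w, float* y) {
  for (size_t i = 0; i < n; i++) {
    const float pos = x[i] > 0.0f ? x[i] : 0.0f;  // wasm_i32x4_max(x, 0) per lane
    const float neg = x[i] < 0.0f ? x[i] : 0.0f;  // wasm_i32x4_min(x, 0) per lane
    y[i] = neg * w[i] + pos;                      // multiply then add, as in the kernel
  }
}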
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - v128_t vi1x89AB = wasm_v128_load(i1 + 8); - v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc1x89AB = wasm_i32x4_max(vi1x89AB, vzero); - vi1x89AB = wasm_i32x4_min(vi1x89AB, vzero); - v128_t vacc1xCDEF = wasm_i32x4_max(vi1xCDEF, vzero); - vi1xCDEF = wasm_i32x4_min(vi1xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - vacc0x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi0x89AB, vw89AB), vacc0x89AB); - vacc0xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi0xCDEF, vwCDEF), vacc0xCDEF); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vw4567), vacc1x4567); - vacc1x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi1x89AB, vw89AB), vacc1x89AB); - vacc1xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi1xCDEF, vwCDEF), vacc1xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); 
- vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c deleted file mode 100644 index d1117ad4b3b..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c +++ /dev/null @@ -1,111 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c deleted file mode 100644 index a293e9533e6..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c +++ /dev/null @@ -1,144 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vw4567), vacc1x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - 
wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c deleted file mode 100644 index 0d01bf6fef6..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c +++ /dev/null @@ -1,260 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - v128_t vi0x89AB = wasm_v128_load(i0 + 8); - v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - v128_t vi1x89AB = wasm_v128_load(i1 + 8); - v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - v128_t vi2x0123 = wasm_v128_load(i2); - v128_t vi2x4567 = wasm_v128_load(i2 + 4); - v128_t vi2x89AB = wasm_v128_load(i2 + 8); - v128_t vi2xCDEF = wasm_v128_load(i2 + 12); - i2 += 16; - v128_t vi3x0123 = wasm_v128_load(i3); - v128_t vi3x4567 = wasm_v128_load(i3 + 4); - v128_t vi3x89AB = wasm_v128_load(i3 + 8); - v128_t vi3xCDEF = wasm_v128_load(i3 + 12); - i3 += 16; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t 
vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc0x89AB = wasm_i32x4_max(vi0x89AB, vzero); - vi0x89AB = wasm_i32x4_min(vi0x89AB, vzero); - v128_t vacc0xCDEF = wasm_i32x4_max(vi0xCDEF, vzero); - vi0xCDEF = wasm_i32x4_min(vi0xCDEF, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc1x89AB = wasm_i32x4_max(vi1x89AB, vzero); - vi1x89AB = wasm_i32x4_min(vi1x89AB, vzero); - v128_t vacc1xCDEF = wasm_i32x4_max(vi1xCDEF, vzero); - vi1xCDEF = wasm_i32x4_min(vi1xCDEF, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc2x4567 = wasm_i32x4_max(vi2x4567, vzero); - vi2x4567 = wasm_i32x4_min(vi2x4567, vzero); - v128_t vacc2x89AB = wasm_i32x4_max(vi2x89AB, vzero); - vi2x89AB = wasm_i32x4_min(vi2x89AB, vzero); - v128_t vacc2xCDEF = wasm_i32x4_max(vi2xCDEF, vzero); - vi2xCDEF = wasm_i32x4_min(vi2xCDEF, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - v128_t vacc3x4567 = wasm_i32x4_max(vi3x4567, vzero); - vi3x4567 = wasm_i32x4_min(vi3x4567, vzero); - v128_t vacc3x89AB = wasm_i32x4_max(vi3x89AB, vzero); - vi3x89AB = wasm_i32x4_min(vi3x89AB, vzero); - v128_t vacc3xCDEF = wasm_i32x4_max(vi3xCDEF, vzero); - vi3xCDEF = wasm_i32x4_min(vi3xCDEF, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - vacc0x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi0x89AB, vw89AB), vacc0x89AB); - vacc0xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi0xCDEF, vwCDEF), vacc0xCDEF); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vw4567), vacc1x4567); - vacc1x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi1x89AB, vw89AB), vacc1x89AB); - vacc1xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi1xCDEF, vwCDEF), vacc1xCDEF); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vw4567), vacc2x4567); - vacc2x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi2x89AB, vw89AB), vacc2x89AB); - vacc2xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi2xCDEF, vwCDEF), vacc2xCDEF); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi3x4567, vw4567), vacc3x4567); - vacc3x89AB = wasm_f32x4_add(wasm_f32x4_mul(vi3x89AB, vw89AB), vacc3x89AB); - vacc3xCDEF = wasm_f32x4_add(wasm_f32x4_mul(vi3xCDEF, vwCDEF), vacc3xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - wasm_v128_store(o2 + 8, vacc2x89AB); - wasm_v128_store(o2 + 12, vacc2xCDEF); - o2 += 16; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - wasm_v128_store(o3 + 8, vacc3x89AB); - wasm_v128_store(o3 + 12, vacc3xCDEF); - o3 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = 
wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git 
a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c deleted file mode 100644 index 97e712636f5..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c +++ /dev/null @@ -1,161 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = 
wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c deleted file mode 100644 index a62ad1e8412..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c +++ /dev/null @@ -1,218 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-iminmax.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - v128_t vi0x0123 = wasm_v128_load(i0); - v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - v128_t vi1x0123 = wasm_v128_load(i1); - v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - v128_t vi2x0123 = wasm_v128_load(i2); - v128_t vi2x4567 = wasm_v128_load(i2 + 4); - i2 += 8; - v128_t vi3x0123 = wasm_v128_load(i3); - v128_t vi3x4567 = wasm_v128_load(i3 + 4); - i3 += 8; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); - vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); - vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc2x4567 = wasm_i32x4_max(vi2x4567, vzero); - vi2x4567 = wasm_i32x4_min(vi2x4567, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - v128_t vacc3x4567 = wasm_i32x4_max(vi3x4567, vzero); - vi3x4567 = wasm_i32x4_min(vi3x4567, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vw4567), vacc1x4567); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vw4567), vacc2x4567); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi3x4567, vw4567), vacc3x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - o2 += 8; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, 
vacc3x4567); - o3 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); - vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); - v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); - vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); - v128_t vacc2x0123 = wasm_i32x4_max(vi2x0123, vzero); - vi2x0123 = wasm_i32x4_min(vi2x0123, vzero); - v128_t vacc3x0123 = wasm_i32x4_max(vi3x0123, vzero); - vi3x0123 = wasm_i32x4_min(vi3x0123, vzero); - - vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); - vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); - vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vw0123), vacc2x0123); - vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vw0123), vacc3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = 
(const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c deleted file mode 100644 index e0df620d3a0..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c +++ /dev/null @@ -1,118 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_v128_bitselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_v128_bitselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, 
vmask0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c deleted file mode 100644 index 0f8d2658807..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c +++ /dev/null @@ -1,85 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c deleted file mode 100644 index 47cf36c4134..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c +++ /dev/null @@ -1,106 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - - const size_t input_increment = input_stride * 1 - channels; - const size_t output_increment = output_stride * 1 - channels; - - do { - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - - o0 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - - o0 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - rows = doz(rows, 1); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c deleted file mode 100644 index 0d7bd5537f4..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c +++ /dev/null @@ -1,165 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - const v128_t vi1x89AB = wasm_v128_load(i1 + 8); - const v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc1x89AB = wasm_f32x4_mul(vi1x89AB, vw89AB); - const v128_t vmask1x89AB = wasm_i32x4_shr(vi1x89AB, 31); - v128_t vacc1xCDEF = wasm_f32x4_mul(vi1xCDEF, vwCDEF); - const v128_t vmask1xCDEF = wasm_i32x4_shr(vi1xCDEF, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_v128_bitselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_v128_bitselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_v128_bitselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc1x89AB = wasm_v128_bitselect(vacc1x89AB, vi1x89AB, vmask1x89AB); - vacc1xCDEF = wasm_v128_bitselect(vacc1xCDEF, vi1xCDEF, vmask1xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = 
wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c deleted file mode 100644 index a5f295680eb..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c +++ /dev/null @@ -1,110 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c deleted file mode 100644 index b1aa60d17c0..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c +++ /dev/null @@ -1,143 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - - const size_t input_increment = input_stride * 2 - channels; - const size_t output_increment = output_stride * 2 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_v128_bitselect(vacc1x4567, vi1x4567, vmask1x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - - if (c & (2 * sizeof(float))) { - 
wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - - o0 += 2; - o1 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - - o0 += 1; - o1 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - rows = doz(rows, 2); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c deleted file mode 100644 index 3cdd0b9793d..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c +++ /dev/null @@ -1,259 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - const v128_t vw89AB = wasm_v128_load(w + 8); - const v128_t vwCDEF = wasm_v128_load(w + 12); - w += 16; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - const v128_t vi0x89AB = wasm_v128_load(i0 + 8); - const v128_t vi0xCDEF = wasm_v128_load(i0 + 12); - i0 += 16; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - const v128_t vi1x89AB = wasm_v128_load(i1 + 8); - const v128_t vi1xCDEF = wasm_v128_load(i1 + 12); - i1 += 16; - const v128_t vi2x0123 = wasm_v128_load(i2); - const v128_t vi2x4567 = wasm_v128_load(i2 + 4); - const v128_t vi2x89AB = wasm_v128_load(i2 + 8); - const v128_t vi2xCDEF = wasm_v128_load(i2 + 12); - i2 += 16; - const v128_t vi3x0123 = wasm_v128_load(i3); - const v128_t vi3x4567 = wasm_v128_load(i3 + 4); - const v128_t vi3x89AB = wasm_v128_load(i3 + 8); - const v128_t vi3xCDEF = wasm_v128_load(i3 + 12); - i3 += 16; - - 
v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc0x89AB = wasm_f32x4_mul(vi0x89AB, vw89AB); - const v128_t vmask0x89AB = wasm_i32x4_shr(vi0x89AB, 31); - v128_t vacc0xCDEF = wasm_f32x4_mul(vi0xCDEF, vwCDEF); - const v128_t vmask0xCDEF = wasm_i32x4_shr(vi0xCDEF, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc1x89AB = wasm_f32x4_mul(vi1x89AB, vw89AB); - const v128_t vmask1x89AB = wasm_i32x4_shr(vi1x89AB, 31); - v128_t vacc1xCDEF = wasm_f32x4_mul(vi1xCDEF, vwCDEF); - const v128_t vmask1xCDEF = wasm_i32x4_shr(vi1xCDEF, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc2x4567 = wasm_f32x4_mul(vi2x4567, vw4567); - const v128_t vmask2x4567 = wasm_i32x4_shr(vi2x4567, 31); - v128_t vacc2x89AB = wasm_f32x4_mul(vi2x89AB, vw89AB); - const v128_t vmask2x89AB = wasm_i32x4_shr(vi2x89AB, 31); - v128_t vacc2xCDEF = wasm_f32x4_mul(vi2xCDEF, vwCDEF); - const v128_t vmask2xCDEF = wasm_i32x4_shr(vi2xCDEF, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - v128_t vacc3x4567 = wasm_f32x4_mul(vi3x4567, vw4567); - const v128_t vmask3x4567 = wasm_i32x4_shr(vi3x4567, 31); - v128_t vacc3x89AB = wasm_f32x4_mul(vi3x89AB, vw89AB); - const v128_t vmask3x89AB = wasm_i32x4_shr(vi3x89AB, 31); - v128_t vacc3xCDEF = wasm_f32x4_mul(vi3xCDEF, vwCDEF); - const v128_t vmask3xCDEF = wasm_i32x4_shr(vi3xCDEF, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc0x89AB = wasm_v128_bitselect(vacc0x89AB, vi0x89AB, vmask0x89AB); - vacc0xCDEF = wasm_v128_bitselect(vacc0xCDEF, vi0xCDEF, vmask0xCDEF); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_v128_bitselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc1x89AB = wasm_v128_bitselect(vacc1x89AB, vi1x89AB, vmask1x89AB); - vacc1xCDEF = wasm_v128_bitselect(vacc1xCDEF, vi1xCDEF, vmask1xCDEF); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc2x4567 = wasm_v128_bitselect(vacc2x4567, vi2x4567, vmask2x4567); - vacc2x89AB = wasm_v128_bitselect(vacc2x89AB, vi2x89AB, vmask2x89AB); - vacc2xCDEF = wasm_v128_bitselect(vacc2xCDEF, vi2xCDEF, vmask2xCDEF); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vi3x4567, vmask3x4567); - vacc3x89AB = wasm_v128_bitselect(vacc3x89AB, vi3x89AB, vmask3x89AB); - vacc3xCDEF = wasm_v128_bitselect(vacc3xCDEF, vi3xCDEF, vmask3xCDEF); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - wasm_v128_store(o0 + 8, vacc0x89AB); - wasm_v128_store(o0 + 12, vacc0xCDEF); - o0 += 16; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - wasm_v128_store(o1 + 8, vacc1x89AB); - wasm_v128_store(o1 + 12, vacc1xCDEF); - o1 += 16; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - wasm_v128_store(o2 + 8, vacc2x89AB); - wasm_v128_store(o2 + 12, vacc2xCDEF); - o2 += 16; - wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - 
wasm_v128_store(o3 + 8, vacc3x89AB); - wasm_v128_store(o3 + 12, vacc3xCDEF); - o3 += 16; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + 
input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c deleted file mode 100644 index 894ab77465b..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c +++ /dev/null @@ -1,160 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = 
wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c b/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c deleted file mode 100644 index f160a03e220..00000000000 --- a/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c +++ /dev/null @@ -1,217 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-prelu/wasmsimd-laneselect.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - float* o1 = (float*) ((uintptr_t) o0 + output_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - float* o2 = (float*) ((uintptr_t) o1 + output_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - float* o3 = (float*) ((uintptr_t) o2 + output_stride); - - const size_t input_increment = input_stride * 4 - channels; - const size_t output_increment = output_stride * 4 - channels; - - do { - if XNN_UNPREDICTABLE(rows < 2) { - i1 = i0; - o1 = o0; - } - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = i1; - o2 = o1; - } - if XNN_UNPREDICTABLE(rows < 4) { - i3 = i2; - o3 = o2; - } - - const float* w = weights; - size_t c = channels; - for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - const v128_t vw4567 = wasm_v128_load(w + 4); - w += 8; - - const v128_t vi0x0123 = wasm_v128_load(i0); - const v128_t vi0x4567 = wasm_v128_load(i0 + 4); - i0 += 8; - const v128_t vi1x0123 = wasm_v128_load(i1); - const v128_t vi1x4567 = wasm_v128_load(i1 + 4); - i1 += 8; - const v128_t vi2x0123 = wasm_v128_load(i2); - const v128_t vi2x4567 = wasm_v128_load(i2 + 4); - i2 += 8; - const v128_t vi3x0123 = wasm_v128_load(i3); - const v128_t vi3x4567 = wasm_v128_load(i3 + 4); - i3 += 8; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); - const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); - const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc2x4567 = wasm_f32x4_mul(vi2x4567, vw4567); - const v128_t vmask2x4567 = wasm_i32x4_shr(vi2x4567, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - v128_t vacc3x4567 = wasm_f32x4_mul(vi3x4567, vw4567); - const v128_t vmask3x4567 = wasm_i32x4_shr(vi3x4567, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc1x4567 = wasm_v128_bitselect(vacc1x4567, vi1x4567, vmask1x4567); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc2x4567 = wasm_v128_bitselect(vacc2x4567, vi2x4567, vmask2x4567); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vi3x4567, vmask3x4567); - - wasm_v128_store(o0, vacc0x0123); - wasm_v128_store(o0 + 4, vacc0x4567); - o0 += 8; - wasm_v128_store(o1, vacc1x0123); - wasm_v128_store(o1 + 4, vacc1x4567); - o1 += 8; - wasm_v128_store(o2, vacc2x0123); - wasm_v128_store(o2 + 4, vacc2x4567); - o2 += 8; - 
wasm_v128_store(o3, vacc3x0123); - wasm_v128_store(o3 + 4, vacc3x4567); - o3 += 8; - } - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 += 4; - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - wasm_v128_store(o0, vacc0x0123); - o0 += 4; - wasm_v128_store(o1, vacc1x0123); - o1 += 4; - wasm_v128_store(o2, vacc2x0123); - o2 += 4; - wasm_v128_store(o3, vacc3x0123); - o3 += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - const v128_t vi0x0123 = wasm_v128_load(i0); - i0 = (const float*) ((uintptr_t) i0 + c); - const v128_t vi1x0123 = wasm_v128_load(i1); - i1 = (const float*) ((uintptr_t) i1 + c); - const v128_t vi2x0123 = wasm_v128_load(i2); - i2 = (const float*) ((uintptr_t) i2 + c); - const v128_t vi3x0123 = wasm_v128_load(i3); - i3 = (const float*) ((uintptr_t) i3 + c); - - v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); - const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); - v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); - const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); - v128_t vacc2x0123 = wasm_f32x4_mul(vi2x0123, vw0123); - const v128_t vmask2x0123 = wasm_i32x4_shr(vi2x0123, 31); - v128_t vacc3x0123 = wasm_f32x4_mul(vi3x0123, vw0123); - const v128_t vmask3x0123 = wasm_i32x4_shr(vi3x0123, 31); - - vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); - vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); - vacc2x0123 = wasm_v128_bitselect(vacc2x0123, vi2x0123, vmask2x0123); - vacc3x0123 = wasm_v128_bitselect(vacc3x0123, vi3x0123, vmask3x0123); - - if (c & (2 * sizeof(float))) { - wasm_v128_store64_lane(o0, vacc0x0123, 0); - wasm_v128_store64_lane(o1, vacc1x0123, 0); - wasm_v128_store64_lane(o2, vacc2x0123, 0); - wasm_v128_store64_lane(o3, vacc3x0123, 0); - - vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); - vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); - vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); - vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); - - o0 += 2; - o1 += 2; - o2 += 2; - o3 += 2; - } - if (c & (1 * sizeof(float))) { - wasm_v128_store32_lane(o0, vacc0x0123, 0); - wasm_v128_store32_lane(o1, vacc1x0123, 0); - wasm_v128_store32_lane(o2, vacc2x0123, 0); - wasm_v128_store32_lane(o3, vacc3x0123, 0); - - o0 += 1; - o1 += 1; - o2 += 1; - o3 += 1; - } - } - i0 = (const float*) ((uintptr_t) i0 + input_increment); - o0 = (float*) ((uintptr_t) o0 + output_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - 
o1 = (float*) ((uintptr_t) o1 + output_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - o2 = (float*) ((uintptr_t) o2 + output_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - o3 = (float*) ((uintptr_t) o3 + output_increment); - rows = doz(rows, 4); - } while (rows != 0); -} diff --git a/src/f32-prelu/neon.c.in b/src/f32-prelu/neon.c.in deleted file mode 100644 index 3774bbebc97..00000000000 --- a/src/f32-prelu/neon.c.in +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 4 == 0 -$assert CHANNEL_TILE >= 4 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__neon_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - $for C in range(0, CHANNEL_TILE, 4): - const float32x4_t vw${ABC[C:C+4]} = vld1q_f32(w); w += 4; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - const float32x4_t vi${M}x${ABC[C:C+4]} = vld1q_f32(i${M}); i${M} += 4; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - float32x4_t vacc${M}x${ABC[C:C+4]} = vmulq_f32(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]}); - const uint32x4_t vm${M}x${ABC[C:C+4]} = vcltq_s32(vreinterpretq_s32_f32(vi${M}x${ABC[C:C+4]}), vmovq_n_s32(0)); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - vacc${M}x${ABC[C:C+4]} = vbslq_f32(vm${M}x${ABC[C:C+4]}, vacc${M}x${ABC[C:C+4]}, vi${M}x${ABC[C:C+4]}); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - vst1q_f32(o${M}, vacc${M}x${ABC[C:C+4]}); o${M} += 4; - } - $if CHANNEL_TILE != 4: - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - $for M in range(ROW_TILE): - const float32x4_t vi${M}x0123 = vld1q_f32(i${M}); - i${M} += 4; - - $for M in range(ROW_TILE): - float32x4_t vacc${M}x0123 = vmulq_f32(vi${M}x0123, vw0123); - const uint32x4_t vm${M}x0123 = vcltq_s32(vreinterpretq_s32_f32(vi${M}x0123), vmovq_n_s32(0)); - - $for M in range(ROW_TILE): - vacc${M}x0123 = vbslq_f32(vm${M}x0123, vacc${M}x0123, vi${M}x0123); - - $for M in range(ROW_TILE): - vst1q_f32(o${M}, vacc${M}x0123); o${M} += 4; - } - if XNN_UNLIKELY(c != 0) { - const float32x4_t vw0123 = vld1q_f32(w); w += 4; - - $for M in range(ROW_TILE): - const float32x4_t vi${M}x0123 = vld1q_f32(i${M}); - 
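The NEON template in this hunk builds its select mask from the sign bit (vcltq_s32 on the reinterpreted input) and blends with vbslq_f32. A hedged single-vector sketch of that idiom; the helper name prelu_neon_4 is illustrative, not the template's code.

#include <arm_neon.h>

// Illustrative helper: PReLU on one vector of four floats.
static inline float32x4_t prelu_neon_4(float32x4_t vx, float32x4_t vw) {
  float32x4_t vacc = vmulq_f32(vx, vw);
  // All-ones lanes where the sign bit of the input is set (x < 0, including -0.0f).
  const uint32x4_t vmask = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
  // Keep x*w where the mask is set, pass x through elsewhere.
  return vbslq_f32(vmask, vacc, vx);
}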
i${M} = (const float*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - float32x4_t vacc${M}x0123 = vmulq_f32(vi${M}x0123, vw0123); - const uint32x4_t vm${M}x0123 = vcltq_s32(vreinterpretq_s32_f32(vi${M}x0123), vmovq_n_s32(0)); - - $for M in range(ROW_TILE): - vacc${M}x0123 = vbslq_f32(vm${M}x0123, vacc${M}x0123, vi${M}x0123); - - $for M in range(ROW_TILE): - float32x2_t vacc${M}x01 = vget_low_f32(vacc${M}x0123); - if (c & (2 * sizeof(float))) { - $for M in range(ROW_TILE): - vst1_f32(o${M}, vacc${M}x01); o${M} += 2; - - $for M in range(ROW_TILE): - vacc${M}x01 = vget_high_f32(vacc${M}x0123); - } - if (c & (1 * sizeof(float))) { - $for M in range(ROW_TILE): - vst1_lane_f32(o${M}, vacc${M}x01, 0); o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/scalar.c.in b/src/f32-prelu/scalar.c.in deleted file mode 100644 index ba3f11fbaf7..00000000000 --- a/src/f32-prelu/scalar.c.in +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE > 0 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__scalar_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - $if CHANNEL_TILE > 1: - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - $for C in range(CHANNEL_TILE): - const float vw${ABC[C]} = w[${C}]; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - const float vi${M}x${ABC[C]} = i${M}[${C}]; - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - const float vacc${M}x${ABC[C]} = XNN_UNPREDICTABLE(vi${M}x${ABC[C]} < 0.0f) ? vi${M}x${ABC[C]} * vw${ABC[C]} : vi${M}x${ABC[C]}; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - o${M}[${C}] = vacc${M}x${ABC[C]}; - o${M} += ${CHANNEL_TILE}; - - w += ${CHANNEL_TILE}; - } - for (; c != 0; c -= sizeof(float)) { - const float vw = *w++; - - $for M in range(ROW_TILE): - const float vi${M} = *i${M}++; - - $for M in range(ROW_TILE): - const float vacc${M} = XNN_UNPREDICTABLE(vi${M} < 0.0f) ? 
vi${M} * vw : vi${M}; - - $for M in range(ROW_TILE): - *o${M}++ = vacc${M}; - } - $else: - do { - const float vw = *w++; - - $for M in range(ROW_TILE): - const float vi${M} = *i${M}++; - - $for M in range(ROW_TILE): - const float vacc${M} = XNN_UNPREDICTABLE(vi${M} < 0.0f) ? vi${M} * vw : vi${M}; - - $for M in range(ROW_TILE): - *o${M}++ = vacc${M}; - - c -= sizeof(float); - } while (c != 0); - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/sse.c.in b/src/f32-prelu/sse.c.in deleted file mode 100644 index 7e51e3a496d..00000000000 --- a/src/f32-prelu/sse.c.in +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 4 == 0 -$assert CHANNEL_TILE >= 4 -$assert ROW_TILE >= 1 -$assert SSE in [1, 2, 4] -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -$SSE_HEADER = {1: "xmmintrin.h", 2: "emmintrin.h", 4: "smmintrin.h"}[SSE] -#include - -#include <${SSE_HEADER}> - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -$ISA = {1: "sse", 2: "sse2", 4: "sse41"}[SSE] -void xnn_f32_prelu_ukernel__${ISA}_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - $if SSE == 1: - const __m128 vzero = _mm_setzero_ps(); - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - const __m128 vw${ABC[0:4]} = _mm_load_ps(w); - $for C in range(4, CHANNEL_TILE, 4): - const __m128 vw${ABC[C:C+4]} = _mm_load_ps(w + ${C}); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $if SSE == 1: - __m128 vi${M}x${ABC[0:4]} = _mm_loadu_ps(i${M}); - $for C in range(4, CHANNEL_TILE, 4): - __m128 vi${M}x${ABC[C:C+4]} = _mm_loadu_ps(i${M} + ${C}); - $else: - const __m128 vi${M}x${ABC[0:4]} = _mm_loadu_ps(i${M}); - $for C in range(4, CHANNEL_TILE, 4): - const __m128 vi${M}x${ABC[C:C+4]} = _mm_loadu_ps(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - $if SSE == 1: - __m128 vacc${M}x${ABC[C:C+4]} = _mm_max_ps(_mm_setzero_ps(), vi${M}x${ABC[C:C+4]}); - vi${M}x${ABC[C:C+4]} = _mm_min_ps(vi${M}x${ABC[C:C+4]}, vzero); - $else: - const __m128 vprod${M}x${ABC[C:C+4]} = _mm_mul_ps(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]}); - $if SSE == 2: - const __m128 vmask${M}x${ABC[C:C+4]} = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi${M}x${ABC[C:C+4]}))); - - $for M in range(ROW_TILE): 
- $for C in range(0, CHANNEL_TILE, 4): - $if SSE == 1: - vacc${M}x${ABC[C:C+4]} = _mm_add_ps(vacc${M}x${ABC[C:C+4]}, _mm_mul_ps(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]})); - $elif SSE == 2: - const __m128 vacc${M}x${ABC[C:C+4]} = _mm_or_ps(_mm_and_ps(vprod${M}x${ABC[C:C+4]}, vmask${M}x${ABC[C:C+4]}), _mm_andnot_ps(vmask${M}x${ABC[C:C+4]}, vi${M}x${ABC[C:C+4]})); - $elif SSE == 4: - const __m128 vacc${M}x${ABC[C:C+4]} = _mm_blendv_ps(vi${M}x${ABC[C:C+4]}, vprod${M}x${ABC[C:C+4]}, vi${M}x${ABC[C:C+4]}); - - $for M in range(ROW_TILE): - _mm_storeu_ps(o${M}, vacc${M}x${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - _mm_storeu_ps(o${M} + ${C}, vacc${M}x${ABC[C:C+4]}); - o${M} += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 4: - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const __m128 vw0123 = _mm_load_ps(w); - w += 4; - - $for M in range(ROW_TILE): - $if SSE == 1: - __m128 vi${M}x0123 = _mm_loadu_ps(i${M}); - $else: - const __m128 vi${M}x0123 = _mm_loadu_ps(i${M}); - i${M} += 4; - - $for M in range(ROW_TILE): - $if SSE == 1: - __m128 vacc${M}x0123 = _mm_max_ps(_mm_setzero_ps(), vi${M}x0123); - vi${M}x0123 = _mm_min_ps(vi${M}x0123, vzero); - $else: - const __m128 vprod${M}x0123 = _mm_mul_ps(vi${M}x0123, vw0123); - $if SSE == 2: - const __m128 vmask${M}x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi${M}x0123))); - - $for M in range(ROW_TILE): - $if SSE == 1: - vacc${M}x0123 = _mm_add_ps(vacc${M}x0123, _mm_mul_ps(vi${M}x0123, vw0123)); - $elif SSE == 2: - __m128 vacc${M}x0123 = _mm_or_ps(_mm_and_ps(vprod${M}x0123, vmask${M}x0123), _mm_andnot_ps(vmask${M}x0123, vi${M}x0123)); - $elif SSE == 4: - __m128 vacc${M}x0123 = _mm_blendv_ps(vi${M}x0123, vprod${M}x0123, vi${M}x0123); - - $for M in range(ROW_TILE): - _mm_storeu_ps(o${M}, vacc${M}x0123); - o${M} += 4; - } - if XNN_UNLIKELY(c != 0) { - const __m128 vw0123 = _mm_load_ps(w); - w = (const float*) ((uintptr_t) w + c); - - $for M in range(ROW_TILE): - $if SSE == 1: - __m128 vi${M}x0123 = _mm_loadu_ps(i${M}); - $else: - const __m128 vi${M}x0123 = _mm_loadu_ps(i${M}); - i${M} = (const float*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - $if SSE == 1: - __m128 vacc${M}x0123 = _mm_max_ps(_mm_setzero_ps(), vi${M}x0123); - vi${M}x0123 = _mm_min_ps(vi${M}x0123, vzero); - $else: - const __m128 vprod${M}x0123 = _mm_mul_ps(vi${M}x0123, vw0123); - $if SSE == 2: - const __m128 vmask${M}x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi${M}x0123))); - - $for M in range(ROW_TILE): - $if SSE == 1: - vacc${M}x0123 = _mm_add_ps(vacc${M}x0123, _mm_mul_ps(vi${M}x0123, vw0123)); - $elif SSE == 2: - __m128 vacc${M}x0123 = _mm_or_ps(_mm_and_ps(vprod${M}x0123, vmask${M}x0123), _mm_andnot_ps(vmask${M}x0123, vi${M}x0123)); - $elif SSE == 4: - __m128 vacc${M}x0123 = _mm_blendv_ps(vi${M}x0123, vprod${M}x0123, vi${M}x0123); - - if (c & (2 * sizeof(float))) { - $for M in range(ROW_TILE): - _mm_storel_pi((__m64*) o${M}, vacc${M}x0123); - - $for M in range(ROW_TILE): - vacc${M}x0123 = _mm_movehl_ps(vacc${M}x0123, vacc${M}x0123); - - $for M in range(ROW_TILE): - o${M} += 2; - } - if (c & (1 * sizeof(float))) { - $for M in range(ROW_TILE): - _mm_store_ss(o${M}, vacc${M}x0123); - - $for M in range(ROW_TILE): - o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/wasm.c.in 
b/src/f32-prelu/wasm.c.in deleted file mode 100644 index fd6801ffa3b..00000000000 --- a/src/f32-prelu/wasm.c.in +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE > 0 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -void xnn_f32_prelu_ukernel__wasm_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - const float vzero = 0.0f; - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - $if CHANNEL_TILE > 1: - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - $for C in range(CHANNEL_TILE): - const float vw${ABC[C]} = w[${C}]; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - float vi${M}x${ABC[C]} = i${M}[${C}]; - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - float vacc${M}x${ABC[C]} = __builtin_wasm_max_f32(vi${M}x${ABC[C]}, vzero); - vi${M}x${ABC[C]} = __builtin_wasm_min_f32(vi${M}x${ABC[C]}, vzero); - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - vacc${M}x${ABC[C]} += vi${M}x${ABC[C]} * vw${ABC[C]}; - - $for M in range(ROW_TILE): - $for C in range(CHANNEL_TILE): - o${M}[${C}] = vacc${M}x${ABC[C]}; - o${M} += ${CHANNEL_TILE}; - - w += ${CHANNEL_TILE}; - } - for (; c != 0; c -= sizeof(float)) { - const float vw = *w++; - - $for M in range(ROW_TILE): - float vi${M} = *i${M}++; - - $for M in range(ROW_TILE): - float vacc${M} = __builtin_wasm_max_f32(vi${M}, vzero); - vi${M} = __builtin_wasm_min_f32(vi${M}, vzero); - - $for M in range(ROW_TILE): - vacc${M} += vi${M} * vw; - - $for M in range(ROW_TILE): - *o${M}++ = vacc${M}; - } - $else: - do { - const float vw = *w++; - - $for M in range(ROW_TILE): - float vi${M} = *i${M}++; - - $for M in range(ROW_TILE): - float vacc${M} = __builtin_wasm_max_f32(vi${M}, vzero); - vi${M} = __builtin_wasm_min_f32(vi${M}, vzero); - - $for M in range(ROW_TILE): - vacc${M} += vi${M} * vw; - - $for M in range(ROW_TILE): - *o${M}++ = vacc${M}; - - c -= sizeof(float); - } while (c != 0); - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/wasmsimd-iminmax.c.in b/src/f32-prelu/wasmsimd-iminmax.c.in deleted file mode 100644 index 6a9920b5317..00000000000 --- a/src/f32-prelu/wasmsimd-iminmax.c.in +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is 
licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 4 == 0 -$assert CHANNEL_TILE >= 4 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -$ISA = "wasmrelaxedsimd" if RELAXED else "wasmsimd" -void xnn_f32_prelu_ukernel__${ISA}_iminmax_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - const v128_t vzero = wasm_i32x4_const_splat(0); - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - const v128_t vw${ABC[0:4]} = wasm_v128_load(w); - $for C in range(4, CHANNEL_TILE, 4): - const v128_t vw${ABC[C:C+4]} = wasm_v128_load(w + ${C}); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - v128_t vi${M}x${ABC[0:4]} = wasm_v128_load(i${M}); - $for C in range(4, CHANNEL_TILE, 4): - v128_t vi${M}x${ABC[C:C+4]} = wasm_v128_load(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - v128_t vacc${M}x${ABC[C:C+4]} = wasm_i32x4_max(vi${M}x${ABC[C:C+4]}, vzero); - vi${M}x${ABC[C:C+4]} = wasm_i32x4_min(vi${M}x${ABC[C:C+4]}, vzero); - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - $if RELAXED: - vacc${M}x${ABC[C:C+4]} = wasm_f32x4_relaxed_madd(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]}, vacc${M}x${ABC[C:C+4]}); - $else: - vacc${M}x${ABC[C:C+4]} = wasm_f32x4_add(wasm_f32x4_mul(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]}), vacc${M}x${ABC[C:C+4]}); - - $for M in range(ROW_TILE): - wasm_v128_store(o${M}, vacc${M}x${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - wasm_v128_store(o${M} + ${C}, vacc${M}x${ABC[C:C+4]}); - o${M} += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 4: - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - $for M in range(ROW_TILE): - v128_t vi${M}x0123 = wasm_v128_load(i${M}); - i${M} += 4; - - $for M in range(ROW_TILE): - v128_t vacc${M}x0123 = wasm_i32x4_max(vi${M}x0123, vzero); - vi${M}x0123 = wasm_i32x4_min(vi${M}x0123, vzero); - - $for M in range(ROW_TILE): - $if RELAXED: - vacc${M}x0123 = wasm_f32x4_relaxed_madd(vi${M}x0123, vw0123, vacc${M}x0123); - $else: - vacc${M}x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi${M}x0123, vw0123), vacc${M}x0123); - - $for M in range(ROW_TILE): - wasm_v128_store(o${M}, vacc${M}x0123); - o${M} += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - $for M in range(ROW_TILE): - v128_t vi${M}x0123 = wasm_v128_load(i${M}); - i${M} = (const float*) 
((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - v128_t vacc${M}x0123 = wasm_i32x4_max(vi${M}x0123, vzero); - vi${M}x0123 = wasm_i32x4_min(vi${M}x0123, vzero); - - $for M in range(ROW_TILE): - $if RELAXED: - vacc${M}x0123 = wasm_f32x4_relaxed_madd(vi${M}x0123, vw0123, vacc${M}x0123); - $else: - vacc${M}x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi${M}x0123, vw0123), vacc${M}x0123); - - if (c & (2 * sizeof(float))) { - $for M in range(ROW_TILE): - wasm_v128_store64_lane(o${M}, vacc${M}x0123, 0); - - $for M in range(ROW_TILE): - vacc${M}x0123 = wasm_v64x2_shuffle(vacc${M}x0123, vacc${M}x0123, 1, 1); - - $for M in range(ROW_TILE): - o${M} += 2; - } - if (c & (1 * sizeof(float))) { - $for M in range(ROW_TILE): - wasm_v128_store32_lane(o${M}, vacc${M}x0123, 0); - - $for M in range(ROW_TILE): - o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-prelu/wasmsimd-laneselect.c.in b/src/f32-prelu/wasmsimd-laneselect.c.in deleted file mode 100644 index c5b81fbaafd..00000000000 --- a/src/f32-prelu/wasmsimd-laneselect.c.in +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 4 == 0 -$assert CHANNEL_TILE >= 4 -$assert ROW_TILE >= 1 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/prelu.h" - - -$WASM_V32X4_LANESELECT = "wasm_i32x4_relaxed_laneselect" if RELAXED else "wasm_v128_bitselect" -$ISA = "wasmrelaxedsimd" if RELAXED else "wasmsimd" -void xnn_f32_prelu_ukernel__${ISA}_laneselect_${ROW_TILE}x${CHANNEL_TILE}( - size_t rows, - size_t channels, - const float* restrict input, - size_t input_stride, - const float* restrict weights, - float* restrict output, - size_t output_stride) XNN_OOB_READS -{ - assert(rows != 0); - assert(channels != 0); - assert(channels % sizeof(float) == 0); - - const float* i0 = input; - float* o0 = output; - $for M in range(1, ROW_TILE): - const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride); - float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride); - - const size_t input_increment = input_stride * ${ROW_TILE} - channels; - const size_t output_increment = output_stride * ${ROW_TILE} - channels; - - do { - $for M in range(1, ROW_TILE): - $if M % 2 == 0: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - $else: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = i${M-1}; - o${M} = o${M-1}; - } - - const float* w = weights; - size_t c = channels; - for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) { - const v128_t vw${ABC[0:4]} = wasm_v128_load(w); - $for C in range(4, CHANNEL_TILE, 4): - const v128_t vw${ABC[C:C+4]} = wasm_v128_load(w + ${C}); - w += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - const v128_t vi${M}x${ABC[0:4]} = wasm_v128_load(i${M}); - $for C in range(4, CHANNEL_TILE, 4): - const v128_t vi${M}x${ABC[C:C+4]} = wasm_v128_load(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 4): - v128_t vacc${M}x${ABC[C:C+4]} = wasm_f32x4_mul(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]}); - const v128_t vmask${M}x${ABC[C:C+4]} = wasm_i32x4_shr(vi${M}x${ABC[C:C+4]}, 31); - - $for M in range(ROW_TILE): 
- $for C in range(0, CHANNEL_TILE, 4): - vacc${M}x${ABC[C:C+4]} = ${WASM_V32X4_LANESELECT}(vacc${M}x${ABC[C:C+4]}, vi${M}x${ABC[C:C+4]}, vmask${M}x${ABC[C:C+4]}); - - $for M in range(ROW_TILE): - wasm_v128_store(o${M}, vacc${M}x${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - wasm_v128_store(o${M} + ${C}, vacc${M}x${ABC[C:C+4]}); - o${M} += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 4: - for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { - const v128_t vw0123 = wasm_v128_load(w); - w += 4; - - $for M in range(ROW_TILE): - const v128_t vi${M}x0123 = wasm_v128_load(i${M}); - i${M} += 4; - - $for M in range(ROW_TILE): - v128_t vacc${M}x0123 = wasm_f32x4_mul(vi${M}x0123, vw0123); - const v128_t vmask${M}x0123 = wasm_i32x4_shr(vi${M}x0123, 31); - - $for M in range(ROW_TILE): - vacc${M}x0123 = ${WASM_V32X4_LANESELECT}(vacc${M}x0123, vi${M}x0123, vmask${M}x0123); - - $for M in range(ROW_TILE): - wasm_v128_store(o${M}, vacc${M}x0123); - o${M} += 4; - } - if XNN_UNLIKELY(c != 0) { - const v128_t vw0123 = wasm_v128_load(w); - w = (const float*) ((uintptr_t) w + c); - - $for M in range(ROW_TILE): - const v128_t vi${M}x0123 = wasm_v128_load(i${M}); - i${M} = (const float*) ((uintptr_t) i${M} + c); - - $for M in range(ROW_TILE): - v128_t vacc${M}x0123 = wasm_f32x4_mul(vi${M}x0123, vw0123); - const v128_t vmask${M}x0123 = wasm_i32x4_shr(vi${M}x0123, 31); - - $for M in range(ROW_TILE): - vacc${M}x0123 = ${WASM_V32X4_LANESELECT}(vacc${M}x0123, vi${M}x0123, vmask${M}x0123); - - if (c & (2 * sizeof(float))) { - $for M in range(ROW_TILE): - wasm_v128_store64_lane(o${M}, vacc${M}x0123, 0); - - $for M in range(ROW_TILE): - vacc${M}x0123 = wasm_v64x2_shuffle(vacc${M}x0123, vacc${M}x0123, 1, 1); - - $for M in range(ROW_TILE): - o${M} += 2; - } - if (c & (1 * sizeof(float))) { - $for M in range(ROW_TILE): - wasm_v128_store32_lane(o${M}, vacc${M}x0123, 0); - - $for M in range(ROW_TILE): - o${M} += 1; - } - } - $for M in range(ROW_TILE): - i${M} = (const float*) ((uintptr_t) i${M} + input_increment); - o${M} = (float*) ((uintptr_t) o${M} + output_increment); - rows = doz(rows, ${ROW_TILE}); - } while (rows != 0); -} diff --git a/src/f32-qs8-vcvt/avx.c.in b/src/f32-qs8-vcvt/avx.c.in index adf7dc83940..2cdd41eff6a 100644 --- a/src/f32-qs8-vcvt/avx.c.in +++ b/src/f32-qs8-vcvt/avx.c.in @@ -18,7 +18,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_u${BATCH_TILE}( size_t batch, const float* input, @@ -33,13 +33,11 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_u${BATCH_TILE}( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - 
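The f32-qs8-vcvt/avx.c.in changes in this area drop the separate output_min clamp and fold the output maximum into a compile-time constant (127 for QS8, 255 for QU8). A hedged scalar model of the conversion, assuming the default round-to-nearest mode and that the operator always passes output_min equal to the type minimum, so the saturating packs make an explicit low-side clamp redundant; f32_to_qs8_reference is an illustrative name, not the template's code.

#include <math.h>
#include <stdint.h>

// Illustrative scalar model of the QS8 path.
static inline int8_t f32_to_qs8_reference(float x, float scale, int32_t zero_point) {
  float v = x * scale;
  // Upper clamp in the float domain, as in the template: 127 is the QS8 maximum.
  const float max_less_zero_point = (float) (127 - zero_point);
  if (v > max_less_zero_point) v = max_less_zero_point;
  long y = lrintf(v) + zero_point;
  // The SIMD kernels get this low-side saturation for free from the saturating packs.
  if (y < INT8_MIN) y = INT8_MIN;
  return (int8_t) y;
}

For QU8 the same shape applies with 255 as the maximum and _mm_packus_epi16 saturating the low side at 0.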
XNN_FORCE_REALIZATION(voutput_min); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -69,12 +67,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_u${BATCH_TILE}( $else: vy${ABC[N:N+8]} = ${_MM_PACKXS_EPI16}(vy${ABC[N:N+8]}, vy${ABC[N:N+8]}); - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+16]}, voutput_min); - $else: - vy${ABC[N:N+8]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+8]}, voutput_min); - _mm_storeu_si128((__m128i*) output, vy${ABC[0:16]}); $for N in range(16, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: @@ -94,7 +86,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = ${_MM_PACKXS_EPI16}(vy, vy); - vy = ${_MM_MAX_EPX8}(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -113,7 +104,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = ${_MM_PACKXS_EPI16}(vy, vy); - vy = ${_MM_MAX_EPX8}(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/avx2.c.in b/src/f32-qs8-vcvt/avx2.c.in index f6c965d62c9..dc9c9e5b5ca 100644 --- a/src/f32-qs8-vcvt/avx2.c.in +++ b/src/f32-qs8-vcvt/avx2.c.in @@ -20,8 +20,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $_MM256_PACKXS_EPI16 = {"QS8": "_mm256_packs_epi16", "QU8": "_mm256_packus_epi16"}[DATATYPE] $_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -$_MM256_MAX_EPX8 = {"QS8": "_mm256_max_epi8", "QU8": "_mm256_max_epu8"}[DATATYPE] -$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx2_u${BATCH_TILE}( size_t batch, const float* input, @@ -36,18 +35,14 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx2_u${BATCH_TILE}( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); $if BATCH_TILE > 16: XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); - $else: - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { __m256 vx${ABC[0:2]} = _mm256_loadu_ps(input); @@ -82,14 +77,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx2_u${BATCH_TILE}( $else: __m128i vy${ABC[N:N+4]} = 
_mm_shuffle_epi32(vy${ABC[N]}${ABC[N+2]}${ABC[N+1]}${ABC[N+3]}, _MM_SHUFFLE(3, 1, 2, 0)); - $for N in range(0, SIMD_TILE, 8): - $if N + 4 < SIMD_TILE: - vy${ABC[N:N+8]} = ${_MM256_MAX_EPX8}(vy${ABC[N:N+8]}, voutput_min); - $elif BATCH_TILE > 16: - vy${ABC[N:N+4]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+4]}, _mm256_castsi256_si128(voutput_min)); - $else: - vy${ABC[N:N+4]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+4]}, voutput_min); - $if SIMD_TILE > 4: _mm256_storeu_si256((__m256i*) output, vy${ABC[0:8]}); $else: @@ -112,10 +99,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx2_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = ${_MM_PACKXS_EPI16}(vy, vy); - $if BATCH_TILE > 16: - vy = ${_MM_MAX_EPX8}(vy, _mm256_castsi256_si128(voutput_min)); - $else: - vy = ${_MM_MAX_EPX8}(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -134,10 +117,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx2_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = ${_MM_PACKXS_EPI16}(vy, vy); - $if BATCH_TILE > 16: - vy = ${_MM_MAX_EPX8}(vy, _mm256_castsi256_si128(voutput_min)); - $else: - vy = ${_MM_MAX_EPX8}(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/avx512skx.c.in b/src/f32-qs8-vcvt/avx512skx.c.in index 50728697c93..23b78a6f922 100644 --- a/src/f32-qs8-vcvt/avx512skx.c.in +++ b/src/f32-qs8-vcvt/avx512skx.c.in @@ -21,9 +21,7 @@ $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $_MM512_PACKXS_EPI16 = {"QS8": "_mm512_packs_epi16", "QU8": "_mm512_packus_epi16"}[DATATYPE] $_MM256_PACKXS_EPI16 = {"QS8": "_mm256_packs_epi16", "QU8": "_mm256_packus_epi16"}[DATATYPE] $_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -$_MM512_MAX_EPX8 = {"QS8": "_mm512_max_epi8", "QU8": "_mm512_max_epu8"}[DATATYPE] -$_MM256_MAX_EPX8 = {"QS8": "_mm256_max_epi8", "QU8": "_mm256_max_epu8"}[DATATYPE] -$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx512skx_u${BATCH_TILE}( size_t batch, const float* input, @@ -42,20 +40,15 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx512skx_u${BATCH_TILE}( XNN_ALIGN(32) static const uint32_t shuffle256_mask[8] = {0, 4, 2, 6, 1, 5, 3, 7}; const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); $if SIMD_TILE > 8: const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); $if SIMD_TILE % 16 != 0: const __m256i vshuffle256_mask = _mm256_load_si256((const __m256i*) shuffle256_mask); - $if SIMD_TILE > 8: - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); - $else: - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - 
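Both the AVX2 and AVX512SKX templates touched here pack 32-bit values down to bytes with per-128-bit-lane pack instructions and then restore element order with a cross-lane permute (the {0, 4, 1, 5, 2, 6, 3, 7} shuffle mask). A hedged sketch of that ordering fix-up for 32 elements; pack_i32_to_i8_ordered is an illustrative helper, not the template's code.

#include <immintrin.h>

// a, b, c, d hold 32 int32 values 0..31 in order; zero_point is an int16
// broadcast (e.g. _mm256_set1_epi16(zp)). Requires AVX2.
static inline __m256i pack_i32_to_i8_ordered(__m256i a, __m256i b, __m256i c, __m256i d,
                                             __m256i zero_point) {
  // 256-bit packs work per 128-bit lane: lo = {a.lo, b.lo | a.hi, b.hi} as int16.
  __m256i lo = _mm256_adds_epi16(_mm256_packs_epi32(a, b), zero_point);
  __m256i hi = _mm256_adds_epi16(_mm256_packs_epi32(c, d), zero_point);
  // Bytes come out dword-interleaved across lanes...
  __m256i packed = _mm256_packs_epi16(lo, hi);
  // ...so a cross-lane dword permute restores 0..31 order (same mask as the template).
  const __m256i order = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
  return _mm256_permutevar8x32_epi32(packed, order);
}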
XNN_FORCE_REALIZATION(voutput_min); for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); $for N in range(4, SIMD_TILE, 4): @@ -83,14 +76,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx512skx_u${BATCH_TILE}( $else: __m256i vy${ABC[N]}${ABC[N+4]}${ABC[N+2]}${ABC[N+6]}${ABC[N+1]}${ABC[N+5]}${ABC[N+3]}${ABC[N+7]} = ${_MM256_PACKXS_EPI16}(_mm512_castsi512_si256(vacc${ABC[N]}${ABC[N+4]}${ABC[N+1]}${ABC[N+5]}${ABC[N+2]}${ABC[N+6]}${ABC[N+3]}${ABC[N+7]}), _mm512_extracti32x8_epi32(vacc${ABC[N]}${ABC[N+4]}${ABC[N+1]}${ABC[N+5]}${ABC[N+2]}${ABC[N+6]}${ABC[N+3]}${ABC[N+7]}, 1)); - $for N in range(0, SIMD_TILE, 16): - $if N + 8 < SIMD_TILE: - vy${ABC[N]}${ABC[N+4]}${ABC[N+8]}${ABC[N+12]}${ABC[N+1]}${ABC[N+5]}${ABC[N+9]}${ABC[N+13]}${ABC[N+2]}${ABC[N+6]}${ABC[N+10]}${ABC[N+14]}${ABC[N+3]}${ABC[N+7]}${ABC[N+11]}${ABC[N+15]} = ${_MM512_MAX_EPX8}(vy${ABC[N]}${ABC[N+4]}${ABC[N+8]}${ABC[N+12]}${ABC[N+1]}${ABC[N+5]}${ABC[N+9]}${ABC[N+13]}${ABC[N+2]}${ABC[N+6]}${ABC[N+10]}${ABC[N+14]}${ABC[N+3]}${ABC[N+7]}${ABC[N+11]}${ABC[N+15]}, voutput_min); - $elif SIMD_TILE > 8: - vy${ABC[N]}${ABC[N+4]}${ABC[N+2]}${ABC[N+6]}${ABC[N+1]}${ABC[N+5]}${ABC[N+3]}${ABC[N+7]} = ${_MM256_MAX_EPX8}(vy${ABC[N]}${ABC[N+4]}${ABC[N+2]}${ABC[N+6]}${ABC[N+1]}${ABC[N+5]}${ABC[N+3]}${ABC[N+7]}, _mm512_castsi512_si256(voutput_min)); - $else: - vy${ABC[N]}${ABC[N+4]}${ABC[N+2]}${ABC[N+6]}${ABC[N+1]}${ABC[N+5]}${ABC[N+3]}${ABC[N+7]} = ${_MM256_MAX_EPX8}(vy${ABC[N]}${ABC[N+4]}${ABC[N+2]}${ABC[N+6]}${ABC[N+1]}${ABC[N+5]}${ABC[N+3]}${ABC[N+7]}, voutput_min); - $for N in range(0, SIMD_TILE, 16): $if N + 8 < SIMD_TILE: const __m512i vy${ABC[N:N+16]} = _mm512_permutexvar_epi32(vshuffle512_mask, vy${ABC[N]}${ABC[N+4]}${ABC[N+8]}${ABC[N+12]}${ABC[N+1]}${ABC[N+5]}${ABC[N+9]}${ABC[N+13]}${ABC[N+2]}${ABC[N+6]}${ABC[N+10]}${ABC[N+14]}${ABC[N+3]}${ABC[N+7]}${ABC[N+11]}${ABC[N+15]}); @@ -120,10 +105,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx512skx_u${BATCH_TILE}( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = ${_MM_PACKXS_EPI16}(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - $if SIMD_TILE > 8: - vy0123 = ${_MM_MAX_EPX8}(vy0123, _mm512_castsi512_si128(voutput_min)); - $else: - vy0123 = ${_MM_MAX_EPX8}(vy0123, _mm256_castsi256_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -146,10 +127,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__avx512skx_u${BATCH_TILE}( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = ${_MM_PACKXS_EPI16}(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - $if SIMD_TILE > 8: - vy0123 = ${_MM_MAX_EPX8}(vy0123, _mm512_castsi512_si128(voutput_min)); - $else: - vy0123 = ${_MM_MAX_EPX8}(vy0123, _mm256_castsi256_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qs8-vcvt/f32-qs8-vcvt.h b/src/f32-qs8-vcvt/f32-qs8-vcvt.h index cca362a9af2..a21fc308d84 100644 --- a/src/f32-qs8-vcvt/f32-qs8-vcvt.h +++ b/src/f32-qs8-vcvt/f32-qs8-vcvt.h @@ -51,11 +51,14 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qs8_vcvt_ukernel__avx2_u1 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qs8_vcvt_ukernel__avx2_u32, 32, false, float, int8_t, struct xnn_f32_qs8_cvt_params, 
xnn_init_f32_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qs8_vcvt_ukernel__avx2_u48, 48, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qs8_vcvt_ukernel__avx2_u64, 64, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qs8_vcvt_ukernel__avx512skx_u32, 32, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qs8_vcvt_ukernel__avx512skx_u64, 64, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qs8_vcvt_ukernel__avx512skx_u96, 96, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qs8_vcvt_ukernel__avx512skx_u128, 128, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u8, 8, false, float, int8_t, struct xnn_f32_qs8_cvt_params, xnn_init_f32_qs8_cvt_scalar_params) diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u16.c index 077b88cd582..7b199c3a48e 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u16.c @@ -30,13 +30,11 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -60,8 +58,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u16( __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); output += 16; } @@ -76,7 +72,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -95,7 +90,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, 
vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u24.c index 6ad7909cc74..d81aae2ef9e 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u24.c @@ -30,13 +30,11 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u24( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -67,9 +65,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u24( __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); vyGHIJKLMN = _mm_packs_epi16(vyGHIJKLMN, vyGHIJKLMN); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epi8(vyGHIJKLMN, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storel_epi64((__m128i*) (output + 16), vyGHIJKLMN); output += 24; @@ -85,7 +80,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u24( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -104,7 +98,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u24( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u32.c index c249138d905..b82a02e540f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u32.c @@ -30,13 +30,11 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u32( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -73,9 +71,6 @@ void 
xnn_f32_qs8_vcvt_ukernel__avx_u32( __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = _mm_max_epi8(vyGHIJKLMNOPQRSTUV, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storeu_si128((__m128i*) (output + 16), vyGHIJKLMNOPQRSTUV); output += 32; @@ -91,7 +86,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -110,7 +104,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u8.c index 193acb70e13..2272d45a3be 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-u8.c @@ -30,13 +30,11 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { __m256 vx = _mm256_loadu_ps(input); @@ -49,7 +47,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u8( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -68,7 +65,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx_u8( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u16.c index 122cbfac25b..a4267455b14 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u16.c @@ -30,13 +30,11 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point 
= _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -60,8 +58,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u16( __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123); output += 16; } @@ -76,7 +72,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -95,7 +90,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u32.c index de671df9956..b8faf2cbc8e 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u32.c @@ -30,15 +30,13 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u32( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -72,8 +70,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u32( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); - vy01234567 = _mm256_max_epi8(vy01234567, voutput_min); - _mm256_storeu_si256((__m256i*) output, vy01234567); output += 32; } @@ -88,7 +84,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -107,7 +102,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), 
_mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u48.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u48.c index cb6aab374fa..351cac10264 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u48.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u48.c @@ -30,15 +30,13 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u48( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 48 * sizeof(float); batch -= 48 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -84,9 +82,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u48( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); __m128i vy89AB = _mm_shuffle_epi32(vy8A9B, _MM_SHUFFLE(3, 1, 2, 0)); - vy01234567 = _mm256_max_epi8(vy01234567, voutput_min); - vy89AB = _mm_max_epi8(vy89AB, _mm256_castsi256_si128(voutput_min)); - _mm256_storeu_si256((__m256i*) output, vy01234567); _mm_storeu_si128((__m128i*) (output + 32), vy89AB); output += 48; @@ -102,7 +97,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u48( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -121,7 +115,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u48( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u64.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u64.c index a20495ab83b..c1cfc2a3d18 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u64.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-u64.c @@ -30,15 +30,13 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u64( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m256i 
voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -94,9 +92,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u64( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); __m256i vy89ABCDEF = _mm256_permutevar8x32_epi32(vy8ACE9BDF, vshuffle_mask); - vy01234567 = _mm256_max_epi8(vy01234567, voutput_min); - vy89ABCDEF = _mm256_max_epi8(vy89ABCDEF, voutput_min); - _mm256_storeu_si256((__m256i*) output, vy01234567); _mm256_storeu_si256((__m256i*) (output + 32), vy89ABCDEF); output += 64; @@ -112,7 +107,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u64( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -131,7 +125,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx2_u64( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u128.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u128.c index 850a3cd2489..6ee078c00dd 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u128.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u128.c @@ -31,14 +31,12 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u128( const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -90,9 +88,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u128( __m512i vy048C159D26AE37BF = _mm512_packs_epi16(vacc04152637, vacc8C9DAEBF); __m512i vyGKOSHLPTIMQUJNRV = _mm512_packs_epi16(vaccGKHLIMJN, vaccOSPTQURV); - vy048C159D26AE37BF = _mm512_max_epi8(vy048C159D26AE37BF, voutput_min); - vyGKOSHLPTIMQUJNRV = _mm512_max_epi8(vyGKOSHLPTIMQUJNRV, voutput_min); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); const __m512i vyGHIJKLMNOPQRSTUV = 
_mm512_permutexvar_epi32(vshuffle512_mask, vyGKOSHLPTIMQUJNRV); @@ -112,7 +107,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u128( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -135,7 +129,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u128( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u32.c index d732a7f539f..43a38fceec4 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u32.c @@ -31,14 +31,12 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u32( XNN_ALIGN(32) static const uint32_t shuffle256_mask[8] = {0, 4, 2, 6, 1, 5, 3, 7}; const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m256i vshuffle256_mask = _mm256_load_si256((const __m256i*) shuffle256_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -59,8 +57,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u32( __m256i vy04261537 = _mm256_packs_epi16(_mm512_castsi512_si256(vacc04152637), _mm512_extracti32x8_epi32(vacc04152637, 1)); - vy04261537 = _mm256_max_epi8(vy04261537, voutput_min); - const __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy04261537, vshuffle256_mask); _mm256_storeu_si256((__m256i*) output, vy01234567); @@ -78,7 +74,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u32( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm256_castsi256_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -101,7 +96,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u32( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm256_castsi256_si128(voutput_min)); 
_mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u64.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u64.c index 2534b3c70f3..40a5bd1d02e 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u64.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u64.c @@ -31,14 +31,12 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u64( const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -69,8 +67,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u64( __m512i vy048C159D26AE37BF = _mm512_packs_epi16(vacc04152637, vacc8C9DAEBF); - vy048C159D26AE37BF = _mm512_max_epi8(vy048C159D26AE37BF, voutput_min); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); _mm512_storeu_si512(output, vy0123456789ABCDEF); @@ -88,7 +84,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u64( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -111,7 +106,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u64( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u96.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u96.c index 2c34fc6864b..943e467a2c9 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u96.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-u96.c @@ -32,15 +32,13 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u96( XNN_ALIGN(32) static const uint32_t shuffle256_mask[8] = {0, 4, 2, 6, 1, 5, 3, 7}; const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); const __m256i vshuffle256_mask = _mm256_load_si256((const __m256i*) shuffle256_mask); - const 
__m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -82,9 +80,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u96( __m512i vy048C159D26AE37BF = _mm512_packs_epi16(vacc04152637, vacc8C9DAEBF); __m256i vyGKIMHLJN = _mm256_packs_epi16(_mm512_castsi512_si256(vaccGKHLIMJN), _mm512_extracti32x8_epi32(vaccGKHLIMJN, 1)); - vy048C159D26AE37BF = _mm512_max_epi8(vy048C159D26AE37BF, voutput_min); - vyGKIMHLJN = _mm256_max_epi8(vyGKIMHLJN, _mm512_castsi512_si256(voutput_min)); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); const __m256i vyGHIJKLMN = _mm256_permutevar8x32_epi32(vyGKIMHLJN, vshuffle256_mask); @@ -104,7 +99,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u96( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -127,7 +121,6 @@ void xnn_f32_qs8_vcvt_ukernel__avx512skx_u96( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packs_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epi8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u128.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u128.c index 277f5a0a102..686e1eaf431 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u128.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u128.c @@ -26,8 +26,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u128( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { HVX_Vector vx0 = xnn_loadu_f32(input); @@ -53,9 +51,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u128( // narrowing 16-bit to 8-bit HVX_Vector vy0 = Q6_Vb_vpack_VhVh_sat(vacc_h1, vacc_h0); - vy0 = Q6_Vb_vmax_VbVb(voutput_min, vy0); - vy0 = Q6_Vb_vmin_VbVb(voutput_max, vy0); - *((HVX_UVector *) output) = vy0; output += 128; } @@ -71,9 +66,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u128( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -90,9 +82,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u128( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of 
elements using batch >> 2 // without multiplying by sizeof(int8_t). diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u256.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u256.c index 9d96a73e6d6..7aa784bb36f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u256.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u256.c @@ -26,8 +26,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u256( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 256 * sizeof(float); batch -= 256 * sizeof(float)) { HVX_Vector vx0 = xnn_loadu_f32(input); @@ -68,11 +66,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u256( HVX_Vector vy0 = Q6_Vb_vpack_VhVh_sat(vacc_h1, vacc_h0); HVX_Vector vy1 = Q6_Vb_vpack_VhVh_sat(vacc_h3, vacc_h2); - vy0 = Q6_Vb_vmax_VbVb(voutput_min, vy0); - vy0 = Q6_Vb_vmin_VbVb(voutput_max, vy0); - vy1 = Q6_Vb_vmax_VbVb(voutput_min, vy1); - vy1 = Q6_Vb_vmin_VbVb(voutput_max, vy1); - *((HVX_UVector *) output) = vy0; output += 128; *((HVX_UVector *) output) = vy1; @@ -90,9 +83,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u256( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -109,9 +99,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u256( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of elements using batch >> 2 // without multiplying by sizeof(int8_t). diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u32.c index 80c7c050e96..02a2540be9f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u32.c @@ -26,8 +26,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u32( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { HVX_Vector vx = xnn_loadu_f32(input); @@ -41,9 +39,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u32( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -60,9 +55,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u32( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of elements using batch >> 2 // without multiplying by sizeof(int8_t). 
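Editor's note: throughout these conversion kernels the upper clamp bound is now folded to the literal 127 instead of being read from params->scalar.output_max. The sketch below is plain C and not part of the patch (the scale and zero point are made-up values); it only illustrates the arithmetic this relies on: clamping the scaled value to (127 - output_zero_point) before rounding means the result can never exceed 127, the top of the qs8 range, once the zero point is added back.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const float scale = 0.5f;              /* hypothetical quantization scale */
  const int32_t output_zero_point = 5;   /* hypothetical zero point */
  const float output_max_less_zero_point = (float) (127 - output_zero_point);

  const float x = 1000.0f;               /* deliberately far out of range */
  float scaled = x * scale;
  if (scaled > output_max_less_zero_point) {
    scaled = output_max_less_zero_point; /* float-side clamp, as in the kernels */
  }
  const int32_t y = (int32_t) lrintf(scaled) + output_zero_point;
  printf("%d\n", y);                     /* prints 127, the qs8 maximum */
  return 0;
}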
diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u64.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u64.c index 234b656aa4c..95e41c9055f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u64.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u64.c @@ -26,8 +26,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u64( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { HVX_Vector vx0 = xnn_loadu_f32(input); @@ -46,9 +44,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u64( // narrowing 16-bit to 8-bit HVX_Vector vy0 = Q6_Vb_vpack_VhVh_sat(vacc_h0, vacc_h0); - vy0 = Q6_Vb_vmax_VbVb(voutput_min, vy0); - vy0 = Q6_Vb_vmin_VbVb(voutput_max, vy0); - Q6_V_vstu_variable(output, 64, vy0); output += 64; } @@ -64,9 +59,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u64( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -83,9 +75,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u64( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of elements using batch >> 2 // without multiplying by sizeof(int8_t). diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u96.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u96.c index 131e51562ba..3b336d79e1c 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u96.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-hvx-u96.c @@ -26,8 +26,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u96( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { HVX_Vector vx0 = xnn_loadu_f32(input); @@ -50,9 +48,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u96( // narrowing 16-bit to 8-bit HVX_Vector vy0 = Q6_Vb_vpack_VhVh_sat(vacc_h1, vacc_h0); - vy0 = Q6_Vb_vmax_VbVb(voutput_min, vy0); - vy0 = Q6_Vb_vmin_VbVb(voutput_max, vy0); - Q6_V_vstu_variable(output, 96, vy0); output += 96; } @@ -68,9 +63,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u96( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -87,9 +79,6 @@ void xnn_f32_qs8_vcvt_ukernel__hvx_u96( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of elements using batch >> 2 // without multiplying by sizeof(int8_t). 
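Editor's note: the voutput_min / voutput_max clamps removed above all sit after a saturating 16-to-8-bit narrowing (_mm_packs_epi16 / _mm512_packs_epi16 on x86, Q6_Vb_vpack_VhVh_sat on HVX, vqmovn_s16 in the NEON kernels that follow). Those instructions already confine every lane to [-128, 127], so re-clamping against that same range is a no-op. The standalone C sketch below (not part of the patch) demonstrates the saturation with the SSE2 intrinsic, which is the easiest of these to run on a typical host.

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  /* Values deliberately outside the int8 range. */
  const __m128i wide = _mm_setr_epi16(300, -300, 127, -128, 0, 42, 200, -200);
  const __m128i packed = _mm_packs_epi16(wide, wide);  /* saturating 16->8 pack */
  signed char out[16];
  _mm_storeu_si128((__m128i*) out, packed);
  for (int i = 0; i < 8; i++) {
    printf("%d ", out[i]);  /* prints: 127 -128 127 -128 0 42 127 -128 */
  }
  printf("\n");
  return 0;
}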
diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u16.c index aa77d56645c..ddd09f979de 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u16.c @@ -30,8 +30,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u16( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -59,10 +57,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u16( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -81,8 +75,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u16( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -104,8 +96,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u16( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u24.c index 0014b184788..332a6a22358 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u24.c @@ -30,8 +30,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u24( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -69,12 +67,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u24( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x8_t vyGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = vmax_s8(vyGHIJKLMN, vget_low_s8(voutput_min)); - - vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = vmin_s8(vyGHIJKLMN, vget_low_s8(voutput_max)); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; vst1_s8(output, vyGHIJKLMN); output += 8; } @@ -94,8 +86,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u24( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy,
vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -117,8 +107,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u24( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u32.c index c269620c21c..4e8e50c6f2c 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u32.c @@ -30,8 +30,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u32( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -78,12 +76,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u32( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x16_t vyGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = vmaxq_s8(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = vminq_s8(vyGHIJKLMNOPQRSTUV, voutput_max); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; vst1q_s8(output, vyGHIJKLMNOPQRSTUV); output += 16; } @@ -103,8 +95,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u32( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -126,8 +116,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u32( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u8.c index ebca2f0cd62..bac5161d0d2 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-u8.c @@ -30,8 +30,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u8( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const int8x8_t voutput_min = vld1_dup_s8(&params->scalar.output_min); - const int8x8_t voutput_max = vld1_dup_s8(&params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx_lo = vld1q_f32(input); input += 4; @@ -49,8 +47,6 @@ void 
xnn_f32_qs8_vcvt_ukernel__neon_u8( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -72,8 +68,6 @@ void xnn_f32_qs8_vcvt_ukernel__neon_u8( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u16.c index 7288a72069d..acb1398d7aa 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u16.c @@ -29,8 +29,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u16( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -55,10 +53,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u16( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -75,8 +69,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -96,8 +88,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u24.c index d10a83ea2a4..2cd5a932404 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u24.c @@ -29,8 +29,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u24( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -64,12 +62,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u24( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x8_t vyGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = vmax_s8(vyGHIJKLMN, vget_low_s8(voutput_min)); - -
vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = vmin_s8(vyGHIJKLMN, vget_low_s8(voutput_max)); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; vst1_s8(output, vyGHIJKLMN); output += 8; } @@ -87,8 +79,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -108,8 +98,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u32.c index c0ae721e9de..05229f96051 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u32.c @@ -29,8 +29,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u32( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->scalar.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->scalar.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->scalar.output_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -72,12 +70,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u32( int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x16_t vyGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); - vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = vmaxq_s8(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = vminq_s8(vyGHIJKLMNOPQRSTUV, voutput_max); - vst1q_s8(output, vy0123456789ABCDEF); output += 16; vst1q_s8(output, vyGHIJKLMNOPQRSTUV); output += 16; } @@ -95,8 +87,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -116,8 +106,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, vget_low_s8(voutput_min)); - vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u8.c index b695ce35e83..04aec02db27 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-u8.c @@ -29,8 +29,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u8( const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->scalar.output_zero_point); - const int8x8_t voutput_min = vld1_dup_s8(&params->scalar.output_min); - const int8x8_t voutput_max = vld1_dup_s8(&params->scalar.output_max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t 
vx_lo = vld1q_f32(input); input += 4; float32x4_t vx_hi = vld1q_f32(input); input += 4; @@ -45,8 +43,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -66,8 +62,6 @@ void xnn_f32_qs8_vcvt_ukernel__neonv8_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); - vy = vmax_s8(vy, voutput_min); - vy = vmin_s8(vy, voutput_max); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c index 3b34cf53db2..d321f60096c 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c @@ -29,8 +29,9 @@ void xnn_f32_qs8_vcvt_ukernel__rvv_u1v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c index 007cfc2674c..7c705028874 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c @@ -29,8 +29,9 @@ void xnn_f32_qs8_vcvt_ukernel__rvv_u2v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c index 1dc5f95390d..f7b097b8dab 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c @@ -29,8 +29,9 @@ void xnn_f32_qs8_vcvt_ukernel__rvv_u4v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... 
+ const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c index 825ab646212..a697c9a51f9 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c @@ -29,8 +29,9 @@ void xnn_f32_qs8_vcvt_ukernel__rvv_u8v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u1.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u1.c index c8b2d40a1bc..7977f9d3c48 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u1.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u1.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_u1( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u2.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u2.c index 43d349249a5..8ffe971eef2 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u2.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u2.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_u2( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u3.c 
b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u3.c index 2464f6f929a..174cd884515 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u3.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u3.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_u3( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u4.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u4.c index 5f79a10ac3f..e3963ebe017 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u4.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-u4.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_u4( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u1.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u1.c index e9d3f8c00ef..6a6f8ccfcd1 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u1.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u1.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_imagic_u1( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u2.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u2.c index ec02afcb18f..3842ef98972 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u2.c +++ 
b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u2.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_imagic_u2( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u3.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u3.c index d71b65c92f4..cc2043221c5 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u3.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u3.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_imagic_u3( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u4.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u4.c index 55a3b724597..ba3508948b6 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u4.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-u4.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_imagic_u4( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git 
a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u1.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u1.c index ff91a4a5a76..772a1b9ac21 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u1.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u1.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_u1( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; do { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u2.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u2.c index 117536d2fe0..656e12af70c 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u2.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u2.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_u2( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u3.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u3.c index f479dbcbd8e..c6867a7366e 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u3.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u3.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_u3( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 3 * sizeof(float); batch -= 3 * sizeof(float)) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u4.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u4.c index 60d73a29ed1..67e2adef52d 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u4.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-u4.c @@ -27,8 +27,8 @@ void xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_u4( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) 
params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c index 18715c32046..9498d5e23c9 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u16.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u16( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi16(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -64,12 +62,8 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u16( vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point); vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); - vy01234567 = _mm_max_epi16(vy01234567, voutput_min); - vy89ABCDEF = _mm_max_epi16(vy89ABCDEF, voutput_min); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); output += 16; } @@ -89,7 +83,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); _mm_storel_epi64((__m128i*) output, vy); @@ -111,7 +104,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c index c2dd1fab9bf..64cfa2b64b5 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u24.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u24( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi16(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); 
XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -74,14 +72,9 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u24( vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); - vy01234567 = _mm_max_epi16(vy01234567, voutput_min); - vy89ABCDEF = _mm_max_epi16(vy89ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epi16(vyGHIJKLMN, voutput_min); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); vyGHIJKLMN = _mm_packs_epi16(vyGHIJKLMN, vyGHIJKLMN); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storel_epi64((__m128i*) (output + 16), vyGHIJKLMN); output += 24; @@ -102,7 +95,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u24( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); _mm_storel_epi64((__m128i*) output, vy); @@ -124,7 +116,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u24( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c index 9404c0fde05..eed7b436f63 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u32.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u32( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi16(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -84,15 +82,9 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u32( vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point); - vy01234567 = _mm_max_epi16(vy01234567, voutput_min); - vy89ABCDEF = _mm_max_epi16(vy89ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epi16(vyGHIJKLMN, voutput_min); - vyOPQRSTUV = _mm_max_epi16(vyOPQRSTUV, voutput_min); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storeu_si128((__m128i*) (output + 16), vyGHIJKLMNOPQRSTUV); output += 32; @@ -113,7 +105,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); _mm_storel_epi64((__m128i*) output, vy); @@ -135,7 +126,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); if 
(batch & (4 * sizeof(float))) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c index a13f7e24a6e..b11c2fed70f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-u8.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u8( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi16(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { __m128 vx_lo = _mm_loadu_ps(input); @@ -52,7 +50,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); _mm_storel_epi64((__m128i*) output, vy); @@ -74,7 +71,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse2_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - vy = _mm_max_epi16(vy, voutput_min); vy = _mm_packs_epi16(vy, vy); if (batch & (4 * sizeof(float))) { diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u16.c index 48c217c81fa..c155b181f09 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u16.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u16( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -64,11 +62,8 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u16( vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point); vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); output += 16; } @@ -89,7 +84,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -111,7 +105,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = 
_mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u24.c index 54944780d4c..6f07a0f425b 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u24.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u24( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -74,13 +72,9 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u24( vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); vyGHIJKLMN = _mm_packs_epi16(vyGHIJKLMN, vyGHIJKLMN); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epi8(vyGHIJKLMN, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storel_epi64((__m128i*) (output + 16), vyGHIJKLMN); output += 24; @@ -102,7 +96,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u24( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -124,7 +117,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u24( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u32.c index 5cede1dbd14..3d050f75c58 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u32.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u32( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -84,13 +82,9 @@ void 
xnn_f32_qs8_vcvt_ukernel__sse41_u32( vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF); __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV); - vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = _mm_max_epi8(vyGHIJKLMNOPQRSTUV, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storeu_si128((__m128i*) (output + 16), vyGHIJKLMNOPQRSTUV); output += 32; @@ -112,7 +106,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -134,7 +127,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u8.c index c029b2edebf..fca10deddab 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-u8.c @@ -28,13 +28,11 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u8( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { __m128 vx_lo = _mm_loadu_ps(input); @@ -53,7 +51,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -75,7 +72,6 @@ void xnn_f32_qs8_vcvt_ukernel__sse41_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packs_epi16(vy, vy); - vy = _mm_max_epi8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u1.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u1.c index 45da09e9419..79a2d37748b 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u1.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u1.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_u1( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point 
= (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u2.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u2.c index 85c306cbf9f..e33221c736f 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u2.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u2.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_u2( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u3.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u3.c index 6611776a42d..9f3b8b5c77c 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u3.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u3.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_u3( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u4.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u4.c index 41428e83021..cc9f407cd5a 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u4.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-u4.c @@ -26,8 +26,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_u4( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 127 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u16.c 
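The fmagic/imagic scalar kernels rely on the 12582912.0f (bit pattern 0x4B400000) magic-bias trick rather than an explicit float-to-int conversion: for inputs already clamped to a small range, adding the bias leaves the rounded integer in the low mantissa bits, so a bit reinterpretation plus one constant subtraction that also folds in the zero point yields the quantized value. A self-contained sketch, assuming float_as_uint32 is a plain bit copy:

#include <stdint.h>
#include <string.h>

// Round-to-nearest and zero-point addition via the magic bias: valid while
// (bias + v) stays in [2^23, 2^24), where one mantissa ULP equals 1.
static inline int32_t magic_round_add_zero_point(float v, int32_t zero_point) {
  const float vmagic_bias = 12582912.0f;  // bit pattern 0x4B400000
  const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  const float vbiased = vmagic_bias + v;
  uint32_t bits;
  memcpy(&bits, &vbiased, sizeof(bits));  // float_as_uint32
  return (int32_t) bits - vmagic_bias_less_zero_point;
}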
b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u16.c index 5f52592f181..bb183e7b855 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u16.c @@ -29,12 +29,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u16( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -65,10 +61,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u16( v128_t vy0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); - vy0123456789ABCDEF = wasm_i8x16_max(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = wasm_i8x16_min(vy0123456789ABCDEF, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); output += 16; } @@ -90,8 +82,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u16( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -116,8 +106,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u16( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u24.c index 8529640968a..f7dfe1ae059 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u24.c @@ -29,12 +29,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u24( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -76,12 +72,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u24( v128_t vy0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); v128_t vyGHIJKLMN = wasm_i8x16_narrow_i16x8(vaccGHIJKLMN, vaccGHIJKLMN); - vy0123456789ABCDEF = wasm_i8x16_max(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = wasm_i8x16_max(vyGHIJKLMN, voutput_min); - - vy0123456789ABCDEF = wasm_i8x16_min(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = wasm_i8x16_min(vyGHIJKLMN, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); wasm_v128_store64_lane(output + 16, vyGHIJKLMN, 0); output += 24; @@ -104,8 +94,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u24( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = 
wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -130,8 +118,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u24( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u32.c index ec9fae4c169..fc1d0a8ec23 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u32.c @@ -29,12 +29,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u32( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -86,12 +82,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u32( v128_t vy0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); v128_t vyGHIJKLMNOPQRSTUV = wasm_i8x16_narrow_i16x8(vaccGHIJKLMN, vaccOPQRSTUV); - vy0123456789ABCDEF = wasm_i8x16_max(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = wasm_i8x16_max(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = wasm_i8x16_min(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = wasm_i8x16_min(vyGHIJKLMNOPQRSTUV, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); wasm_v128_store(output + 16, vyGHIJKLMNOPQRSTUV); output += 32; @@ -114,8 +104,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u32( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -140,8 +128,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u32( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u8.c index 2bff6a3f06b..9323c1b82e7 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-u8.c @@ -29,12 +29,8 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u8( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 8 * sizeof(float); batch -= 8 * 
sizeof(float)) { v128_t vx_lo = wasm_v128_load(input); v128_t vx_hi = wasm_v128_load(input + 4); @@ -53,8 +49,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u8( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -79,8 +73,6 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_u8( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); - vy = wasm_i8x16_max(vy, voutput_min); - vy = wasm_i8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u16.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u16.c index 3f62f774212..e73be322f5d 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u16.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u16.c @@ -28,12 +28,12 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_u16( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(127); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u24.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u24.c index 87078726928..4dff7b9324a 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u24.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u24.c @@ -28,12 +28,12 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_u24( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(127); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u32.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u32.c index 5fbbeb7a1a7..d392c0b8997 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u32.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u32.c @@ -28,12 +28,12 @@ void 
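In the WAsm SIMD cvt kernels the removed i8x16 max/min pair was guarding a range the narrowing instruction already enforces: wasm_i8x16_narrow_i16x8 saturates every lane to [-128, 127], and wasm_u8x16_narrow_i16x8 to [0, 255] in the QU8 twins, so with the bounds fixed at the full type range the clamps could not change any lane. A tiny sketch of the behavior being relied on, assuming <wasm_simd128.h>:

#include <wasm_simd128.h>

// The saturating narrow is the only clamp still needed once
// output_min/output_max cover the whole int8 range.
static inline v128_t narrow_to_qs8(v128_t vacc_lo, v128_t vacc_hi) {
  return wasm_i8x16_narrow_i16x8(vacc_lo, vacc_hi);
}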
xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_u32( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(127); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u8.c b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u8.c index d822d804fa8..4b67d131af7 100644 --- a/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u8.c +++ b/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-u8.c @@ -28,12 +28,12 @@ void xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_u8( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) -128 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(127); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qs8-vcvt/hvx.c.in b/src/f32-qs8-vcvt/hvx.c.in index 3971581110a..b66363ad7d8 100644 --- a/src/f32-qs8-vcvt/hvx.c.in +++ b/src/f32-qs8-vcvt/hvx.c.in @@ -25,8 +25,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__hvx_u${BATCH_TILE}( const HVX_Vector vscale = xnn_set1_f32(params->scalar.scale); const HVX_Vector vmagic_bias = xnn_set1_f32(12582912.0f); const HVX_Vector vmagic_bias_less_zero_point = Q6_V_vsplat_R(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const HVX_Vector voutput_min = Q6_Vb_vsplat_R(params->scalar.output_min); - const HVX_Vector voutput_max = Q6_Vb_vsplat_R(params->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); $if BATCH_TILE > 32: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -55,10 +53,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__hvx_u${BATCH_TILE}( $else: HVX_Vector vy${int(N/128)} = Q6_Vb_vpack_VhVh_sat(vacc_h${int(N/64)}, vacc_h${int(N/64)}); - $for N in range(0, BATCH_TILE, 128): - vy${int(N/128)} = Q6_Vb_vmax_VbVb(voutput_min, vy${int(N/128)}); - vy${int(N/128)} = Q6_Vb_vmin_VbVb(voutput_max, vy${int(N/128)}); - $for N in range(0, BATCH_TILE, 128): $if N + 128 <= BATCH_TILE: *((HVX_UVector *) output) = vy${int(N/128)}; @@ -79,9 +73,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__hvx_u${BATCH_TILE}( HVX_Vector vy = 
Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - Q6_V_vstu_variable(output, 32, vy); output += 32; } @@ -98,9 +89,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__hvx_u${BATCH_TILE}( HVX_Vector vy = Q6_Vb_vpack_VhVh_sat(vacc_h, vacc_h); - vy = Q6_Vb_vmax_VbVb(voutput_min, vy); - vy = Q6_Vb_vmin_VbVb(voutput_max, vy); - // Since the output data type is int8_t, // we simply determine the number of elements using batch >> 2 // without multiplying by sizeof(int8_t). diff --git a/src/f32-qs8-vcvt/neon.c.in b/src/f32-qs8-vcvt/neon.c.in index 4ceffd80b50..fb391ad4300 100644 --- a/src/f32-qs8-vcvt/neon.c.in +++ b/src/f32-qs8-vcvt/neon.c.in @@ -29,10 +29,6 @@ $VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE] $VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE] $VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE] $VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE] -$VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE] -$VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE] -$VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE] -$VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neon_u${BATCH_TILE}( size_t batch, const float* input, @@ -47,12 +43,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neon_u${BATCH_TILE}( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - $if BATCH_TILE > 8: - const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(¶ms->scalar.output_min); - const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(¶ms->scalar.output_max); - $else: - const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(¶ms->scalar.output_min); - const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -77,18 +67,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neon_u${BATCH_TILE}( $else: ${XINT8X8_T} vy${ABC[N:N+8]} = ${VQMOVXN_S16}(vacc${ABC[N:N+8]}); - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${VMAXQ_X8}(vy${ABC[N:N+16]}, voutput_min); - $else: - vy${ABC[N:N+8]} = ${VMAX_X8}(vy${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_min)); - - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${VMINQ_X8}(vy${ABC[N:N+16]}, voutput_max); - $else: - vy${ABC[N:N+8]} = ${VMIN_X8}(vy${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_max)); - $for N in range(0, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: ${VST1Q_X8}(output, vy${ABC[N:N+16]}); output += 16; @@ -111,12 +89,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neon_u${BATCH_TILE}( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); ${XINT8X8_T} vy = ${VQMOVXN_S16}(vacc); - $if BATCH_TILE > 8: - vy = ${VMAX_X8}(vy, ${VGET_LOW_X8}(voutput_min)); - vy = ${VMIN_X8}(vy, ${VGET_LOW_X8}(voutput_max)); - $else: - vy = ${VMAX_X8}(vy, voutput_min); - vy = ${VMIN_X8}(vy, voutput_max); ${VST1_X8}(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -138,12 +110,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neon_u${BATCH_TILE}( const int16x8_t vacc = 
vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); ${XINT8X8_T} vy = ${VQMOVXN_S16}(vacc); - $if BATCH_TILE > 8: - vy = ${VMAX_X8}(vy, ${VGET_LOW_X8}(voutput_min)); - vy = ${VMIN_X8}(vy, ${VGET_LOW_X8}(voutput_max)); - $else: - vy = ${VMAX_X8}(vy, voutput_min); - vy = ${VMIN_X8}(vy, voutput_max); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/neonv8.c.in b/src/f32-qs8-vcvt/neonv8.c.in index dc21df69461..998aa16bdaf 100644 --- a/src/f32-qs8-vcvt/neonv8.c.in +++ b/src/f32-qs8-vcvt/neonv8.c.in @@ -29,10 +29,6 @@ $VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE] $VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE] $VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE] $VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE] -$VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE] -$VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE] -$VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE] -$VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neonv8_u${BATCH_TILE}( size_t batch, const float* input, @@ -46,12 +42,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neonv8_u${BATCH_TILE}( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - $if BATCH_TILE > 8: - const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(¶ms->scalar.output_min); - const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(¶ms->scalar.output_max); - $else: - const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(¶ms->scalar.output_min); - const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(¶ms->scalar.output_max); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { $for N in range(0, BATCH_TILE, 4): @@ -75,18 +65,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neonv8_u${BATCH_TILE}( $else: ${XINT8X8_T} vy${ABC[N:N+8]} = ${VQMOVXN_S16}(vacc${ABC[N:N+8]}); - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${VMAXQ_X8}(vy${ABC[N:N+16]}, voutput_min); - $else: - vy${ABC[N:N+8]} = ${VMAX_X8}(vy${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_min)); - - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${VMINQ_X8}(vy${ABC[N:N+16]}, voutput_max); - $else: - vy${ABC[N:N+8]} = ${VMIN_X8}(vy${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_max)); - $for N in range(0, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: ${VST1Q_X8}(output, vy${ABC[N:N+16]}); output += 16; @@ -107,12 +85,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neonv8_u${BATCH_TILE}( vacc = vqaddq_s16(vacc, voutput_zero_point); ${XINT8X8_T} vy = ${VQMOVXN_S16}(vacc); - $if BATCH_TILE > 8: - vy = ${VMAX_X8}(vy, ${VGET_LOW_X8}(voutput_min)); - vy = ${VMIN_X8}(vy, ${VGET_LOW_X8}(voutput_max)); - $else: - vy = ${VMAX_X8}(vy, voutput_min); - vy = ${VMIN_X8}(vy, voutput_max); ${VST1_X8}(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -132,12 +104,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__neonv8_u${BATCH_TILE}( vacc = vqaddq_s16(vacc, voutput_zero_point); ${XINT8X8_T} vy = ${VQMOVXN_S16}(vacc); - $if BATCH_TILE > 8: - vy = ${VMAX_X8}(vy, ${VGET_LOW_X8}(voutput_min)); - vy = ${VMIN_X8}(vy, ${VGET_LOW_X8}(voutput_max)); - $else: - vy = ${VMAX_X8}(vy, voutput_min); - vy = ${VMIN_X8}(vy, voutput_max); if (batch 
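The NEON and NEONv8 templates lose their VMAX/VMIN macro tables for the same reason: the saturating narrows they already use (vqmovn_s16 for QS8, vqmovun_s16 for QU8) clamp to the full 8-bit range, so the per-tile max/min against output_min/output_max became dead code. A short sketch, assuming <arm_neon.h>:

#include <arm_neon.h>

// Saturating narrows already bound the result to [-128, 127] / [0, 255].
static inline int8x8_t narrow_to_qs8(int16x8_t vacc) {
  return vqmovn_s16(vacc);
}
static inline uint8x8_t narrow_to_qu8(int16x8_t vacc) {
  return vqmovun_s16(vacc);
}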
& (4 * sizeof(float))) { vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vy), 0); output += 4; diff --git a/src/f32-qs8-vcvt/rvv.c.in b/src/f32-qs8-vcvt/rvv.c.in index a725a9d4f0e..c389a826ee2 100755 --- a/src/f32-qs8-vcvt/rvv.c.in +++ b/src/f32-qs8-vcvt/rvv.c.in @@ -15,6 +15,8 @@ $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] #include "xnnpack/vcvt.h" +$OUTPUT_MIN = {"QS8": -128, "QU8": 0}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__rvv_u${LMUL}v( size_t batch, const float* input, @@ -29,8 +31,9 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__rvv_u${LMUL}v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) ${OUTPUT_MIN} - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qs8-vcvt/scalar-fmagic.c.in b/src/f32-qs8-vcvt/scalar-fmagic.c.in index 94218c1745a..72a150faea9 100644 --- a/src/f32-qs8-vcvt/scalar-fmagic.c.in +++ b/src/f32-qs8-vcvt/scalar-fmagic.c.in @@ -17,6 +17,8 @@ $INPUT_T = {"F16": "xnn_float16", "F32": "float"}[IDATATYPE] $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[ODATATYPE] $MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32" $MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32" +$OUTPUT_MIN = {"QS8": -128, "QU8": 0}[ODATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[ODATATYPE] void xnn_${IDATATYPE.lower()}_${ODATATYPE.lower()}_vcvt_ukernel__${"wasm" if WASM else "scalar"}_fmagic_u${BATCH_TILE}( size_t batch, const ${INPUT_T}* input, @@ -34,8 +36,8 @@ void xnn_${IDATATYPE.lower()}_${ODATATYPE.lower()}_vcvt_ukernel__${"wasm" if WAS $else: const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) ${OUTPUT_MIN} - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/scalar-imagic.c.in b/src/f32-qs8-vcvt/scalar-imagic.c.in index 2d83fb24ed0..a14d728bf58 100644 --- a/src/f32-qs8-vcvt/scalar-imagic.c.in +++ b/src/f32-qs8-vcvt/scalar-imagic.c.in @@ -15,6 +15,8 @@ $assert IDATATYPE == "F16" and ODATATYPE == "QS8" or IDATATYPE == "F32" $INPUT_T = {"F16": "xnn_float16", "F32": "float"}[IDATATYPE] $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[ODATATYPE] +$OUTPUT_MIN = {"QS8": -128, "QU8": 0}[ODATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[ODATATYPE] void 
xnn_${IDATATYPE.lower()}_${ODATATYPE.lower()}_vcvt_ukernel__${"wasm" if WASM else "scalar"}_imagic_u${BATCH_TILE}( size_t batch, const ${INPUT_T}* input, @@ -33,8 +35,8 @@ void xnn_${IDATATYPE.lower()}_${ODATATYPE.lower()}_vcvt_ukernel__${"wasm" if WAS const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) ${OUTPUT_MIN} - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qs8-vcvt/scalar-lrintf.c.in b/src/f32-qs8-vcvt/scalar-lrintf.c.in index bbe2352e5ad..808ae9f2117 100644 --- a/src/f32-qs8-vcvt/scalar-lrintf.c.in +++ b/src/f32-qs8-vcvt/scalar-lrintf.c.in @@ -15,6 +15,8 @@ $assert BATCH_TILE >= 1 $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32" $MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32" +$OUTPUT_MIN = {"QS8": -128, "QU8": 0}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${"wasm" if WASM else "scalar"}_lrintf_u${BATCH_TILE}( size_t batch, const float* input, @@ -27,8 +29,8 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${"wasm" if WASM else "scalar"}_l assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) ${OUTPUT_MIN} - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; $if BATCH_TILE == 1: diff --git a/src/f32-qs8-vcvt/sse.c.in b/src/f32-qs8-vcvt/sse.c.in index 0cf69499c38..4442ccc3a76 100644 --- a/src/f32-qs8-vcvt/sse.c.in +++ b/src/f32-qs8-vcvt/sse.c.in @@ -21,7 +21,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $ISA = {2: "sse2", 4: "sse41"}[SSE] $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${ISA}_u${BATCH_TILE}( size_t batch, const float* input, @@ -34,16 +34,11 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${ISA}_u${BATCH_TILE}( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) 
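The RVV and scalar templates switch from reading params->scalar.output_min/output_max to baking the numeric range of the output type into each instantiation through the new ${OUTPUT_MIN}/${OUTPUT_MAX} substitutions. A sketch of the values this produces per datatype (hypothetical helper names, shown only to make the substitution concrete):

#include <stdint.h>

// QS8 instantiation: OUTPUT_MIN = -128, OUTPUT_MAX = 127.
static inline float qs8_output_max_less_zero_point(int32_t zero_point) {
  return (float) (127 - zero_point);
}
// QU8 instantiation: OUTPUT_MIN = 0, OUTPUT_MAX = 255.
static inline float qu8_output_max_less_zero_point(int32_t zero_point) {
  return (float) (255 - zero_point);
}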
params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) ${OUTPUT_MAX} - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - $if DATATYPE == "QS8" and SSE < 4: - const __m128i voutput_min = _mm_set1_epi16(params->scalar.output_min); - $if DATATYPE == "QU8" or SSE == 4: - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -67,23 +62,12 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${ISA}_u${BATCH_TILE}( $for N in range(0, BATCH_TILE, 8): vy${ABC[N:N+8]} = _mm_adds_epi16(vy${ABC[N:N+8]}, voutput_zero_point); - $if DATATYPE == "QS8" and SSE < 4: - $for N in range(0, BATCH_TILE, 8): - vy${ABC[N:N+8]} = _mm_max_epi16(vy${ABC[N:N+8]}, voutput_min); - $for N in range(0, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: __m128i vy${ABC[N:N+16]} = ${_MM_PACKXS_EPI16}(vy${ABC[N:N+8]}, vy${ABC[N+8:N+16]}); $else: vy${ABC[N:N+8]} = ${_MM_PACKXS_EPI16}(vy${ABC[N:N+8]}, vy${ABC[N:N+8]}); - $if DATATYPE == "QU8" or SSE == 4: - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+16]}, voutput_min); - $else: - vy${ABC[N:N+8]} = ${_MM_MAX_EPX8}(vy${ABC[N:N+8]}, voutput_min); - _mm_storeu_si128((__m128i*) output, vy${ABC[0:16]}); $for N in range(16, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: @@ -108,11 +92,7 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${ISA}_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - $if DATATYPE == "QS8" and SSE < 4: - vy = _mm_max_epi16(vy, voutput_min); vy = ${_MM_PACKXS_EPI16}(vy, vy); - $if DATATYPE == "QU8" or SSE == 4: - vy = ${_MM_MAX_EPX8}(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -133,11 +113,7 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__${ISA}_u${BATCH_TILE}( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); - $if DATATYPE == "QS8" and SSE < 4: - vy = _mm_max_epi16(vy, voutput_min); vy = ${_MM_PACKXS_EPI16}(vy, vy); - $if DATATYPE == "QU8" or SSE == 4: - vy = ${_MM_MAX_EPX8}(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qs8-vcvt/wasmsimd-cvt.c.in b/src/f32-qs8-vcvt/wasmsimd-cvt.c.in index bb18a7ee250..318983d5f3f 100644 --- a/src/f32-qs8-vcvt/wasmsimd-cvt.c.in +++ b/src/f32-qs8-vcvt/wasmsimd-cvt.c.in @@ -17,8 +17,6 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE] -$WASM_X8X16_MIN = {"QS8": "wasm_i8x16_min", "QU8": "wasm_u8x16_min"}[DATATYPE] -$WASM_X8X16_MAX = {"QS8": "wasm_i8x16_max", "QU8": "wasm_u8x16_max"}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_cvt_u${BATCH_TILE}( size_t batch, const float* input, @@ -32,12 +30,8 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_cvt_u${BATCH_TILE}( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = 
wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { v128_t vx${ABC[0:4]} = wasm_v128_load(input); @@ -66,18 +60,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_cvt_u${BATCH_TILE}( $else: v128_t vy${ABC[N:N+8]} = ${WASM_X8X16_NARROW_I16X8}(vacc${ABC[N:N+8]}, vacc${ABC[N:N+8]}); - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${WASM_X8X16_MAX}(vy${ABC[N:N+16]}, voutput_min); - $else: - vy${ABC[N:N+8]} = ${WASM_X8X16_MAX}(vy${ABC[N:N+8]}, voutput_min); - - $for N in range(0, BATCH_TILE, 16): - $if N + 8 < BATCH_TILE: - vy${ABC[N:N+16]} = ${WASM_X8X16_MIN}(vy${ABC[N:N+16]}, voutput_max); - $else: - vy${ABC[N:N+8]} = ${WASM_X8X16_MIN}(vy${ABC[N:N+8]}, voutput_max); - wasm_v128_store(output, vy${ABC[0:16]}); $for N in range(16, BATCH_TILE, 16): $if N + 8 < BATCH_TILE: @@ -104,8 +86,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_cvt_u${BATCH_TILE}( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = ${WASM_X8X16_NARROW_I16X8}(vacc, vacc); - vy = ${WASM_X8X16_MAX}(vy, voutput_min); - vy = ${WASM_X8X16_MIN}(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -130,8 +110,6 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_cvt_u${BATCH_TILE}( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = ${WASM_X8X16_NARROW_I16X8}(vacc, vacc); - vy = ${WASM_X8X16_MAX}(vy, voutput_min); - vy = ${WASM_X8X16_MIN}(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qs8-vcvt/wasmsimd-magic.c.in b/src/f32-qs8-vcvt/wasmsimd-magic.c.in index 5f0bee52414..160157276c2 100644 --- a/src/f32-qs8-vcvt/wasmsimd-magic.c.in +++ b/src/f32-qs8-vcvt/wasmsimd-magic.c.in @@ -19,6 +19,8 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] $WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE] $WASM_X8X16_MIN = {"QS8": "wasm_i8x16_min", "QU8": "wasm_u8x16_min"}[DATATYPE] +$OUTPUT_MIN = {"QS8": -128, "QU8": 0}[DATATYPE] +$OUTPUT_MAX = {"QS8": 127, "QU8": 255}[DATATYPE] void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_magic_u${BATCH_TILE}( size_t batch, const float* input, @@ -30,12 +32,12 @@ void xnn_f32_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_magic_u${BATCH_TILE}( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) ${OUTPUT_MIN} - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(${OUTPUT_MAX}); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); 
XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qu8-vcvt/f32-qu8-vcvt.h b/src/f32-qu8-vcvt/f32-qu8-vcvt.h index 9f664a284d4..7ce3eaec17b 100644 --- a/src/f32-qu8-vcvt/f32-qu8-vcvt.h +++ b/src/f32-qu8-vcvt/f32-qu8-vcvt.h @@ -47,11 +47,14 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qu8_vcvt_ukernel__avx2_u1 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qu8_vcvt_ukernel__avx2_u32, 32, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qu8_vcvt_ukernel__avx2_u48, 48, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_qu8_vcvt_ukernel__avx2_u64, 64, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qu8_vcvt_ukernel__avx512skx_u32, 32, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qu8_vcvt_ukernel__avx512skx_u64, 64, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qu8_vcvt_ukernel__avx512skx_u96, 96, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_qu8_vcvt_ukernel__avx512skx_u128, 128, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u8, 8, false, float, uint8_t, struct xnn_f32_qu8_cvt_params, xnn_init_f32_qu8_cvt_scalar_params) diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c index 0231f8f7614..1b79bfbe5c4 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c @@ -30,13 +30,11 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -60,8 +58,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u16( __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); output += 16; } @@ -76,7 +72,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u16( __m128i 
vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -95,7 +90,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c index 8f90d0f07ac..700c23d66de 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c @@ -30,13 +30,11 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u24( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -67,9 +65,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u24( __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); vyGHIJKLMN = _mm_packus_epi16(vyGHIJKLMN, vyGHIJKLMN); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epu8(vyGHIJKLMN, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storel_epi64((__m128i*) (output + 16), vyGHIJKLMN); output += 24; @@ -85,7 +80,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u24( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -104,7 +98,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u24( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u32.c index 560d212fa8f..c255c8f1d9b 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u32.c @@ -30,13 +30,11 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u32( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) 
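For the QU8 AVX and AVX2 kernels the argument is the unsigned analogue: _mm_packus_epi16 (and _mm256_packus_epi16) saturate each int16 lane to [0, 255], so with output_min pinned at 0 the removed _mm_max_epu8 against voutput_min was a no-op. Sketch, assuming <immintrin.h>:

#include <immintrin.h>

// The unsigned saturating pack already enforces the [0, 255] range.
static inline __m128i narrow_to_qu8(__m128i vy_lo, __m128i vy_hi) {
  return _mm_packus_epi16(vy_lo, vy_hi);
}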
params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m256 vx01234567 = _mm256_loadu_ps(input); @@ -73,9 +71,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u32( __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); __m128i vyGHIJKLMNOPQRSTUV = _mm_packus_epi16(vyGHIJKLMN, vyOPQRSTUV); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = _mm_max_epu8(vyGHIJKLMNOPQRSTUV, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storeu_si128((__m128i*) (output + 16), vyGHIJKLMNOPQRSTUV); output += 32; @@ -91,7 +86,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -110,7 +104,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c index c61a5b69643..7e2c779b842 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u8.c @@ -30,13 +30,11 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u8( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { __m256 vx = _mm256_loadu_ps(input); @@ -49,7 +47,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u8( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -68,7 +65,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx_u8( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u16.c index e1be2cd4ad6..308d0f224aa 100644 --- 
a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u16.c @@ -30,13 +30,11 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u16( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -60,8 +58,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u16( __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123); output += 16; } @@ -76,7 +72,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -95,7 +90,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u16( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u32.c index ac968a255c2..5e37dc74d4b 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u32.c @@ -30,15 +30,13 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u32( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -72,8 +70,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u32( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); - vy01234567 = _mm256_max_epu8(vy01234567, voutput_min); - _mm256_storeu_si256((__m256i*) output, vy01234567); output += 32; } 
@@ -88,7 +84,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -107,7 +102,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u32( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u48.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u48.c index d7fda136262..fd5add47ae7 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u48.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u48.c @@ -30,15 +30,13 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u48( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 48 * sizeof(float); batch -= 48 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -84,9 +82,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u48( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); __m128i vy89AB = _mm_shuffle_epi32(vy8A9B, _MM_SHUFFLE(3, 1, 2, 0)); - vy01234567 = _mm256_max_epu8(vy01234567, voutput_min); - vy89AB = _mm_max_epu8(vy89AB, _mm256_castsi256_si128(voutput_min)); - _mm256_storeu_si256((__m256i*) output, vy01234567); _mm_storeu_si128((__m128i*) (output + 32), vy89AB); output += 48; @@ -102,7 +97,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u48( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -121,7 +115,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u48( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u64.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u64.c index ff915dc9f72..07f2dcd7642 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u64.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-u64.c @@ -30,15 +30,13 @@ 
void xnn_f32_qu8_vcvt_ukernel__avx2_u64( static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; const __m256 vscale = _mm256_set1_ps(params->scalar.scale); - const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m256 voutput_max_less_zero_point = _mm256_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m256i voutput_zero_point = _mm256_set1_epi16(params->scalar.output_zero_point); XNN_ALIGN(32) static const uint32_t shuffle_mask[8] = {0, 4, 1, 5, 2, 6, 3, 7}; const __m256i vshuffle_mask = _mm256_load_si256((const __m256i*) shuffle_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { __m256 vx01 = _mm256_loadu_ps(input); @@ -94,9 +92,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u64( __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy02461357, vshuffle_mask); __m256i vy89ABCDEF = _mm256_permutevar8x32_epi32(vy8ACE9BDF, vshuffle_mask); - vy01234567 = _mm256_max_epu8(vy01234567, voutput_min); - vy89ABCDEF = _mm256_max_epu8(vy89ABCDEF, voutput_min); - _mm256_storeu_si256((__m256i*) output, vy01234567); _mm256_storeu_si256((__m256i*) (output + 32), vy89ABCDEF); output += 64; @@ -112,7 +107,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u64( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -131,7 +125,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx2_u64( __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extracti128_si256(vacc, 1)); vy = _mm_adds_epi16(vy, _mm256_castsi256_si128(voutput_zero_point)); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, _mm256_castsi256_si128(voutput_min)); if (batch & (4 * sizeof(float))) { _mm_storeu_si32(output, vy); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u128.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u128.c index 52503bc5c83..57e686a05e7 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u128.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u128.c @@ -31,14 +31,12 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u128( const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 128 * sizeof(float); batch -= 128 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -90,9 +88,6 @@ void 
xnn_f32_qu8_vcvt_ukernel__avx512skx_u128( __m512i vy048C159D26AE37BF = _mm512_packus_epi16(vacc04152637, vacc8C9DAEBF); __m512i vyGKOSHLPTIMQUJNRV = _mm512_packus_epi16(vaccGKHLIMJN, vaccOSPTQURV); - vy048C159D26AE37BF = _mm512_max_epu8(vy048C159D26AE37BF, voutput_min); - vyGKOSHLPTIMQUJNRV = _mm512_max_epu8(vyGKOSHLPTIMQUJNRV, voutput_min); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); const __m512i vyGHIJKLMNOPQRSTUV = _mm512_permutexvar_epi32(vshuffle512_mask, vyGKOSHLPTIMQUJNRV); @@ -112,7 +107,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u128( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -135,7 +129,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u128( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u32.c index 774916be772..ad642cedd2c 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u32.c @@ -31,14 +31,12 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u32( XNN_ALIGN(32) static const uint32_t shuffle256_mask[8] = {0, 4, 2, 6, 1, 5, 3, 7}; const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m256i vshuffle256_mask = _mm256_load_si256((const __m256i*) shuffle256_mask); - const __m256i voutput_min = _mm256_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -59,8 +57,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u32( __m256i vy04261537 = _mm256_packus_epi16(_mm512_castsi512_si256(vacc04152637), _mm512_extracti32x8_epi32(vacc04152637, 1)); - vy04261537 = _mm256_max_epu8(vy04261537, voutput_min); - const __m256i vy01234567 = _mm256_permutevar8x32_epi32(vy04261537, vshuffle256_mask); _mm256_storeu_si256((__m256i*) output, vy01234567); @@ -78,7 +74,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u32( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, 
_mm256_castsi256_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -101,7 +96,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u32( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm256_castsi256_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u64.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u64.c index 90ebb74ed21..ca20f0ca9d2 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u64.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u64.c @@ -31,14 +31,12 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u64( const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -69,8 +67,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u64( __m512i vy048C159D26AE37BF = _mm512_packus_epi16(vacc04152637, vacc8C9DAEBF); - vy048C159D26AE37BF = _mm512_max_epu8(vy048C159D26AE37BF, voutput_min); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); _mm512_storeu_si512(output, vy0123456789ABCDEF); @@ -88,7 +84,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u64( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -111,7 +106,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u64( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u96.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u96.c index 85e158bdc6b..abcf436c122 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u96.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-u96.c @@ -32,15 +32,13 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u96( XNN_ALIGN(32) static const uint32_t shuffle256_mask[8] = {0, 4, 2, 6, 1, 5, 3, 7}; const __m512 vscale = _mm512_set1_ps(params->scalar.scale); - const __m512 
voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m512 voutput_max_less_zero_point = _mm512_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m512i voutput_zero_point = _mm512_set1_epi16(params->scalar.output_zero_point); const __m512i vshuffle512_mask = _mm512_load_si512(shuffle512_mask); const __m256i vshuffle256_mask = _mm256_load_si256((const __m256i*) shuffle256_mask); - const __m512i voutput_min = _mm512_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 96 * sizeof(float); batch -= 96 * sizeof(float)) { __m512 vx0123 = _mm512_loadu_ps(input); __m512 vx4567 = _mm512_loadu_ps(input + 16); @@ -82,9 +80,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u96( __m512i vy048C159D26AE37BF = _mm512_packus_epi16(vacc04152637, vacc8C9DAEBF); __m256i vyGKIMHLJN = _mm256_packus_epi16(_mm512_castsi512_si256(vaccGKHLIMJN), _mm512_extracti32x8_epi32(vaccGKHLIMJN, 1)); - vy048C159D26AE37BF = _mm512_max_epu8(vy048C159D26AE37BF, voutput_min); - vyGKIMHLJN = _mm256_max_epu8(vyGKIMHLJN, _mm512_castsi512_si256(voutput_min)); - const __m512i vy0123456789ABCDEF = _mm512_permutexvar_epi32(vshuffle512_mask, vy048C159D26AE37BF); const __m256i vyGHIJKLMN = _mm256_permutevar8x32_epi32(vyGKIMHLJN, vshuffle256_mask); @@ -104,7 +99,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u96( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_storeu_si128((__m128i*) output, vy0123); output += 16; @@ -127,7 +121,6 @@ void xnn_f32_qu8_vcvt_ukernel__avx512skx_u96( vacc0213 = _mm256_adds_epi16(vacc0213, _mm512_castsi512_si256(voutput_zero_point)); const __m128i vy0213 = _mm_packus_epi16(_mm256_castsi256_si128(vacc0213), _mm256_extracti128_si256(vacc0213, 1)); __m128i vy0123 = _mm_shuffle_epi32(vy0213, _MM_SHUFFLE(3, 1, 2, 0)); - vy0123 = _mm_max_epu8(vy0123, _mm512_castsi512_si128(voutput_min)); _mm_mask_storeu_epi8(output, vmask, vy0123); } diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u16.c index 800f3c4011c..d6379ac534b 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u16.c @@ -30,8 +30,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u16( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -59,10 +57,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u16( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, 
voutput_max); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -81,8 +75,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u16( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -104,8 +96,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u16( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u24.c index b24291995a3..9c3c7a37b5b 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u24.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u24.c @@ -30,8 +30,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u24( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -69,12 +67,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u24( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x8_t vyGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = vmax_u8(vyGHIJKLMN, vget_low_u8(voutput_min)); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = vmin_u8(vyGHIJKLMN, vget_low_u8(voutput_max)); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; vst1_u8(output, vyGHIJKLMN); output += 8; } @@ -94,8 +86,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u24( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -117,8 +107,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u24( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u32.c index 7b99af2082f..b7e8ebcae3a 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u32.c @@ -30,8 +30,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u32( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const uint8x16_t voutput_min = 
vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; @@ -78,12 +76,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u32( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x16_t vyGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = vmaxq_u8(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = vminq_u8(vyGHIJKLMNOPQRSTUV, voutput_max); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; vst1q_u8(output, vyGHIJKLMNOPQRSTUV); output += 16; } @@ -103,8 +95,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u32( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -126,8 +116,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u32( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u8.c index 98a17a6f2ad..031c7ec5792 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-u8.c @@ -30,8 +30,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u8( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const float32x4_t vmagic_bias = vdupq_n_f32(12582912.0f); const int32x4_t vmagic_bias_less_zero_point = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const uint8x8_t voutput_min = vld1_dup_u8(¶ms->scalar.output_min); - const uint8x8_t voutput_max = vld1_dup_u8(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vmagic_bias); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx_lo = vld1q_f32(input); input += 4; @@ -49,8 +47,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u8( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, voutput_min); - vy = vmin_u8(vy, voutput_max); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -72,8 +68,6 @@ void xnn_f32_qu8_vcvt_ukernel__neon_u8( const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, voutput_min); - vy = vmin_u8(vy, voutput_max); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u16.c index f4a83df4736..39c606fdd6d 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u16.c @@ -29,8 +29,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u16( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const uint8x16_t 
voutput_min = vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -55,10 +53,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u16( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { @@ -75,8 +69,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -96,8 +88,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u16( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u24.c index 967b8792a69..4928a0e41ac 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u24.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u24.c @@ -29,8 +29,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u24( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -64,12 +62,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u24( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x8_t vyGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = vmax_u8(vyGHIJKLMN, vget_low_u8(voutput_min)); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = vmin_u8(vyGHIJKLMN, vget_low_u8(voutput_max)); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; vst1_u8(output, vyGHIJKLMN); output += 8; } @@ -87,8 +79,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -108,8 +98,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u24( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u32.c index 2908f275a66..175c99cc179 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u32.c +++ 
b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u32.c @@ -29,8 +29,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u32( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->scalar.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->scalar.output_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; @@ -72,12 +70,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u32( uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x16_t vyGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); - vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = vmaxq_u8(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = vminq_u8(vyGHIJKLMNOPQRSTUV, voutput_max); - vst1q_u8(output, vy0123456789ABCDEF); output += 16; vst1q_u8(output, vyGHIJKLMNOPQRSTUV); output += 16; } @@ -95,8 +87,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -116,8 +106,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u32( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, vget_low_u8(voutput_min)); - vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u8.c index 5f871cfe28a..aa75c316192 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-u8.c @@ -29,8 +29,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u8( const float32x4_t vscale = vld1q_dup_f32(¶ms->scalar.scale); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->scalar.output_zero_point); - const uint8x8_t voutput_min = vld1_dup_u8(¶ms->scalar.output_min); - const uint8x8_t voutput_max = vld1_dup_u8(¶ms->scalar.output_max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx_lo = vld1q_f32(input); input += 4; float32x4_t vx_hi = vld1q_f32(input); input += 4; @@ -45,8 +43,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, voutput_min); - vy = vmin_u8(vy, voutput_max); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { @@ -66,8 +62,6 @@ void xnn_f32_qu8_vcvt_ukernel__neonv8_u8( vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); - vy = vmax_u8(vy, voutput_min); - vy = vmin_u8(vy, voutput_max); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u1v.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u1v.c index 12c86a5427b..b3ffa68167c 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u1v.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u1v.c @@ -29,8 +29,9 @@ void xnn_f32_qu8_vcvt_ukernel__rvv_u1v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - 
const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c index 57d41b0e906..3223238d890 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c @@ -29,8 +29,9 @@ void xnn_f32_qu8_vcvt_ukernel__rvv_u2v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u4v.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u4v.c index 4d654460c75..801d4ea26ef 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u4v.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u4v.c @@ -29,8 +29,9 @@ void xnn_f32_qu8_vcvt_ukernel__rvv_u4v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u8v.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u8v.c index a344b38cffe..1c897023906 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u8v.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u8v.c @@ -29,8 +29,9 @@ void xnn_f32_qu8_vcvt_ukernel__rvv_u8v( batch >>= XNN_LOG2_SIZEOF_FLOAT; const float scale = params->scalar.scale; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + // TODO: Clamp may not be necessary. RISCV spec doesn't say if vncvt saturates... 
+ const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t output_zero_point = params->scalar.output_zero_point; for (; batch > 0; ) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u1.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u1.c index 6bf96a303b4..2bf4b8c50f1 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u1.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u1.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_u1( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u2.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u2.c index d0b4631e28e..6ed343f8d9d 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u2.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u2.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_u2( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u3.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u3.c index 61d888901c8..15c59c505f5 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u3.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u3.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_u3( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u4.c 
b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u4.c index d140844158b..f6bd1e419b1 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u4.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-u4.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_u4( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u1.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u1.c index bd2387ce372..46ad989c1ce 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u1.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u1.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u1( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u2.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u2.c index df926a44093..61a7edfbac5 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u2.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u2.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u2( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u3.c 
b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u3.c index a406101ba2f..14c2e17cdab 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u3.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u3.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u3( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u4.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u4.c index c6e30581518..0b2e85261a2 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u4.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-u4.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u4( const float* i = input; const float vscale = params->scalar.scale; const float vmagic_bias = 12582912.0f; - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float output_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point); const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point); const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u1.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u1.c index 74343847a58..5e2e027a2b2 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u1.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u1.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_u1( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; do { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u2.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u2.c index 
8f06f8c3c99..f95a3eaf2d0 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u2.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u2.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_u2( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 2 * sizeof(float); batch -= 2 * sizeof(float)) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u3.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u3.c index fd2fe136f5f..ce552f0fab4 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u3.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u3.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_u3( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 3 * sizeof(float); batch -= 3 * sizeof(float)) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u4.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u4.c index e16efd930d5..0bd3ba88488 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u4.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-u4.c @@ -27,8 +27,8 @@ void xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_u4( assert(output != NULL); const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const int32_t voutput_zero_point = params->scalar.output_zero_point; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u16.c index d8af609659e..4e8538adb6c 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u16.c @@ -28,13 +28,11 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u16( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 
voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -64,11 +62,8 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u16( vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point); vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); output += 16; } @@ -89,7 +84,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -111,7 +105,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u16( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u24.c index 93981b75964..dcce8e75919 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u24.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u24.c @@ -28,13 +28,11 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u24( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -74,13 +72,9 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u24( vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point); vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); vyGHIJKLMN = _mm_packus_epi16(vyGHIJKLMN, vyGHIJKLMN); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = _mm_max_epu8(vyGHIJKLMN, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storel_epi64((__m128i*) (output + 16), vyGHIJKLMN); output += 24; @@ -102,7 +96,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u24( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -124,7 +117,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u24( __m128i vy = _mm_packs_epi32(vy_lo, 
vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c index fc028418ac7..d40037131e7 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u32.c @@ -28,13 +28,11 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u32( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { __m128 vx0123 = _mm_loadu_ps(input); @@ -84,13 +82,9 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u32( vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point); vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point); - __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF); __m128i vyGHIJKLMNOPQRSTUV = _mm_packus_epi16(vyGHIJKLMN, vyOPQRSTUV); - vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = _mm_max_epu8(vyGHIJKLMNOPQRSTUV, voutput_min); - _mm_storeu_si128((__m128i*) output, vy0123456789ABCDEF); _mm_storeu_si128((__m128i*) (output + 16), vyGHIJKLMNOPQRSTUV); output += 32; @@ -112,7 +106,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -134,7 +127,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u32( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u8.c index fcb6e1bed70..6e8c2014b91 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-u8.c @@ -28,13 +28,11 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u8( assert(output != NULL); const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point)); + const __m128 voutput_max_less_zero_point = _mm_set1_ps((float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point)); const __m128i voutput_zero_point = _mm_set1_epi16(params->scalar.output_zero_point); - const __m128i voutput_min = _mm_set1_epi8(params->scalar.output_min); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_max_less_zero_point); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { 
__m128 vx_lo = _mm_loadu_ps(input); @@ -53,7 +51,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); _mm_storel_epi64((__m128i*) output, vy); output += 8; @@ -75,7 +72,6 @@ void xnn_f32_qu8_vcvt_ukernel__sse2_u8( __m128i vy = _mm_packs_epi32(vy_lo, vy_hi); vy = _mm_adds_epi16(vy, voutput_zero_point); vy = _mm_packus_epi16(vy, vy); - vy = _mm_max_epu8(vy, voutput_min); if (batch & (4 * sizeof(float))) { unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vy)); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u1.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u1.c index c3ae46820a9..42f9be10b53 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u1.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u1.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_u1( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u2.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u2.c index c4c1570ff6e..65c0619c1db 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u2.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u2.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_u2( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u3.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u3.c index 3c3951aa44c..11aa8edf2cb 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u3.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u3.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_u3( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) 
((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u4.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u4.c index b07449669db..08a1bd0c048 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u4.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-u4.c @@ -26,8 +26,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_u4( const float* i = input; const float vscale = params->scalar.scale; - const float voutput_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); - const float voutput_max_less_zero_point = (float) ((int32_t) params->scalar.output_max - (int32_t) params->scalar.output_zero_point); + const float voutput_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); + const float voutput_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point); const float vmagic_bias = 12582912.0f; const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point; diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u16.c index 6c9b9d6b5dd..6028ac155f2 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u16.c @@ -29,12 +29,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u16( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -65,10 +61,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u16( v128_t vy0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); - vy0123456789ABCDEF = wasm_u8x16_max(vy0123456789ABCDEF, voutput_min); - - vy0123456789ABCDEF = wasm_u8x16_min(vy0123456789ABCDEF, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); output += 16; } @@ -90,8 +82,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u16( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -116,8 +106,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u16( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u24.c index 5574ea37dba..eddb82fa3e9 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u24.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u24.c @@ -29,12 +29,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u24( const v128_t 
vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -76,12 +72,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u24( v128_t vy0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); v128_t vyGHIJKLMN = wasm_u8x16_narrow_i16x8(vaccGHIJKLMN, vaccGHIJKLMN); - vy0123456789ABCDEF = wasm_u8x16_max(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMN = wasm_u8x16_max(vyGHIJKLMN, voutput_min); - - vy0123456789ABCDEF = wasm_u8x16_min(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMN = wasm_u8x16_min(vyGHIJKLMN, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); wasm_v128_store64_lane(output + 16, vyGHIJKLMN, 0); output += 24; @@ -104,8 +94,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u24( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -130,8 +118,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u24( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u32.c index 3aa4990c9ee..839b96b9999 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u32.c @@ -29,12 +29,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u32( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); @@ -86,12 +82,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u32( v128_t vy0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); v128_t vyGHIJKLMNOPQRSTUV = wasm_u8x16_narrow_i16x8(vaccGHIJKLMN, vaccOPQRSTUV); - vy0123456789ABCDEF = wasm_u8x16_max(vy0123456789ABCDEF, voutput_min); - vyGHIJKLMNOPQRSTUV = wasm_u8x16_max(vyGHIJKLMNOPQRSTUV, voutput_min); - - vy0123456789ABCDEF = wasm_u8x16_min(vy0123456789ABCDEF, voutput_max); - vyGHIJKLMNOPQRSTUV = wasm_u8x16_min(vyGHIJKLMNOPQRSTUV, voutput_max); - wasm_v128_store(output, vy0123456789ABCDEF); wasm_v128_store(output + 16, vyGHIJKLMNOPQRSTUV); output += 32; @@ -114,8 +104,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u32( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = 
wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -140,8 +128,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u32( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u8.c index 987cc380756..266c803473b 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-u8.c @@ -29,12 +29,8 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u8( const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t voutput_zero_point = wasm_v128_load16_splat(¶ms->scalar.output_zero_point); - const v128_t voutput_min = wasm_v128_load8_splat(¶ms->scalar.output_min); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(voutput_zero_point); - XNN_FORCE_REALIZATION(voutput_min); - XNN_FORCE_REALIZATION(voutput_max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { v128_t vx_lo = wasm_v128_load(input); v128_t vx_hi = wasm_v128_load(input + 4); @@ -53,8 +49,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u8( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; @@ -79,8 +73,6 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_u8( vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); - vy = wasm_u8x16_max(vy, voutput_min); - vy = wasm_u8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u16.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u16.c index 25090ca0739..7447d350ba3 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u16.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u16.c @@ -28,12 +28,12 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_u16( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(255); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u24.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u24.c index fcb1df590e2..f186cfb1448 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u24.c +++ 
b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u24.c @@ -28,12 +28,12 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_u24( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(255); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u32.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u32.c index 1247cc6fece..c2e51f0ae64 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u32.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u32.c @@ -28,12 +28,12 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_u32( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(255); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u8.c b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u8.c index 1dfb39b6406..3f8d1a469e7 100644 --- a/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u8.c +++ b/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-u8.c @@ -28,12 +28,12 @@ void xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_u8( assert(input != NULL); assert(output != NULL); - const float output_min_less_zero_point = (float) ((int32_t) params->scalar.output_min - (int32_t) params->scalar.output_zero_point); + const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point); const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); const v128_t vmagic_min = wasm_u32x4_splat(float_as_uint32(12582912.0f + output_min_less_zero_point)); const v128_t vmagic_bias_less_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point); - const v128_t voutput_max = wasm_v128_load8_splat(¶ms->scalar.output_max); + const v128_t voutput_max = wasm_u8x16_const_splat(255); XNN_FORCE_REALIZATION(vscale); XNN_FORCE_REALIZATION(vmagic_bias); XNN_FORCE_REALIZATION(vmagic_min); diff --git a/src/f32-vabs/f32-vabs.h 
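/*
 * Illustrative scalar reference (not one of the generated kernels) for the
 * f32->qu8 conversion after this change: the clamp bounds are the fixed QU8
 * range [0, 255] instead of the runtime output_min/output_max params, and
 * rounding uses the same 12582912.0f "magic bias" trick as the fmagic/magic
 * variants above.
 */
#include <stdint.h>
#include <string.h>

static inline uint8_t f32_to_qu8_ref(float x, float scale, int32_t zero_point) {
  const float lo = (float) (0 - zero_point);    /* output_min == 0 */
  const float hi = (float) (255 - zero_point);  /* output_max == 255 */
  float y = x * scale;
  y = y < lo ? lo : y;
  y = y > hi ? hi : y;
  /* Adding 12582912.0f (0x1.8p+23) leaves round-to-nearest-even(y) in the low
   * mantissa bits; reinterpreting as int32 and subtracting the bias with the
   * zero point folded in yields lrintf(y) + zero_point in one step. */
  y += 12582912.0f;
  int32_t bits;
  memcpy(&bits, &y, sizeof(bits));
  return (uint8_t) (bits - (INT32_C(0x4B400000) - zero_point));
}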
b/src/f32-vabs/f32-vabs.h index a73a35540c2..4bc0b787b98 100644 --- a/src/f32-vabs/f32-vabs.h +++ b/src/f32-vabs/f32-vabs.h @@ -36,10 +36,13 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__sse2_u12, 12, false, float, str XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vabs_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vadd.h b/src/f32-vbinary/f32-vadd.h index c0ab4527528..edf8ba1080e 100644 --- a/src/f32-vbinary/f32-vadd.h +++ b/src/f32-vbinary/f32-vadd.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vadd_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vadd_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vadd_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vadd_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vadd_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vaddc.h b/src/f32-vbinary/f32-vaddc.h index ca2080ac180..bc6a89ef61b 100644 --- a/src/f32-vbinary/f32-vaddc.h +++ b/src/f32-vbinary/f32-vaddc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vaddc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vaddc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vaddc_ukernel__avx_u16, 16, false, float, 
struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vaddc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vaddc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vcmul.h b/src/f32-vbinary/f32-vcmul.h index 953bc8752db..49e98be55bf 100644 --- a/src/f32-vbinary/f32-vcmul.h +++ b/src/f32-vbinary/f32-vcmul.h @@ -43,13 +43,15 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vcmul_ukernel__fma3_u8, 8, fa XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vcmul_ukernel__fma3_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vcmul_ukernel__fma3_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vcmul_ukernel__fma3_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) - -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_vcmul_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_vcmul_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_vcmul_ukernel__avx512f_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f32_vcmul_ukernel__avx512f_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcmul_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcmul_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcmul_ukernel__avx512f_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcmul_ukernel__avx512f_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vcmul_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vcmul_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vcopysign.h b/src/f32-vbinary/f32-vcopysign.h index a7710ca6d74..27f9b1358a1 100644 --- a/src/f32-vbinary/f32-vcopysign.h +++ b/src/f32-vbinary/f32-vcopysign.h @@ -29,6 +29,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysign_ukernel__avx_u8, 8, 
XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysign_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysign_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysign_ukernel__avx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysign_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vcopysignc.h b/src/f32-vbinary/f32-vcopysignc.h index 7d989e7674e..c7d189efeb8 100644 --- a/src/f32-vbinary/f32-vcopysignc.h +++ b/src/f32-vbinary/f32-vcopysignc.h @@ -29,6 +29,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysignc_ukernel__avx_u8, 8, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysignc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysignc_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vcopysignc_ukernel__avx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vcopysignc_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vdiv.h b/src/f32-vbinary/f32-vdiv.h index 23937909a3f..4667ce66563 100644 --- a/src/f32-vbinary/f32-vdiv.h +++ b/src/f32-vbinary/f32-vdiv.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdiv_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdiv_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdiv_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdiv_ukernel__avx512f_u16, 16, false, float, struct 
xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdiv_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vdivc.h b/src/f32-vbinary/f32-vdivc.h index e776858bab1..9a8d40f378a 100644 --- a/src/f32-vbinary/f32-vdivc.h +++ b/src/f32-vbinary/f32-vdivc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vdivc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdivc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vdivc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdivc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vdivc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vmax.h b/src/f32-vbinary/f32-vmax.h index 27538e4e25f..e819669fafd 100644 --- a/src/f32-vbinary/f32-vmax.h +++ b/src/f32-vbinary/f32-vmax.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmax_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmax_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmax_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmax_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmax_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmax_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vmaxc.h b/src/f32-vbinary/f32-vmaxc.h index afd0074bd3e..1ef1357039e 100644 --- a/src/f32-vbinary/f32-vmaxc.h +++ b/src/f32-vbinary/f32-vmaxc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmaxc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmaxc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmaxc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmaxc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if 
XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmaxc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmaxc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vmin.h b/src/f32-vbinary/f32-vmin.h index b4ccbe1e358..b7435fc1921 100644 --- a/src/f32-vbinary/f32-vmin.h +++ b/src/f32-vbinary/f32-vmin.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmin_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmin_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmin_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmin_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmin_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmin_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vminc.h b/src/f32-vbinary/f32-vminc.h index ae3b8cbfa15..80d45392c4d 100644 --- a/src/f32-vbinary/f32-vminc.h +++ b/src/f32-vbinary/f32-vminc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vminc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vminc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vminc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vminc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vminc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vminc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vmul.h b/src/f32-vbinary/f32-vmul.h index 437721a8d80..05090b84382 100644 --- a/src/f32-vbinary/f32-vmul.h +++ b/src/f32-vbinary/f32-vmul.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmul_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmul_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, 
xnn_f32_vmul_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmul_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmul_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vmulc.h b/src/f32-vbinary/f32-vmulc.h index ad45df4c31c..e1242b33274 100644 --- a/src/f32-vbinary/f32-vmulc.h +++ b/src/f32-vbinary/f32-vmulc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vmulc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmulc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vmulc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmulc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vmulc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vprelu.h b/src/f32-vbinary/f32-vprelu.h index f990211c6b9..30582c742c5 100644 --- a/src/f32-vbinary/f32-vprelu.h +++ b/src/f32-vbinary/f32-vprelu.h @@ -28,6 +28,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vprelu_ukernel__sse41_u4, 4 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vprelu_ukernel__sse41_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vprelu_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vprelu_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vprelu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vprelu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -46,11 +49,15 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__wasmrelaxedsimd_u16, 16, fals #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, 
((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vprelu_ukernel__scalar_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-vbinary/f32-vpreluc.h b/src/f32-vbinary/f32-vpreluc.h index 20ed57d1a87..7c7653222a4 100644 --- a/src/f32-vbinary/f32-vpreluc.h +++ b/src/f32-vbinary/f32-vpreluc.h @@ -28,6 +28,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vpreluc_ukernel__sse41_u4, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vpreluc_ukernel__sse41_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vpreluc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vpreluc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vpreluc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vpreluc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -46,11 +49,15 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__wasmrelaxedsimd_u16, 16, fal #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, 
xnn_f32_vpreluc_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vpreluc_ukernel__scalar_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-vbinary/f32-vrcopysignc.h b/src/f32-vbinary/f32-vrcopysignc.h index 3c2389e91ff..00c5e4aad23 100644 --- a/src/f32-vbinary/f32-vrcopysignc.h +++ b/src/f32-vbinary/f32-vrcopysignc.h @@ -29,6 +29,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrcopysignc_ukernel__avx_u8, 8 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrcopysignc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrcopysignc_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrcopysignc_ukernel__avx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrcopysignc_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vbinary/f32-vrdivc.h b/src/f32-vbinary/f32-vrdivc.h index 586ea49ede9..bf15c2a0f15 100644 --- a/src/f32-vbinary/f32-vrdivc.h +++ b/src/f32-vbinary/f32-vrdivc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__sse_u4, 4, false, float, stru XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrdivc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrdivc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrdivc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrdivc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrdivc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vrpreluc.h b/src/f32-vbinary/f32-vrpreluc.h index 041043ad8f6..69ee06b5d0e 100644 --- a/src/f32-vbinary/f32-vrpreluc.h +++ b/src/f32-vbinary/f32-vrpreluc.h @@ -28,6 +28,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrpreluc_ukernel__sse41_u4, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrpreluc_ukernel__sse41_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) 
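/*
 * The PReLU tables touched above gain u2 and u8 unroll factors for the scalar
 * and wasm variants.  As an illustrative scalar reference (not the generated
 * ukernels, and assuming the usual vbinary naming where a trailing "c" means
 * the second operand is a single broadcast value and a leading "r" swaps the
 * operands):
 */
#include <stddef.h>

static void f32_vprelu_ref(size_t n, const float* x, const float* slope, float* y) {
  for (size_t i = 0; i < n; i++) {
    y[i] = x[i] < 0.0f ? x[i] * slope[i] : x[i];  /* per-element slope */
  }
}

static void f32_vpreluc_ref(size_t n, const float* x, float slope, float* y) {
  for (size_t i = 0; i < n; i++) {
    y[i] = x[i] < 0.0f ? x[i] * slope : x[i];  /* one slope for all elements */
  }
}

static void f32_vrpreluc_ref(size_t n, const float* slope, float x, float* y) {
  /* Reversed-operand variant: the broadcast value is the input, the vector
   * carries the slopes. */
  for (size_t i = 0; i < n; i++) {
    y[i] = x < 0.0f ? x * slope[i] : x;
  }
}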
XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrpreluc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrpreluc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrpreluc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrpreluc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -46,11 +49,15 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__wasmrelaxedsimd_u16, 16, fa #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__wasm_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__wasm_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__wasm_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__wasm_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrpreluc_ukernel__scalar_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-vbinary/f32-vrsubc.h b/src/f32-vbinary/f32-vrsubc.h index 2fcda551f07..9c599ecd861 100644 --- a/src/f32-vbinary/f32-vrsubc.h +++ b/src/f32-vbinary/f32-vrsubc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__sse_u4, 4, false, float, stru XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsubc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsubc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsubc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsubc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsubc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, 
((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vsqrdiff.h b/src/f32-vbinary/f32-vsqrdiff.h index b5dbbf2df05..5b3dc78de66 100644 --- a/src/f32-vbinary/f32-vsqrdiff.h +++ b/src/f32-vbinary/f32-vsqrdiff.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrdiff_ukernel__sse_u4, 4, false, float, st XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrdiff_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrdiff_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrdiff_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrdiff_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrdiff_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vsqrdiffc.h b/src/f32-vbinary/f32-vsqrdiffc.h index 1aa224f208a..04bf92ae335 100644 --- a/src/f32-vbinary/f32-vsqrdiffc.h +++ b/src/f32-vbinary/f32-vsqrdiffc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrdiffc_ukernel__sse_u4, 4, false, float, s XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrdiffc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrdiffc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrdiffc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrdiffc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrdiffc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vsub.h b/src/f32-vbinary/f32-vsub.h index 5208f2d904d..1679792dfc6 100644 --- a/src/f32-vbinary/f32-vsub.h +++ b/src/f32-vbinary/f32-vsub.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__sse_u4, 4, false, float, struct XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsub_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsub_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsub_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, 
xnn_f32_vsub_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsub_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vbinary/f32-vsubc.h b/src/f32-vbinary/f32-vsubc.h index 477f39a5edb..cde2d65a090 100644 --- a/src/f32-vbinary/f32-vsubc.h +++ b/src/f32-vbinary/f32-vsubc.h @@ -31,6 +31,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsubc_ukernel__sse_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsubc_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsubc_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsubc_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsubc_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vclamp/f32-vclamp.h b/src/f32-vclamp/f32-vclamp.h index cdfb0ac46ba..a801e8efe4a 100644 --- a/src/f32-vclamp/f32-vclamp.h +++ b/src/f32-vclamp/f32-vclamp.h @@ -34,6 +34,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__sse_u4, 4, false, float, unio XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vclamp_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vclamp_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vclamp_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vclamp_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-velu/f32-velu.h b/src/f32-velu/f32-velu.h index c5ade61212d..7de1d9e3055 100644 --- a/src/f32-velu/f32-velu.h +++ b/src/f32-velu/f32-velu.h @@ -126,6 +126,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_velu_ukernel__avx2_rr1_p6_u56 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_velu_ukernel__avx2_rr1_p6_u64, 64, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_velu_ukernel__avx2_rr1_p6_u72, 72, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_velu_ukernel__avx2_rr1_p6_u80, 80, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) +#endif // XNN_ARCH_X86 || 
XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_u16, 16, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_u32, 32, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_u48, 48, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) diff --git a/src/f32-vgelu/f32-vgelu.h b/src/f32-vgelu/f32-vgelu.h index a22c8591bcc..8c60243f2bd 100644 --- a/src/f32-vgelu/f32-vgelu.h +++ b/src/f32-vgelu/f32-vgelu.h @@ -36,6 +36,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vhswish/f32-vhswish.h b/src/f32-vhswish/f32-vhswish.h index 440323861ba..fa31e9d5b92 100644 --- a/src/f32-vhswish/f32-vhswish.h +++ b/src/f32-vhswish/f32-vhswish.h @@ -36,6 +36,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vhswish_ukernel__avx_u8, 8, fa XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vhswish_ukernel__avx_u16, 16, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vhswish_ukernel__fma3_u8, 8, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vhswish_ukernel__fma3_u16, 16, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vhswish_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vhswish_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vlog/f32-vlog.h b/src/f32-vlog/f32-vlog.h index d479e100cb6..c3b5c18673b 100644 --- a/src/f32-vlog/f32-vlog.h +++ b/src/f32-vlog/f32-vlog.h @@ -40,6 +40,9 
@@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vlrelu/f32-vlrelu.h b/src/f32-vlrelu/f32-vlrelu.h index c7b6de61334..4c26184f8a3 100644 --- a/src/f32-vlrelu/f32-vlrelu.h +++ b/src/f32-vlrelu/f32-vlrelu.h @@ -37,6 +37,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vlrelu_ukernel__sse41_u4, 4 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vlrelu_ukernel__sse41_u8, 8, false, float, struct xnn_f32_lrelu_params, xnn_init_f32_lrelu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vlrelu_ukernel__avx_u8, 8, false, float, struct xnn_f32_lrelu_params, xnn_init_f32_lrelu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vlrelu_ukernel__avx_u16, 16, false, float, struct xnn_f32_lrelu_params, xnn_init_f32_lrelu_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlrelu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_lrelu_params, xnn_init_f32_lrelu_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlrelu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_lrelu_params, xnn_init_f32_lrelu_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vneg/f32-vneg.h b/src/f32-vneg/f32-vneg.h index 18fc3c56686..491c2491e77 100644 --- a/src/f32-vneg/f32-vneg.h +++ b/src/f32-vneg/f32-vneg.h @@ -36,6 +36,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__sse2_u12, 12, false, float, str XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) 
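/*
 * Sketch of how these declaration tables are typically consumed (hypothetical
 * consumer code, not XNNPACK source): the includer defines
 * XNN_UKERNEL_WITH_PARAMS, includes the header, and gets one table row per
 * entry.  The "#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)"
 * guards added throughout remove the AVX512F rows at compile time when AVX512F
 * support is disabled, while the arch_flags column (e.g. xnn_arch_x86_avx512f)
 * still gates the surviving rows at runtime against the detected CPU features.
 * The feature-bit value and row layout below are stand-ins, not the real ones.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct ukernel_row {
  uint64_t arch_flags;   /* CPU features the kernel needs (0 = none) */
  const char* name;      /* kernel identifier, stringified for this sketch */
  size_t batch_tile;     /* elements handled per loop iteration */
};

#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
                                datatype, params_type, init_params)           \
  { (uint64_t) (arch_flags), #ukernel, (size_t) (batch_tile) },

#define xnn_arch_x86_avx512f UINT64_C(0x100)  /* stand-in feature bit */

static const struct ukernel_row f32_vneg_rows[] = {
  /* In XNNPACK this block would be: #include "src/f32-vneg/f32-vneg.h" */
  XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, NULL)
  XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, NULL)
};
#undef XNN_UKERNEL_WITH_PARAMS

int main(void) {
  const uint64_t cpu_features = 0;  /* pretend AVX512F is absent */
  for (size_t i = 0; i < sizeof(f32_vneg_rows) / sizeof(f32_vneg_rows[0]); i++) {
    if ((f32_vneg_rows[i].arch_flags & ~cpu_features) == 0) {
      printf("usable: %s (batch tile %zu)\n", f32_vneg_rows[i].name,
             f32_vneg_rows[i].batch_tile);
    }
  }
  return 0;
}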
XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vrelu/f32-vrelu.h b/src/f32-vrelu/f32-vrelu.h index 462dd311111..d6642db19ce 100644 --- a/src/f32-vrelu/f32-vrelu.h +++ b/src/f32-vrelu/f32-vrelu.h @@ -33,6 +33,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__sse_u4, 4, false, float, struc XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__sse_u8, 8, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrelu_ukernel__avx_u8, 8, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrelu_ukernel__avx_u16, 16, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrelu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrelu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vrnd/f32-vrndd.h b/src/f32-vrnd/f32-vrndd.h index 4abe7dc7bc7..87da6ddad84 100644 --- a/src/f32-vrnd/f32-vrndd.h +++ b/src/f32-vrnd/f32-vrndd.h @@ -37,6 +37,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndd_ukernel__sse41_u4, 4, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndd_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndd_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndd_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndd_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndd_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vrnd/f32-vrndne.h b/src/f32-vrnd/f32-vrndne.h index af62e53fe59..4cf05f41dcc 100644 --- a/src/f32-vrnd/f32-vrndne.h +++ b/src/f32-vrnd/f32-vrndne.h @@ -37,6 +37,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndne_ukernel__sse41_u4, 4 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndne_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndne_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndne_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if 
XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndne_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndne_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vrnd/f32-vrndu.h b/src/f32-vrnd/f32-vrndu.h index 7cb276255da..efaeb9977bb 100644 --- a/src/f32-vrnd/f32-vrndu.h +++ b/src/f32-vrnd/f32-vrndu.h @@ -37,6 +37,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndu_ukernel__sse41_u4, 4, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndu_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndu_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndu_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vrnd/f32-vrndz.h b/src/f32-vrnd/f32-vrndz.h index 96da3c85cd9..30fa23a9f6b 100644 --- a/src/f32-vrnd/f32-vrndz.h +++ b/src/f32-vrnd/f32-vrndz.h @@ -37,6 +37,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndz_ukernel__sse41_u4, 4, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndz_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndz_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndz_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndz_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndz_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL)) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f32-vrsqrt/f32-vrsqrt.h b/src/f32-vrsqrt/f32-vrsqrt.h index a56caf31b23..2e4e841e6aa 100644 --- a/src/f32-vrsqrt/f32-vrsqrt.h +++ b/src/f32-vrsqrt/f32-vrsqrt.h @@ -41,6 +41,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8, 8, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32, 32, false, 
float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32, 32, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64, 64, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL)) diff --git a/src/f32-vsigmoid/f32-vsigmoid.h b/src/f32-vsigmoid/f32-vsigmoid.h index c1098dea0b8..106a5dc2586 100644 --- a/src/f32-vsigmoid/f32-vsigmoid.h +++ b/src/f32-vsigmoid/f32-vsigmoid.h @@ -187,6 +187,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u72, 72, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) diff --git a/src/f32-vsqr/f32-vsqr.h b/src/f32-vsqr/f32-vsqr.h index e9a4542bb72..2a876685664 100644 --- a/src/f32-vsqr/f32-vsqr.h +++ b/src/f32-vsqr/f32-vsqr.h @@ -36,6 +36,9 @@ XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__sse2_u12, 12, false, float, str XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u48, 48, false, 
float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) diff --git a/src/f32-vsqrt/f32-vsqrt.h b/src/f32-vsqrt/f32-vsqrt.h index 4696076506c..f036b68c58f 100644 --- a/src/f32-vsqrt/f32-vsqrt.h +++ b/src/f32-vsqrt/f32-vsqrt.h @@ -45,6 +45,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_rsqrt_u32, XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48, 48, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL)) diff --git a/src/f32-vtanh/f32-vtanh.h b/src/f32-vtanh/f32-vtanh.h index 0d447b7082a..7db0a58f588 100644 --- a/src/f32-vtanh/f32-vtanh.h +++ b/src/f32-vtanh/f32-vtanh.h @@ -45,6 +45,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u24, 24, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u32, 32, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u32, 32, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u48, 48, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL)) diff --git a/src/microparams-init.c b/src/microparams-init.c index 2d7398c0f84..df4014b7d7e 100644 --- a/src/microparams-init.c +++ b/src/microparams-init.c @@ -2064,13 +2064,9 @@ size_t xnn_init_qs8_mul_minmax_rndnu_neon_params( size_t xnn_init_f16_qs8_cvt_scalar_params( struct xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], xnn_float16 scale, - int8_t output_zero_point, - int8_t output_min, - int8_t output_max) + int8_t output_zero_point) { params->scalar.scale = scale; - params->scalar.output_min = output_min; - params->scalar.output_max = output_max; params->scalar.output_zero_point = output_zero_point; return 
sizeof(params->scalar); } @@ -2078,14 +2074,10 @@ size_t xnn_init_f16_qs8_cvt_scalar_params( size_t xnn_init_f32_qs8_cvt_scalar_params( struct xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - int8_t output_zero_point, - int8_t output_min, - int8_t output_max) + int8_t output_zero_point) { params->scalar.scale = scale; params->scalar.output_zero_point = (int16_t) output_zero_point; - params->scalar.output_min = output_min; - params->scalar.output_max = output_max; return sizeof(params->scalar); } @@ -2120,14 +2112,10 @@ size_t xnn_init_qu8_reduce_minmax_scalar_params( size_t xnn_init_f32_qu8_cvt_scalar_params( struct xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - uint8_t output_zero_point, - uint8_t output_min, - uint8_t output_max) + uint8_t output_zero_point) { params->scalar.scale = scale; params->scalar.output_zero_point = (int16_t) output_zero_point; - params->scalar.output_min = output_min; - params->scalar.output_max = output_max; return sizeof(params->scalar); } diff --git a/src/operator-run.c b/src/operator-run.c index ec9d1cb0a9f..9bec626c35b 100644 --- a/src/operator-run.c +++ b/src/operator-run.c @@ -1562,19 +1562,6 @@ void xnn_compute_resize_bilinear_chw( context->input_channel_stride); } -void xnn_compute_prelu( - const struct prelu_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t batch_start, - size_t batch_range) -{ - const size_t x_stride = context->x_stride; - const size_t y_stride = context->y_stride; - const void* x = (const void*) ((uintptr_t) context->x + x_stride * batch_start); - void* y = (void*) ((uintptr_t) context->y + y_stride * batch_start); - - context->ukernel(batch_range, context->n, x, x_stride, context->w, y, y_stride); -} - void xnn_compute_pad_5d( const struct pad_context context[restrict XNN_MIN_ELEMENTS(1)], size_t i, size_t j, size_t k, size_t l, size_t m) @@ -2197,7 +2184,7 @@ void xnn_compute_contiguous_reduce( context->s32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, workspace_ptr, /*params=*/&s32_f32_cvt_params); struct xnn_f32_qs8_cvt_params cvt_params; - xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point, INT8_MIN, INT8_MAX); + xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point); context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/&cvt_params); } else if (context->u32_f32_cvt_ukernel) { @@ -2206,7 +2193,7 @@ void xnn_compute_contiguous_reduce( context->u32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, workspace_ptr, /*params=*/&u32_f32_cvt_params); struct xnn_f32_qu8_cvt_params cvt_params; - xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point, 0, UINT8_MAX); + xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point); context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/&cvt_params); } else { @@ -2278,7 +2265,7 @@ void xnn_compute_discontiguous_reduce( context->s32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, workspace_ptr, /*params=*/&s32_f32_cvt_params); struct xnn_f32_qs8_cvt_params cvt_params; - 
xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point, INT8_MIN, INT8_MAX); + xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point); context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/&cvt_params); } else if (context->u32_f32_cvt_ukernel) { @@ -2287,7 +2274,7 @@ context->u32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, workspace_ptr, /*params=*/&u32_f32_cvt_params); struct xnn_f32_qu8_cvt_params cvt_params; - xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point, 0, UINT8_MAX); + xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point); context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/&cvt_params); } else { @@ -2323,7 +2310,7 @@ void xnn_compute_f16_qd8_convert( context->quantization_params[batch_index] = xnn_f16_qd8_asymmetric_quantization_params(minmax[0], minmax[1], &f16_scale); struct xnn_f16_qs8_cvt_params params; - context->init_params(&params, f16_scale, context->quantization_params[batch_index].zero_point, INT8_MIN, INT8_MAX); + context->init_params(&params, f16_scale, context->quantization_params[batch_index].zero_point); context->convert_ukernel(n, input, output, &params); } @@ -2342,7 +2329,7 @@ void xnn_compute_f32_qd8_convert( context->quantization_params[batch_index] = xnn_f32_qd8_asymmetric_quantization_params(minmax[0], minmax[1]); struct xnn_f32_qs8_cvt_params params; - context->init_params(&params, 1.0f / context->quantization_params[batch_index].inv_scale, context->quantization_params[batch_index].zero_point, INT8_MIN, INT8_MAX); + context->init_params(&params, 1.0f / context->quantization_params[batch_index].inv_scale, context->quantization_params[batch_index].zero_point); context->convert_ukernel(n, input, output, &params); } diff --git a/src/operators/binary-elementwise-nd.c b/src/operators/binary-elementwise-nd.c index 99f71611b08..f8a77c842a4 100644 --- a/src/operators/binary-elementwise-nd.c +++ b/src/operators/binary-elementwise-nd.c @@ -141,6 +141,15 @@ static const struct xnn_binary_elementwise_config* init_config( default: return NULL; } + case xnn_binary_prelu: + switch (datatype) { + case xnn_datatype_fp32: + return xnn_init_f32_vprelu_config(); + case xnn_datatype_fp16: + return xnn_init_f16_vprelu_config(); + default: + return NULL; + } default: return NULL; } diff --git a/src/operators/prelu-nc.c b/src/operators/prelu-nc.c deleted file mode 100644 index 50e96c237c0..00000000000 --- a/src/operators/prelu-nc.c +++ /dev/null @@ -1,336 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree.
- -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/allocator.h" -#include "xnnpack/cache.h" -#include "xnnpack/common.h" -#include "xnnpack/compute.h" -#include "xnnpack/config-types.h" -#include "xnnpack/config.h" -#include "xnnpack/log.h" -#include "xnnpack/math.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator-utils.h" -#include "xnnpack/operator.h" -#include "xnnpack/pack.h" -#include "xnnpack/params.h" -#include "pthreadpool.h" - -static enum xnn_status create_prelu_nc( - size_t input_channels, - size_t slope_channels, - size_t input_stride, - size_t output_stride, - const void* negative_slope, - uint32_t flags, - uint32_t log2_weights_element_size, - xnn_pack_prelu_w_fn pack_prelu_w, - enum xnn_operator_type operator_type, - const struct xnn_prelu_config* prelu_config, - xnn_code_cache_t code_cache, - xnn_weights_cache_t weights_cache, - xnn_operator_t* prelu_op_out) -{ - xnn_operator_t prelu_op = NULL; - enum xnn_status status = xnn_status_uninitialized; - - if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { - xnn_log_error("failed to setup %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(operator_type)); - return xnn_status_uninitialized; - } - - status = xnn_status_invalid_parameter; - - if (slope_channels == 0) { - xnn_log_error( - "failed to create %s operator with %zu slope channels: number of slope channels must be non-zero", - xnn_operator_type_to_string(operator_type), slope_channels); - goto error; - } - - if (input_channels != slope_channels && slope_channels != 1) { - xnn_log_error( - "failed to create %s operator with input channels of %zu: " - "slope channels (%zu) must be either equal to the number input channels or 1", - xnn_operator_type_to_string(operator_type), slope_channels, input_channels); - goto error; - } - - if (input_stride < input_channels) { - xnn_log_error( - "failed to create %s operator with input element stride of %zu: " - "stride must be at least as large as the number of input channels (%zu)", - xnn_operator_type_to_string(operator_type), input_stride, input_channels); - goto error; - } - - if (output_stride < input_channels) { - xnn_log_error( - "failed to create %s operator with output element stride of %zu: " - "stride must be at least as large as the number of input channels (%zu)", - xnn_operator_type_to_string(operator_type), output_stride, input_channels); - goto error; - } - - status = xnn_status_out_of_memory; - - prelu_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator)); - if (prelu_op == NULL) { - xnn_log_error( - "failed to allocate %zu bytes for %s operator descriptor", - sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type)); - goto error; - } - - prelu_op->input_pixel_stride = input_stride; - prelu_op->output_pixel_stride = output_stride; - - prelu_op->weights_cache = weights_cache; - - const size_t packed_weights_size = (input_channels << log2_weights_element_size) + XNN_EXTRA_BYTES; - const size_t aligned_total_weights_size = round_up_po2(packed_weights_size, XNN_ALLOCATION_ALIGNMENT); - void* weights_ptr = xnn_get_pointer_to_write_weights(prelu_op, aligned_total_weights_size, 0); - xnn_log_debug("allocated %zu bytes for packed weights in %s operator", - aligned_total_weights_size, xnn_operator_type_to_string(operator_type)); - - pack_prelu_w(input_channels, slope_channels, negative_slope, weights_ptr); - - if (use_weights_cache(prelu_op)) { - struct xnn_weights_cache_look_up_key cache_key; - cache_key.seed = 
murmur_hash3(weights_ptr, aligned_total_weights_size, /*seed=*/7); - cache_key.kernel = negative_slope; - cache_key.bias = NULL; - prelu_op->packed_weights.offset = xnn_look_up_or_insert_weights_cache( - prelu_op->weights_cache, &cache_key, weights_ptr, aligned_total_weights_size); - } - - prelu_op->channels = input_channels; - - prelu_op->type = operator_type; - prelu_op->flags = flags; - prelu_op->prelu_config = prelu_config; - - prelu_op->state = xnn_run_state_invalid; - - *prelu_op_out = prelu_op; - return xnn_status_success; - -error: - xnn_delete_operator(prelu_op); - return status; -} - - -enum xnn_status xnn_create_prelu_nc_f16( - size_t input_channels, - size_t slope_channels, - size_t input_stride, - size_t output_stride, - const void* negative_slope, - uint32_t flags, - xnn_code_cache_t code_cache, - xnn_weights_cache_t weights_cache, - xnn_operator_t* prelu_op_out) -{ - xnn_pack_prelu_w_fn pack_prelu_w = (xnn_pack_prelu_w_fn) xnn_pack_f16_prelu_w; - if (flags & XNN_FLAG_FP32_STATIC_WEIGHTS) { - pack_prelu_w = (xnn_pack_prelu_w_fn) xnn_pack_f32_to_f16_prelu_w; - } - - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_prelu_nc_f16)); - return xnn_status_unsupported_hardware; - } - - return create_prelu_nc( - input_channels, slope_channels, input_stride, - output_stride, negative_slope, flags, - /*log2_weights_element_size=*/XNN_LOG2_SIZEOF_HALF, - pack_prelu_w, - xnn_operator_type_prelu_nc_f16, - prelu_config, - /*code_cache=*/code_cache, - /*weights_cache=*/weights_cache, - prelu_op_out); -} - -enum xnn_status xnn_create_prelu_nc_f32( - size_t input_channels, - size_t slope_channels, - size_t input_stride, - size_t output_stride, - const float* negative_slope, - uint32_t flags, - xnn_code_cache_t code_cache, - xnn_weights_cache_t weights_cache, - xnn_operator_t* prelu_op_out) -{ - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - if (prelu_config == NULL) { - xnn_log_error("failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_prelu_nc_f32)); - return xnn_status_unsupported_hardware; - } - - return create_prelu_nc( - input_channels, slope_channels, input_stride, - output_stride, negative_slope, flags, - /*log2_weights_element_size=*/XNN_LOG2_SIZEOF_FLOAT, - (xnn_pack_prelu_w_fn) xnn_pack_f32_prelu_w, - xnn_operator_type_prelu_nc_f32, - prelu_config, - /*code_cache=*/code_cache, - /*weights_cache=*/weights_cache, - prelu_op_out); -} - -static enum xnn_status reshape_prelu_nc( - xnn_operator_t prelu_op, - enum xnn_operator_type expected_operator_type, - size_t batch_size, - uint32_t log2_element_size, - pthreadpool_t threadpool) -{ - if (prelu_op->type != expected_operator_type) { - xnn_log_error("failed to reshape operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(expected_operator_type), - xnn_operator_type_to_string(prelu_op->type)); - return xnn_status_invalid_parameter; - } - prelu_op->state = xnn_run_state_invalid; - - if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { - xnn_log_error("failed to reshape %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(expected_operator_type)); - return xnn_status_uninitialized; - } - - if (batch_size == 0) { - prelu_op->state = xnn_run_state_skip; - return xnn_status_success; - } - - const struct 
xnn_prelu_config* prelu = prelu_op->prelu_config; - - const size_t input_channels = prelu_op->channels; - prelu_op->context.prelu = (struct prelu_context) { - .n = input_channels << log2_element_size, - .x_stride = prelu_op->input_pixel_stride << log2_element_size, - .w = packed_weights(prelu_op), - .y_stride = prelu_op->output_pixel_stride << log2_element_size, - .ukernel = prelu->ukernel, - }; - - size_t batch_tile = batch_size; - const size_t num_threads = pthreadpool_get_threads_count(threadpool); - if (num_threads > 1) { - const size_t target_tiles_per_thread = 5; - const size_t max_batch_tile = divide_round_up(batch_size, num_threads * target_tiles_per_thread); - if (max_batch_tile < batch_tile) { - const uint32_t row_tile = prelu->row_tile; - batch_tile = min(batch_tile, divide_round_up(batch_tile, max_batch_tile * row_tile) * row_tile); - } - } - - prelu_op->compute[0].type = xnn_parallelization_type_1d_tile_1d; - prelu_op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_prelu; - prelu_op->compute[0].range[0] = batch_size; - prelu_op->compute[0].tile[0] = batch_tile; - prelu_op->state = xnn_run_state_needs_setup; - - return xnn_status_success; -} - -enum xnn_status xnn_reshape_prelu_nc_f16( - xnn_operator_t prelu_op, - size_t batch_size, - pthreadpool_t threadpool) -{ - return reshape_prelu_nc( - prelu_op, xnn_operator_type_prelu_nc_f16, - batch_size, /*log2_element_size=*/XNN_LOG2_SIZEOF_HALF, - threadpool); -} - -enum xnn_status xnn_reshape_prelu_nc_f32( - xnn_operator_t prelu_op, - size_t batch_size, - pthreadpool_t threadpool) -{ - return reshape_prelu_nc( - prelu_op, xnn_operator_type_prelu_nc_f32, - batch_size, /*log2_element_size=*/XNN_LOG2_SIZEOF_FLOAT, - threadpool); -} - -static enum xnn_status setup_prelu_nc( - xnn_operator_t prelu_op, - enum xnn_operator_type expected_operator_type, - const float* input, - float* output) -{ - if (prelu_op->type != expected_operator_type) { - xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(expected_operator_type), - xnn_operator_type_to_string(prelu_op->type)); - return xnn_status_invalid_parameter; - } - - if (prelu_op->weights_cache != NULL && !xnn_weights_cache_is_finalized(prelu_op->weights_cache)) { - xnn_log_error("failed to setup %s operator: weights cache is not finalized", - xnn_operator_type_to_string(expected_operator_type)); - return xnn_status_invalid_state; - } - - switch (prelu_op->state) { - case xnn_run_state_skip: - return xnn_status_success; - case xnn_run_state_invalid: - xnn_log_error( - "failed to setup %s operator: operator has not been reshaped yet", - xnn_operator_type_to_string(prelu_op->type)); - return xnn_status_invalid_state; - case xnn_run_state_needs_setup: - // Operator has been reshaped, but not setup, continue with setup. - case xnn_run_state_ready: - // Operator has been reshaped, and we are setting up with different pointers. 
- break; - } - - prelu_op->context.prelu.x = input; - prelu_op->context.prelu.y = output; - prelu_op->state = xnn_run_state_ready; - - return xnn_status_success; -} - -enum xnn_status xnn_setup_prelu_nc_f16( - xnn_operator_t prelu_op, - const void* input, - void* output) -{ - return setup_prelu_nc( - prelu_op, xnn_operator_type_prelu_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_prelu_nc_f32( - xnn_operator_t prelu_op, - const float* input, - float* output) -{ - return setup_prelu_nc( - prelu_op, xnn_operator_type_prelu_nc_f32, - input, output); -} diff --git a/src/operators/rope-nthc.c b/src/operators/rope-nthc.c index 5de450de70f..43d4e33e8fc 100644 --- a/src/operators/rope-nthc.c +++ b/src/operators/rope-nthc.c @@ -22,7 +22,6 @@ #include "pthreadpool.h" static enum xnn_status create_rope_nthc( - size_t max_tokens, uint32_t flags, enum xnn_operator_type operator_type, const struct xnn_cmul_config* config, @@ -39,13 +38,6 @@ static enum xnn_status create_rope_nthc( status = xnn_status_invalid_parameter; - if (max_tokens == 0) { - xnn_log_error( - "failed to create %s operator with %zu max tokens: maximum number of tokens must be non-zero", - xnn_operator_type_to_string(operator_type), max_tokens); - goto error; - } - status = xnn_status_out_of_memory; rope_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator)); @@ -56,8 +48,6 @@ static enum xnn_status create_rope_nthc( goto error; } - rope_op->max_tokens = max_tokens; - rope_op->type = operator_type; rope_op->flags = flags; rope_op->cmul_config = config; @@ -73,7 +63,6 @@ static enum xnn_status create_rope_nthc( } enum xnn_status xnn_create_rope_nthc_f16( - size_t max_tokens, uint32_t flags, xnn_operator_t* rope_op_out) { @@ -85,7 +74,6 @@ enum xnn_status xnn_create_rope_nthc_f16( } return create_rope_nthc( - max_tokens, flags, xnn_operator_type_rope_nthc_f16, config, @@ -93,7 +81,6 @@ enum xnn_status xnn_create_rope_nthc_f16( } enum xnn_status xnn_create_rope_nthc_f32( - size_t max_tokens, uint32_t flags, xnn_operator_t* rope_op_out) { @@ -105,7 +92,6 @@ enum xnn_status xnn_create_rope_nthc_f32( } return create_rope_nthc( - max_tokens, flags, xnn_operator_type_rope_nthc_f32, config, @@ -138,13 +124,6 @@ static enum xnn_status reshape_rope_nthc( return xnn_status_invalid_parameter; } - if (tokens > rope_op->max_tokens) { - xnn_log_error( - "failed to reshape %s operator with %zu tokens: number of tokens can not exceed the maximum %zu", - xnn_operator_type_to_string(rope_op->type), tokens, rope_op->max_tokens); - return xnn_status_invalid_parameter; - } - if (heads == 0) { xnn_log_error( "failed to reshape %s operator with %zu heads: number of heads must be non-zero", diff --git a/src/operators/softmax-nc.c b/src/operators/softmax-nc.c index 75e57edc9c9..9e9fbb49101 100644 --- a/src/operators/softmax-nc.c +++ b/src/operators/softmax-nc.c @@ -21,6 +21,7 @@ #include "xnnpack/config-types.h" #include "xnnpack/config.h" #include "xnnpack/log.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/operator-type.h" diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c index 2270934695a..a0d9764227b 100644 --- a/src/operators/unary-elementwise-nc.c +++ b/src/operators/unary-elementwise-nc.c @@ -18,6 +18,7 @@ #include "xnnpack/config-types.h" #include "xnnpack/config.h" #include "xnnpack/log.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/operator-type.h" @@ -516,8 +517,6 
@@ enum xnn_status xnn_create_convert_nc_f32_f16( enum xnn_status xnn_create_convert_nc_f32_qs8( float output_scale, int8_t output_zero_point, - int8_t output_min, - int8_t output_max, uint32_t flags, xnn_operator_t* convert_op_out) { @@ -528,19 +527,12 @@ enum xnn_status xnn_create_convert_nc_f32_qs8( return xnn_status_invalid_parameter; } - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qs8), output_min, output_max); - return xnn_status_invalid_parameter; - } - const struct xnn_unary_elementwise_config* f32_to_qs8_cvt_config = xnn_init_f32_to_qs8_cvt_config(); struct xnn_f32_qs8_cvt_params params; if XNN_LIKELY(f32_to_qs8_cvt_config != NULL) { assert(f32_to_qs8_cvt_config->init.f32_qs8_cvt != NULL); - f32_to_qs8_cvt_config->init.f32_qs8_cvt(&params, 1.0f / output_scale, output_zero_point, output_min, output_max); + f32_to_qs8_cvt_config->init.f32_qs8_cvt(&params, 1.0f / output_scale, output_zero_point); } return create_unary_elementwise_nc( @@ -619,8 +611,6 @@ enum xnn_status xnn_create_convert_nc_f32_qp8(uint32_t flags, enum xnn_status xnn_create_convert_nc_f32_qu8( float output_scale, uint8_t output_zero_point, - uint8_t output_min, - uint8_t output_max, uint32_t flags, xnn_operator_t* convert_op_out) { @@ -631,19 +621,12 @@ enum xnn_status xnn_create_convert_nc_f32_qu8( return xnn_status_invalid_parameter; } - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qu8), output_min, output_max); - return xnn_status_invalid_parameter; - } - const struct xnn_unary_elementwise_config* f32_to_qu8_cvt_config = xnn_init_f32_to_qu8_cvt_config(); struct xnn_f32_qu8_cvt_params params; if XNN_LIKELY(f32_to_qu8_cvt_config != NULL) { assert(f32_to_qu8_cvt_config->init.f32_qu8_cvt != NULL); - f32_to_qu8_cvt_config->init.f32_qu8_cvt(&params, 1.0f / output_scale, output_zero_point, output_min, output_max); + f32_to_qu8_cvt_config->init.f32_qu8_cvt(&params, 1.0f / output_scale, output_zero_point); } return create_unary_elementwise_nc( @@ -3420,7 +3403,7 @@ enum xnn_status xnn_run_convert_nc_f32_qs8( struct xnn_f32_qs8_cvt_params params; if XNN_LIKELY(f32_to_qs8_cvt_config != NULL) { assert(f32_to_qs8_cvt_config->init.f32_qs8_cvt != NULL); - f32_to_qs8_cvt_config->init.f32_qs8_cvt(&params, 1.0f / output_scale, output_zero_point, INT8_MIN, INT8_MAX); + f32_to_qs8_cvt_config->init.f32_qs8_cvt(&params, 1.0f / output_scale, output_zero_point); } return run_unary_elementwise_nc( @@ -3458,7 +3441,7 @@ enum xnn_status xnn_run_convert_nc_f32_qu8( struct xnn_f32_qu8_cvt_params params; if XNN_LIKELY(f32_to_qu8_cvt_config != NULL) { assert(f32_to_qu8_cvt_config->init.f32_qu8_cvt != NULL); - f32_to_qu8_cvt_config->init.f32_qu8_cvt(&params, 1.0f / output_scale, output_zero_point, 0, UINT8_MAX); + f32_to_qu8_cvt_config->init.f32_qu8_cvt(&params, 1.0f / output_scale, output_zero_point); } return run_unary_elementwise_nc( diff --git a/src/packing.cc b/src/packing.cc index 39f81b97dbe..19cf4e357af 100644 --- a/src/packing.cc +++ b/src/packing.cc @@ -999,21 +999,11 @@ void xnn_pack_f32_qc4w_gemm_goi_w( } while (--g != 0); } -void xnn_pack_f32_gemm_gio_w( - size_t g, - size_t nc, - size_t kc, - size_t nr, - size_t kr, - size_t sr, - size_t k_stride, - const float* k, - const
float* b, - const void* scale, - float* packed_weights, - size_t extra_bytes, - const void* params) -{ +void xnn_pack_f32_gemm_gio_w(size_t g, size_t nc, size_t kc, size_t nr, + size_t kr, size_t sr, size_t k_stride, + const float* k, const float* b, const void* scale, + float* packed_weights, size_t extra_bytes, + const void* params) { assert(g != 0); assert(nr >= sr); assert(k != nullptr); @@ -1026,20 +1016,39 @@ void xnn_pack_f32_gemm_gio_w( copy_bias(b, nr_block_start, nr_block_size, packed_weights); packed_weights += nr; - for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) { - for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { - const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1)); - for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { - const size_t kc_idx = kc_begin + kr_block_offset; - if (kc_idx < kc) { - packed_weights[kr_block_offset] = k[kc_idx * k_stride + nr_block_start + nr_block_offset]; + // Special case for trivial packings. + if (skr == 1) { + for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start++) { + const size_t kc_idx = round_down_po2(kr_block_start, skr); + if (kc_idx < kc) { + std::copy_n(&k[kc_idx * k_stride + nr_block_start], nr_block_size, + packed_weights); + } + packed_weights += nr; + } + + } else { + for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); + kr_block_start += kr) { + for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; + nr_block_offset++) { + const size_t kc_begin = + round_down_po2(kr_block_start, skr) + + ((kr_block_start + nr_block_offset * kr) & (skr - 1)); + for (size_t kr_block_offset = 0; kr_block_offset < kr; + kr_block_offset++) { + const size_t kc_idx = kc_begin + kr_block_offset; + if (kc_idx < kc) { + packed_weights[kr_block_offset] = + k[kc_idx * k_stride + nr_block_start + nr_block_offset]; + } } + packed_weights += kr; } - packed_weights += kr; + packed_weights += (nr - nr_block_size) * kr; } - packed_weights += (nr - nr_block_size) * kr; } - packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes); + packed_weights = (float*)((uintptr_t)packed_weights + extra_bytes); } k += nc * kc; if XNN_UNPREDICTABLE(b != nullptr) { @@ -1048,21 +1057,11 @@ void xnn_pack_f32_gemm_gio_w( } while (--g != 0); } -void xnn_pack_f16_gemm_gio_w( - size_t g, - size_t nc, - size_t kc, - size_t nr, - size_t kr, - size_t sr, - size_t k_stride, - const uint16_t* k, - const uint16_t* b, - const void* scale, - uint16_t* packed_weights, - size_t extra_bytes, - const void* params) -{ +void xnn_pack_f16_gemm_gio_w(size_t g, size_t nc, size_t kc, size_t nr, + size_t kr, size_t sr, size_t k_stride, + const uint16_t* k, const uint16_t* b, + const void* scale, uint16_t* packed_weights, + size_t extra_bytes, const void* params) { assert(g != 0); assert(nr >= sr); assert(k != nullptr); @@ -1075,20 +1074,39 @@ void xnn_pack_f16_gemm_gio_w( copy_bias(b, nr_block_start, nr_block_size, packed_weights); packed_weights += nr; - for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) { - for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { - const size_t kc_begin = round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & (skr - 1)); - for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { - const size_t kc_idx = kc_begin + 
kr_block_offset; - if (kc_idx < kc) { - packed_weights[kr_block_offset] = k[kc_idx * k_stride + nr_block_start + nr_block_offset]; + // Special case for trivial packings. + if (skr == 1) { + for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start++) { + const size_t kc_idx = round_down_po2(kr_block_start, skr); + if (kc_idx < kc) { + std::copy_n(&k[kc_idx * k_stride + nr_block_start], nr_block_size, + packed_weights); + } + packed_weights += nr; + } + + } else { + for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); + kr_block_start += kr) { + for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; + nr_block_offset++) { + const size_t kc_begin = + round_down_po2(kr_block_start, skr) + + ((kr_block_start + nr_block_offset * kr) & (skr - 1)); + for (size_t kr_block_offset = 0; kr_block_offset < kr; + kr_block_offset++) { + const size_t kc_idx = kc_begin + kr_block_offset; + if (kc_idx < kc) { + packed_weights[kr_block_offset] = + k[kc_idx * k_stride + nr_block_start + nr_block_offset]; + } } + packed_weights += kr; } - packed_weights += kr; + packed_weights += (nr - nr_block_size) * kr; } - packed_weights += (nr - nr_block_size) * kr; } - packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes); + packed_weights = (uint16_t*)((uintptr_t)packed_weights + extra_bytes); } k += nc * kc; if XNN_UNPREDICTABLE(b != nullptr) { @@ -4918,66 +4936,6 @@ void xnn_pack_f32_to_f16_vmulcaddc_w( } } -void xnn_pack_f32_prelu_w( - size_t input_channels, - size_t slope_channels, - const float* s, - float* packed_weights) -{ - assert(s != nullptr); - assert(packed_weights != nullptr); - assert(slope_channels == input_channels || slope_channels == 1); - - if (slope_channels == 1) { - do { - *packed_weights++ = *s; - } while (--input_channels != 0); - } else { - memcpy(packed_weights, s, slope_channels * sizeof(float)); - } -} - -void xnn_pack_f16_prelu_w( - size_t input_channels, - size_t slope_channels, - const uint16_t* s, - uint16_t* packed_weights) -{ - assert(s != nullptr); - assert(packed_weights != nullptr); - assert(slope_channels == input_channels || slope_channels == 1); - - if (slope_channels == 1) { - do { - *packed_weights++ = *s; - } while (--input_channels != 0); - } else { - memcpy(packed_weights, s, slope_channels * sizeof(uint16_t)); - } -} - -void xnn_pack_f32_to_f16_prelu_w( - size_t input_channels, - size_t slope_channels, - const float* s, - xnn_float16* packed_weights) -{ - assert(s != nullptr); - assert(packed_weights != nullptr); - assert(slope_channels == input_channels || slope_channels == 1); - - if (slope_channels == 1) { - xnn_float16 v = xnn_float16_from_float(*s); - for (size_t i = 0; i < input_channels; ++i) { - packed_weights[i] = v; - } - } else { - do { - *packed_weights++ = xnn_float16_from_float(*s++); - } while (--input_channels != 0); - } -} - void xnn_analyze_f32_spmm_w( size_t group_output_channels, size_t group_input_channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c index e6029459e8b..85821458413 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-imagic.c index d7928ea6468..80cc8e91903 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c index d7937594ffe..56500d0f7e9 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c index ec59cdca556..3a9906a607b 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c index 7731e2f25e6..50b4797ff28 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-imagic.c index d3c2c5521cd..6f1849d5446 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c index 24bf861851f..fa6ca75f77c 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c index 32f23a8dfba..d29196ad15e 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c index 034d45ac4c7..36f998d1f28 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-imagic.c index 2b851a0cbdb..47aa2941df2 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c index 5dad72ff47d..310d6c783e7 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c index b49a44b2c28..9db83f1952a 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c index a9dc7ab79aa..c78c217ddf1 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-imagic.c index f0a9de4edab..57ae6e8e40f 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c index 4a8871c3381..342843bf218 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c index cf4e8b04060..294ab54efdf 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-rndnu-scalar.c b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-rndnu-scalar.c index a9229e75b65..0619458007d 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-rndnu-scalar.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_rndnu_ukernel_9p1c__scalar( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c index c3c235de507..5308c1cfcdd 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-imagic.c index f50ad5aa52b..ea8c111e9ad 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c index 225cbbd3d05..6df79d4d0d9 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c index 464a40ef827..e25551b317a 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-rndnu-scalar.c b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-rndnu-scalar.c index 0bf0c224060..115300ce509 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-rndnu-scalar.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_dwconv_minmax_rndnu_ukernel_9p2c__scalar( diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c index c310670400c..ba767d9b162 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-imagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-imagic.c index 9c66853c11b..54145636610 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c index 85a538f1e2c..546582e8cb7 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c index e3b1e6315c2..670aded74fd 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic( size_t channels, diff --git a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-rndnu-scalar.c b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-rndnu-scalar.c index 65e67b5810e..bafbd4d6f23 100644 --- a/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-rndnu-scalar.c +++ b/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_dwconv_minmax_rndnu_ukernel_9p4c__scalar( size_t channels, diff --git a/src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h b/src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h index e595ae13676..87381179a7c 100644 --- a/src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h +++ b/src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h @@ -91,13 +91,16 @@ XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9 XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, 5, 5, 5, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, 5, 5, 5, 32, 16, 1, int8_t, void, 
int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, 6, 6, 7, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, 6, 6, 7, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, 8, 8, 9, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, 8, 8, 9, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_MULTIPASS(0, xnn_qs8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) diff --git a/src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h b/src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h index f3773aad7bf..bd63b39ce4b 100644 --- a/src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h +++ b/src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h @@ -72,11 +72,14 @@ XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c_ XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, 32, false, 32, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_UNIPASS(0, 
xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, 8, false, 8, 9, int8_t, void, union xnn_qs8_conv_minmax_params, xnn_init_qs8_conv_minmax_fp32_scalar_params) diff --git a/src/qs8-dwconv/unipass-scalar.c.in b/src/qs8-dwconv/unipass-scalar.c.in index 6e7d0370ac1..a744988b8e4 100644 --- a/src/qs8-dwconv/unipass-scalar.c.in +++ b/src/qs8-dwconv/unipass-scalar.c.in @@ -10,9 +10,13 @@ $assert DATATYPE in ["QC8", "QS8", "QU8"] #include $if VARIANT == "LRINTF": #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" $if CHANNEL_TILE % 4 != 0: #include "xnnpack/unaligned.h" diff --git a/src/qs8-f32-vcvt/qs8-f32-vcvt.h b/src/qs8-f32-vcvt/qs8-f32-vcvt.h index e7de4a98713..0df6e93b83f 100644 --- a/src/qs8-f32-vcvt/qs8-f32-vcvt.h +++ b/src/qs8-f32-vcvt/qs8-f32-vcvt.h @@ -40,11 +40,14 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_f32_vcvt_ukernel__avx2_u8 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_f32_vcvt_ukernel__avx2_u16, 16, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_f32_vcvt_ukernel__avx2_u24, 24, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_f32_vcvt_ukernel__avx2_u32, 32, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_f32_vcvt_ukernel__avx512skx_u16, 16, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_f32_vcvt_ukernel__avx512skx_u32, 32, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_f32_vcvt_ukernel__avx512skx_u48, 48, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_f32_vcvt_ukernel__avx512skx_u64, 64, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_f32_vcvt_ukernel__wasmsimd_u8, 8, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) diff --git a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni-prfm.c b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni-prfm.c index cfe2ec2166a..7e36fddd660 100644 --- a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni-prfm.c +++ b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -187,190 +187,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = 
_mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, 
*(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 
+= 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -587,190 +443,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = 
_mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = 
_mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) 
unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni.c b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni.c index 67091f212a8..c12845e16c8 100644 --- a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni.c +++ b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avx256vnni.c @@ -30,7 +30,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -138,190 +138,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = 
_mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, 
*(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -490,190 +346,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = 
_mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, 
*(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 
1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni-prfm.c b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni-prfm.c index 6e2926d419d..e1bbd2b1556 100644 --- a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni-prfm.c +++ b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -187,190 +187,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 
= _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 
= _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) 
unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -587,190 +443,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, 
*(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, 
*(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); 
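Note on the rewrite above (and repeated in the remaining qs8-packw files below): the old KC-remainder path built each vector with chains of _mm256_insert_epi32/epi16/epi8 gated on the bits of k and three separate pointer bumps, while the new path loads 8 bytes per row unconditionally with unaligned_load_u64, blends four rows into the four 64-bit lanes of a __m256i, masks off the bytes past k, and advances each pointer by k in one step. Because the 8-byte loads can read up to 7 bytes past the last valid weight, the kernels are now annotated XNN_OOB_READS; the mask keeps the over-read bytes out of the packed data and the dpbusd byte-sum accumulation. The following is a rough standalone sketch of that idea, not taken from the generated sources: it assumes AVX2, uses a memcpy-based load_u64 in place of XNNPACK's unaligned_load_u64 helper, and the pack4_remainder helper name is purely illustrative.

/*
 * Sketch: pack the k-byte remainders (1 <= k <= 7) of four rows into one
 * __m256i, one 64-bit lane per row, with the bytes at positions k..7 of
 * each lane zeroed. Assumes AVX2; load_u64/pack4_remainder are hypothetical
 * names used only for this example.
 */
#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static inline uint64_t load_u64(const void* p) {
  uint64_t v;
  memcpy(&v, p, sizeof(v));  /* unaligned 8-byte load; may read past the valid k bytes */
  return v;
}

static inline __m256i pack4_remainder(const int8_t* w0, const int8_t* w1,
                                      const int8_t* w2, const int8_t* w3,
                                      size_t k) {
  /* All-ones in every 64-bit lane, shifted right so only the low k bytes survive. */
  const __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * 8);
  __m256i v = _mm256_set1_epi64x((int64_t) load_u64(w0));                        /* lane 0 */
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w1)), 0x0C);   /* lane 1 */
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w2)), 0x30);   /* lane 2 */
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w3)), 0xC0);   /* lane 3 */
  return _mm256_and_si256(v, vmask);  /* discard the over-read bytes */
}

In the kernels the masked vectors then go straight into _mm256_dpbusd_epi32 / _mm256_dpbusd_avx_epi32 against the all-ones vector vone, so the accumulated byte sums of the remainder columns only ever see the valid k bytes of each row.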
diff --git a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni.c b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni.c index 8ddf82e3b88..1891bbff0f8 100644 --- a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni.c +++ b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-goi-avxvnni.c @@ -30,7 +30,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -138,190 +138,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = 
_mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = 
_mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -490,190 +346,46 @@ void xnn_qs8_packw_gemm_goi_ukernel_x16c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = 
_mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, 
*(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = 
_mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni-prfm.c b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni-prfm.c index f3f8d2672b4..4a60755892c 100644 --- a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni-prfm.c +++ b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -130,108 +130,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, 
*(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -360,108 +280,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - 
v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 
+= 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni.c b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni.c index 7919e5d3561..c25d8598d6c 100644 --- a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni.c +++ b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avx256vnni.c @@ -30,7 +30,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -105,108 +105,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const 
int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -311,108 +231,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const 
int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) 
unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni-prfm.c b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni-prfm.c index 687273d457f..44354081b3f 100644 --- a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni-prfm.c +++ b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -130,108 +130,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t 
*)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -360,108 +280,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const 
int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 
0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni.c b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni.c index d7b555ff552..acbbdfdc876 100644 --- a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni.c +++ b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni.c @@ -30,7 +30,7 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -105,108 +105,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - 
v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -311,108 +231,28 @@ void xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 
= _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c new file mode 100644 index 00000000000..881ea5af2a2 --- /dev/null +++ 
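The hunks above rewrite the KC-remainder path (k = 1..7) of the x8c8 AVX-VNNI packing kernels. Instead of branching on the low bits of k and pulling the tail in with _mm256_insert_epi32/epi16/epi8, each row is now fetched with a single unaligned 8-byte load, four rows are blended into one 256-bit register, and a mask built by shifting all-ones right by (8 - k) bytes per 64-bit lane clears whatever was read past the k valid weights; the row pointers then advance by k in one step instead of by 4/2/1. Because the 8-byte loads may touch bytes beyond the row, the kernel signatures gain the XNN_OOB_READS annotation. A minimal sketch of the per-row building block, with load_u64() standing in for XNNPACK's unaligned_load_u64() and the helper name purely illustrative:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Unaligned 8-byte load; stands in for XNNPACK's unaligned_load_u64().
static inline uint64_t load_u64(const void* address) {
  uint64_t value;
  memcpy(&value, address, sizeof(value));
  return value;
}

// Gather the 1..7 remaining weights of four rows into one __m256i,
// zeroing the bytes that were over-read past each row's k valid weights.
static inline __m256i load_remainder_x4(const int8_t* w0, const int8_t* w1,
                                        const int8_t* w2, const int8_t* w3,
                                        size_t k) {
  // All-ones, shifted right by (8 - k) bytes in every 64-bit lane:
  // keeps the low k bytes of each lane, clears the over-read tail.
  const __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * 8);
  // Broadcast each row's 8 bytes, then blend rows into 64-bit lanes 0..3.
  __m256i v = _mm256_set1_epi64x((int64_t) load_u64(w0));
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w1)), 0x0C);
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w2)), 0x30);
  v = _mm256_blend_epi32(v, _mm256_set1_epi64x((int64_t) load_u64(w3)), 0xC0);
  return _mm256_and_si256(v, vmask);
}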
b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c @@ -0,0 +1,370 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/packw.h" + + +void xnn_qs8_packw_gemm_goi_ukernel_x8c8__wasmrelaxedsimd( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) XNN_OOB_READS +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 8); + assert(kr == 8); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + const v128_t vone = wasm_i8x16_splat(1); + const v128_t vzero = wasm_i32x4_splat(0); + XNN_FORCE_REALIZATION(vone); + XNN_FORCE_REALIZATION(vzero); + int8_t* out = (int8_t*) packed_weights; + const uint32_t* b = (const uint32_t*) bias; + const uint32_t izp = (uint32_t) (params ? (((const struct xnn_qs8_packw_params*) params)->input_zero_point + 0): 0); + v128_t vzeropoint = wasm_i32x4_splat((int32_t) izp); + + do { + // NC main loop multiple of 8 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 8; n -= 8) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + const v128_t vb0 = wasm_v128_load(b + 0); + wasm_v128_store(out + 0, vb0); + const v128_t vb1 = wasm_v128_load(b + 4); + wasm_v128_store(out + 16, vb1); + b += 8; + } else { + wasm_v128_store(out + 0, vzero); + wasm_v128_store(out + 16, vzero); + } + out += 8 * sizeof(uint32_t); + + const int8_t* w1 = w0 + kc; + const int8_t* w2 = w1 + kc; + const int8_t* w3 = w2 + kc; + const int8_t* w4 = w3 + kc; + const int8_t* w5 = w4 + kc; + const int8_t* w6 = w5 + kc; + const int8_t* w7 = w6 + kc; + + v128_t vacc01 = wasm_i32x4_splat(0); + v128_t vacc23 = wasm_i32x4_splat(0); + v128_t vacc45 = wasm_i32x4_splat(0); + v128_t vacc67 = wasm_i32x4_splat(0); + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 16; k -= 16) { + v128_t v0_01 = wasm_v128_load(w0); + v128_t v1_01 = wasm_v128_load(w1); + v128_t v2_01 = wasm_v128_load(w2); + v128_t v3_01 = wasm_v128_load(w3); + v128_t v4_01 = wasm_v128_load(w4); + v128_t v5_01 = wasm_v128_load(w5); + v128_t v6_01 = wasm_v128_load(w6); + v128_t v7_01 = wasm_v128_load(w7); + + v128_t v01_0 = wasm_i64x2_shuffle(v0_01, v1_01, 0, 2); + v128_t v01_1 = wasm_i64x2_shuffle(v0_01, v1_01, 1, 3); + v128_t v23_0 = wasm_i64x2_shuffle(v2_01, v3_01, 0, 2); + v128_t v23_1 = wasm_i64x2_shuffle(v2_01, v3_01, 1, 3); + v128_t v45_0 = wasm_i64x2_shuffle(v4_01, v5_01, 0, 2); + v128_t v45_1 = wasm_i64x2_shuffle(v4_01, v5_01, 1, 3); + v128_t v67_0 = wasm_i64x2_shuffle(v6_01, v7_01, 0, 2); + v128_t v67_1 = wasm_i64x2_shuffle(v6_01, v7_01, 1, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01_0, vone, vacc01); + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01_1, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23_0, vone, vacc23); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23_1, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45_0, vone, vacc45); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45_1, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67_0, vone, vacc67); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67_1, vone, 
vacc67); + + wasm_v128_store(out + 0, v01_0); + wasm_v128_store(out + 16, v23_0); + wasm_v128_store(out + 32, v45_0); + wasm_v128_store(out + 48, v67_0); + + wasm_v128_store(out + 64, v01_1); + wasm_v128_store(out + 80, v23_1); + wasm_v128_store(out + 96, v45_1); + wasm_v128_store(out + 112, v67_1); + + w0 += 16; + w1 += 16; + w2 += 16; + w3 += 16; + w4 += 16; + w5 += 16; + w6 += 16; + w7 += 16; + out += 128; + } + + for (; k >= 8; k -= 8) { + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + const v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + const v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + const v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + const v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + out += 64; + } + + // KC remainder 1..KR-1 + if (k != 0) { + assert(k >= 1 && k <= 7); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (8 - k) * sizeof(int8_t) * 8); + + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + v01 = wasm_v128_and(v01, vmask); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + v23 = wasm_v128_and(v23, vmask); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + v45 = wasm_v128_and(v45, vmask); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + v67 = wasm_v128_and(v67, vmask); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + out += 64; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + 
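In the new wasmrelaxedsimd kernel above, the KR=8 layout pairs rows two at a time: in the 8-at-a-time step each row's next 8 weights are broadcast into both 64-bit lanes with wasm_v128_load64_splat, and wasm_i64x2_shuffle(v0, v1, 0, 3) keeps row n in the low lane and row n+1 in the high lane of the stored 128-bit group. A sketch of that pairing step (helper name illustrative, assumes at least 8 readable bytes per row):

#include <stdint.h>
#include <wasm_simd128.h>

// Pair the next 8 weights of two rows into one 128-bit group:
// low 64-bit lane = row0 bytes, high 64-bit lane = row1 bytes.
static inline v128_t pack_row_pair(const int8_t* row0, const int8_t* row1) {
  const v128_t v0 = wasm_v128_load64_splat(row0);  // row0 bytes in both lanes
  const v128_t v1 = wasm_v128_load64_splat(row1);  // row1 bytes in both lanes
  // Shuffle indices 0..1 select from v0, 2..3 from v1; picking 0 and 3
  // keeps row0 low and row1 high.
  return wasm_i64x2_shuffle(v0, v1, 0, 3);
}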
wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w7; + } + + // NC remainder (1..7) + if XNN_UNLIKELY(n != 0) { + assert(n >= 1 && n <= 7); + + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((uint32_t*) out) = *b++; + out += sizeof(uint32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((uint32_t*) out) = 0; + out += sizeof(uint32_t); + } while (--nb != 0); + } + out += (8 - n) * sizeof(uint32_t); + + const int8_t* w1 = w0 + kc; + if XNN_UNPREDICTABLE(n < 2) { + w1 = w0; + } + const int8_t* w2 = w1 + kc; + if XNN_UNPREDICTABLE(n <= 2) { + w2 = w1; + } + const int8_t* w3 = w2 + kc; + if XNN_UNPREDICTABLE(n < 4) { + w3 = w2; + } + const int8_t* w4 = w3 + kc; + if XNN_UNPREDICTABLE(n <= 4) { + w4 = w3; + } + const int8_t* w5 = w4 + kc; + if XNN_UNPREDICTABLE(n < 6) { + w5 = w4; + } + const int8_t* w6 = w5 + kc; + if XNN_UNPREDICTABLE(n <= 6) { + w6 = w5; + } + const int8_t* w7 = w6 + kc; + if XNN_UNPREDICTABLE(n < 8) { + w7 = w6; + } + + v128_t vacc01 = wasm_i32x4_splat(0); + v128_t vacc23 = wasm_i32x4_splat(0); + v128_t vacc45 = wasm_i32x4_splat(0); + v128_t vacc67 = wasm_i32x4_splat(0); + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + const v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + const v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + const v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + const v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + out += 64; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (8 - k) * sizeof(int8_t) * 8); + + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + v01 = wasm_v128_and(v01, vmask); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + v23 = wasm_v128_and(v23, vmask); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + v45 = wasm_v128_and(v45, vmask); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + v67 = wasm_v128_and(v67, vmask); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); 
+ vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + out += 64; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/qs8-packw/qs8-packw.h b/src/qs8-packw/qs8-packw.h index 21088b9e763..24fc85abf05 100644 --- a/src/qs8-packw/qs8-packw.h +++ b/src/qs8-packw/qs8-packw.h @@ -38,3 +38,8 @@ XNN_QS8_UKERNEL(xnn_arch_x86_avx256vnni, xnn_qs8_packw_gemm_goi_ukernel_x16c8__a XNN_QS8_UKERNEL(xnn_arch_x86_avx256vnni, xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni, 16, 8, 1, 8, 1, 128) XNN_QS8_UKERNEL(xnn_arch_x86_avx256vnni, xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm, 16, 8, 1, 8, 1, 128) #endif + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_QS8_UKERNEL(0, xnn_qs8_packw_gemm_goi_ukernel_x8c8__wasmrelaxedsimd, 8, 8, 1, 8, 1, 0) +XNN_QS8_UKERNEL(0, xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__wasmrelaxedsimd, 8, 8, 1, 8, 1, 128) +#endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-fmagic.c index 0c68f57a3a6..94407ec9dcb 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-imagic.c index 1bcc3f67e72..e14f5a01bf8 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
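The new x8c8 wasmrelaxedsimd packers (registered just above in qs8-packw.h under XNN_ARCH_WASMRELAXEDSIMD) also fold the input-zero-point correction into the packed bias: while copying weights they accumulate each row's column sum with wasm_i32x4_relaxed_dot_i8x16_i7x16_add against a vector of ones, multiply the sums by the input zero point, and subtract the result from the bias words written at the start of the block, so the matching GEMM can skip the per-row zero-point term at run time. A scalar reference of that arithmetic, with the function name and layout illustrative rather than XNNPACK API:

#include <stddef.h>
#include <stdint.h>

// bias'[n] = bias[n] - input_zero_point * sum_k weights[n][k]
static void adjust_packed_bias(int32_t* packed_bias,   // one entry per row
                               const int8_t* weights,  // rows x kc, row-major
                               size_t rows, size_t kc,
                               int32_t input_zero_point) {
  for (size_t n = 0; n < rows; n++) {
    int32_t ksum = 0;
    for (size_t k = 0; k < kc; k++) {
      ksum += (int32_t) weights[n * kc + k];  // column sum of row n
    }
    packed_bias[n] -= input_zero_point * ksum;
  }
}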
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-lrintf.c index 0430a0900b7..f97e7b295ba 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-wasm-fmagic.c index b18c4ad52b8..c1c4dbe9d39 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-fmagic.c index cc41ca8cd2d..06a39ff90e8 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-imagic.c index c762f870f3d..a0ff8a51dec 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-lrintf.c index addfe971325..ce932cc4423 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-wasm-fmagic.c index d47df6e330d..6d78543d04c 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-fmagic.c index a810c5da499..b8c4d56a4d2 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-imagic.c index d4c4c571d15..499f670fb42 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-lrintf.c index e43c03815f1..0c8925e1854 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-wasm-fmagic.c index 43d3632997a..0a0fbaf8b04 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p1c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p1c-minmax-fp32-scalar-fmagic.c index b47e330f6fd..ca03406b25f 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p1c__scalar_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-imagic.c index 5cf82696e6c..71afee0f507 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-lrintf.c index 68a72856c97..c1ad1d55718 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_lrintf( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-wasm-fmagic.c index bf7c0aae4e8..3065b899216 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__wasm_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p2c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p2c-minmax-fp32-scalar-imagic.c index 1fcf3acecfc..64a064d3c71 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_4p2c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-fmagic.c index 5030009c408..2fc424d2acc 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-imagic.c index e0971ab002c..8dfa3021075 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-lrintf.c index d8ebc7fffec..07cb8cbe367 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-wasm-fmagic.c index 97a16c1b510..d132f26b328 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-fmagic.c index 9b0a23e3d71..db08261c7ff 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-imagic.c index 1a00cc7b501..7b22beb58d3 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-lrintf.c index a14d6fece87..0c77229c630 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-wasm-fmagic.c index 65ba30f7590..1f648aab110 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic( diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-fmagic.c index 570c2a0b0c6..e126d0484a4 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-imagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-imagic.c index 708fd85cbdc..a67aaba7bf4 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-imagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
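The qs8-qc8w dwconv hunks in this stretch all make the same edit to the generated sources: alongside the existing headers they add "xnnpack/common.h" and "xnnpack/microparams.h" plus the standard integer/size headers, so each translation unit names the types and params structs it uses instead of relying on transitive includes. A representative include block after the change is sketched below; the quoted xnnpack headers come straight from the hunks, while the exact angle-bracket system headers are an assumption:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include "xnnpack/common.h"
#include "xnnpack/dwconv.h"
#include "xnnpack/math.h"
#include "xnnpack/microparams.h"
#include "xnnpack/unaligned.h"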
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-lrintf.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-lrintf.c index 428bcb6b067..ba90d727b0f 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf( size_t channels, diff --git a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-wasm-fmagic.c b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-wasm-fmagic.c index 877d026b07b..5acc1a34a33 100644 --- a/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic( size_t channels, diff --git a/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h index 2787de0b4a4..429fc86b505 100644 --- a/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h +++ b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h @@ -127,13 +127,16 @@ XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_ XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_add16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpmovsx, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s16r__avx2_mul16_vpunpck, 8, 8, 9, 32, 16, 16, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, 5, 5, 5, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, 5, 5, 5, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, 6, 6, 7, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) 
XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, 6, 6, 7, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, 8, 8, 9, 16, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, 8, 8, 9, 32, 16, 1, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_MULTIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, 5, 5, 5, 8, 8, 8, int8_t, void, int32_t, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) diff --git a/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h index 5281e1b2243..81174b6ffbc 100644 --- a/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h +++ b/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h @@ -118,12 +118,15 @@ XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25 XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpmovsx, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul16_vpunpck, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32, 32, false, 32, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, 16, false, 16, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, 32, false, 32, 9, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, 16, false, 16, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, 32, false, 32, 25, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && 
(XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_UNIPASS(0, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__wasmsimd_mul16_add16, 16, false, 16, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params) diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni-prfm.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni-prfm.c index 4b40552f071..e66654f7dbf 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni-prfm.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -187,190 +187,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, 
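The two dwconv declaration headers above stop listing the AVX512SKX micro-kernels under the bare x86/x86_64 block: those entries now sit in their own section that is compiled only when the build also enables AVX512SKX. The resulting pattern, with one entry reproduced from the unipass header:

#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32, 32, false, 32, 3, int8_t, void, union xnn_qs8_qc8w_conv_minmax_params, xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params)
// ... remaining AVX512SKX entries ...
#endif  // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64)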
*(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - 
v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -587,190 +443,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t 
*)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = 
_mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * 
sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni.c index 357f922f971..5072bde3fab 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avx256vnni.c @@ -30,7 +30,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -138,190 +138,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const 
int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 
28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = 
_mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -490,190 +346,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = 
_mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, 
*(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni-prfm.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni-prfm.c index a63ac278656..0c7edfb76fa 100644 --- 
a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni-prfm.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -187,190 +187,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = 
_mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, 
*(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -587,190 +443,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = 
_mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, 
*(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = 
_mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni.c index f11674bc7a3..15041c4f0ba 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-avxvnni.c @@ -30,7 +30,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -138,190 +138,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = 
_mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = 
_mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) 
unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -490,190 +346,46 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - __m256i v8 = _mm256_setzero_si256(); - __m256i v12 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w8, 0); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w9, 2); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w10, 4); - v8 = _mm256_insert_epi32(v8, *(const int32_t *)w11, 6); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w12, 0); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w13, 2); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w14, 4); - v12 = _mm256_insert_epi32(v12, *(const int32_t *)w15, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - w8 += 4; - w9 += 4; - w10 += 4; - w11 += 4; - w12 += 4; - w13 += 4; - w14 += 4; - w15 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 2); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 6); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 10); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 14); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 2); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 6); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 10); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w8, 0); - 
v8 = _mm256_insert_epi16(v8, *(const int16_t *)w9, 4); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w10, 8); - v8 = _mm256_insert_epi16(v8, *(const int16_t *)w11, 12); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w12, 0); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w13, 4); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w14, 8); - v12 = _mm256_insert_epi16(v12, *(const int16_t *)w15, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - w8 += 2; - w9 += 2; - w10 += 2; - w11 += 2; - w12 += 2; - w13 += 2; - w14 += 2; - w15 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 6); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 14); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 22); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 30); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 6); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 14); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 22); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 4); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 12); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 20); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 28); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 4); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 12); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 20); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 2); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 10); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 18); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 26); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 2); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 10); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 18); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = 
_mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w8, 0); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w9, 8); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w10, 16); - v8 = _mm256_insert_epi8(v8, *(const int8_t *)w11, 24); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w12, 0); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w13, 8); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w14, 16); - v12 = _mm256_insert_epi8(v12, *(const int8_t *)w15, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - w8 += 1; - w9 += 1; - w10 += 1; - w11 += 1; - w12 += 1; - w13 += 1; - w14 += 1; - w15 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + __m256i v8 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w8)); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w9)), 0x0C); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w10)), 0x30); + v8 = _mm256_blend_epi32(v8, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w11)), 0xC0); + v8 = _mm256_and_si256(v8, vmask); + __m256i v12 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w12)); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w13)), 0x0C); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w14)), 0x30); + v12 = _mm256_blend_epi32(v12, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w15)), 0xC0); + v12 = _mm256_and_si256(v12, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + w8 += k; + w9 += k; + w10 += k; + w11 += k; + w12 += k; + w13 += k; + w14 += k; + w15 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni-prfm.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni-prfm.c index 56153c32777..7eca738746c 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni-prfm.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -130,108 +130,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( // KC 
remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = 
_mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -360,108 +280,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, 
*(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni.c index 53ca5f206d1..dfac2ab8885 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avx256vnni.c @@ -30,7 +30,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -105,108 +105,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, 
*(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 
= _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); @@ -311,108 +231,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avx256vnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 
12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni-prfm.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni-prfm.c index 6bf982a99e5..ef9ea4338d0 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni-prfm.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni-prfm.c @@ -31,7 +31,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -130,108 +130,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 
4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) 
unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -360,108 +280,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, 
*(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni.c index 9c6c539df97..6fdc2cc2940 100644 --- a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni.c +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-avxvnni.c @@ -30,7 +30,7 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni( const void* scale, int8_t* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -105,108 +105,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = 
_mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += 
k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); @@ -311,108 +231,28 @@ void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__avxvnni( // KC remainder of 1..7 if (k != 0) { assert(k >= 1 && k <= 7); - __m256i v0 = _mm256_setzero_si256(); - __m256i v4 = _mm256_setzero_si256(); - - if (k & 4) { - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w0, 0); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w1, 2); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w2, 4); - v0 = _mm256_insert_epi32(v0, *(const int32_t *)w3, 6); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w4, 0); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w5, 2); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w6, 4); - v4 = _mm256_insert_epi32(v4, *(const int32_t *)w7, 6); - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - w4 += 4; - w5 += 4; - w6 += 4; - w7 += 4; - } - if (k & 2) { - if (k & 4) { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 2); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 6); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 10); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 14); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 2); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 6); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 10); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 14); - } else { - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w0, 0); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w1, 4); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w2, 8); - v0 = _mm256_insert_epi16(v0, *(const int16_t *)w3, 12); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w4, 0); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w5, 4); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w6, 8); - v4 = _mm256_insert_epi16(v4, *(const int16_t *)w7, 12); - } - - w0 += 2; - w1 += 2; - w2 += 2; - w3 += 2; - w4 += 2; - w5 += 2; - w6 += 2; - w7 += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 6); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 14); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 22); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 30); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 6); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 14); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 22); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 30); - } - else if (k & 4) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 4); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 12); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 20); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 28); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 4); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 12); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 20); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 28); - } - else if (k & 2) { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 2); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 10); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w2, 18); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 26); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 2); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 10); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 18); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 26); - } - else { - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w0, 0); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w1, 8); - v0 = 
_mm256_insert_epi8(v0, *(const int8_t *)w2, 16); - v0 = _mm256_insert_epi8(v0, *(const int8_t *)w3, 24); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w4, 0); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w5, 8); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w6, 16); - v4 = _mm256_insert_epi8(v4, *(const int8_t *)w7, 24); - } - - w0 += 1; - w1 += 1; - w2 += 1; - w3 += 1; - w4 += 1; - w5 += 1; - w6 += 1; - w7 += 1; - } + + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); + + __m256i v0 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w0)); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w1)), 0x0C); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w2)), 0x30); + v0 = _mm256_blend_epi32(v0, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w3)), 0xC0); + v0 = _mm256_and_si256(v0, vmask); + __m256i v4 = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w4)); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w5)), 0x0C); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w6)), 0x30); + v4 = _mm256_blend_epi32(v4, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w7)), 0xC0); + v4 = _mm256_and_si256(v4, vmask); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; vacc0 = _mm256_dpbusd_avx_epi32(vacc0, vone, v0); vacc4 = _mm256_dpbusd_avx_epi32(vacc4, vone, v4); diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c new file mode 100644 index 00000000000..f479706779f --- /dev/null +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c @@ -0,0 +1,370 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/packw.h" + + +void xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__wasmrelaxedsimd( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) XNN_OOB_READS +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 8); + assert(kr == 8); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + const v128_t vone = wasm_i8x16_splat(1); + const v128_t vzero = wasm_i32x4_splat(0); + XNN_FORCE_REALIZATION(vone); + XNN_FORCE_REALIZATION(vzero); + int8_t* out = (int8_t*) packed_weights; + const uint32_t* b = (const uint32_t*) bias; + const uint32_t izp = (uint32_t) (params ? 
(((const struct xnn_qs8_packw_params*) params)->input_zero_point + 128): 128); + v128_t vzeropoint = wasm_i32x4_splat((int32_t) izp); + + do { + // NC main loop multiple of 8 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 8; n -= 8) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + const v128_t vb0 = wasm_v128_load(b + 0); + wasm_v128_store(out + 0, vb0); + const v128_t vb1 = wasm_v128_load(b + 4); + wasm_v128_store(out + 16, vb1); + b += 8; + } else { + wasm_v128_store(out + 0, vzero); + wasm_v128_store(out + 16, vzero); + } + out += 8 * sizeof(uint32_t); + + const int8_t* w1 = w0 + kc; + const int8_t* w2 = w1 + kc; + const int8_t* w3 = w2 + kc; + const int8_t* w4 = w3 + kc; + const int8_t* w5 = w4 + kc; + const int8_t* w6 = w5 + kc; + const int8_t* w7 = w6 + kc; + + v128_t vacc01 = wasm_i32x4_splat(0); + v128_t vacc23 = wasm_i32x4_splat(0); + v128_t vacc45 = wasm_i32x4_splat(0); + v128_t vacc67 = wasm_i32x4_splat(0); + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 16; k -= 16) { + v128_t v0_01 = wasm_v128_load(w0); + v128_t v1_01 = wasm_v128_load(w1); + v128_t v2_01 = wasm_v128_load(w2); + v128_t v3_01 = wasm_v128_load(w3); + v128_t v4_01 = wasm_v128_load(w4); + v128_t v5_01 = wasm_v128_load(w5); + v128_t v6_01 = wasm_v128_load(w6); + v128_t v7_01 = wasm_v128_load(w7); + + v128_t v01_0 = wasm_i64x2_shuffle(v0_01, v1_01, 0, 2); + v128_t v01_1 = wasm_i64x2_shuffle(v0_01, v1_01, 1, 3); + v128_t v23_0 = wasm_i64x2_shuffle(v2_01, v3_01, 0, 2); + v128_t v23_1 = wasm_i64x2_shuffle(v2_01, v3_01, 1, 3); + v128_t v45_0 = wasm_i64x2_shuffle(v4_01, v5_01, 0, 2); + v128_t v45_1 = wasm_i64x2_shuffle(v4_01, v5_01, 1, 3); + v128_t v67_0 = wasm_i64x2_shuffle(v6_01, v7_01, 0, 2); + v128_t v67_1 = wasm_i64x2_shuffle(v6_01, v7_01, 1, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01_0, vone, vacc01); + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01_1, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23_0, vone, vacc23); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23_1, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45_0, vone, vacc45); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45_1, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67_0, vone, vacc67); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67_1, vone, vacc67); + + wasm_v128_store(out + 0, v01_0); + wasm_v128_store(out + 16, v23_0); + wasm_v128_store(out + 32, v45_0); + wasm_v128_store(out + 48, v67_0); + + wasm_v128_store(out + 64, v01_1); + wasm_v128_store(out + 80, v23_1); + wasm_v128_store(out + 96, v45_1); + wasm_v128_store(out + 112, v67_1); + + w0 += 16; + w1 += 16; + w2 += 16; + w3 += 16; + w4 += 16; + w5 += 16; + w6 += 16; + w7 += 16; + out += 128; + } + + for (; k >= 8; k -= 8) { + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + const v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + const v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + const v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + const v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + out += 64; + } + + // KC remainder 1..KR-1 + if (k != 0) { + assert(k >= 1 && k <= 7); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (8 - k) * sizeof(int8_t) * 8); + + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + v01 = wasm_v128_and(v01, vmask); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + v23 = wasm_v128_and(v23, vmask); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + v45 = wasm_v128_and(v45, vmask); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + v67 = wasm_v128_and(v67, vmask); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += k; + w1 += k; + w2 += k; + w3 += k; + w4 += k; + w5 += k; + w6 += k; + w7 += k; + out += 64; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w7; + } + + // NC remainder (1..7) + if XNN_UNLIKELY(n != 0) { + assert(n >= 1 && n <= 7); + + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((uint32_t*) out) = *b++; + out += sizeof(uint32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((uint32_t*) out) = 0; + out += sizeof(uint32_t); + } while (--nb != 0); + } + out += (8 - n) * sizeof(uint32_t); + + const int8_t* w1 = w0 + kc; + if XNN_UNPREDICTABLE(n < 2) { + w1 = w0; + } + const int8_t* w2 = w1 + kc; + if XNN_UNPREDICTABLE(n <= 2) { + w2 = w1; + } + const int8_t* w3 = w2 + kc; + if XNN_UNPREDICTABLE(n < 4) { + w3 = w2; + } + const int8_t* w4 = w3 + kc; + if XNN_UNPREDICTABLE(n <= 4) { + w4 = w3; + } + const int8_t* w5 = w4 + kc; + if XNN_UNPREDICTABLE(n < 6) { + w5 = w4; + } + const int8_t* w6 = w5 + kc; + if XNN_UNPREDICTABLE(n <= 6) { + w6 = w5; + } + const int8_t* w7 = w6 + kc; + if XNN_UNPREDICTABLE(n < 8) { + w7 = w6; + } + + v128_t vacc01 = wasm_i32x4_splat(0); + v128_t vacc23 = 
wasm_i32x4_splat(0); + v128_t vacc45 = wasm_i32x4_splat(0); + v128_t vacc67 = wasm_i32x4_splat(0); + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + const v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + const v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + const v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + const v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + out += 64; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (8 - k) * sizeof(int8_t) * 8); + + const v128_t v0 = wasm_v128_load64_splat(w0); + const v128_t v1 = wasm_v128_load64_splat(w1); + v128_t v01 = wasm_i64x2_shuffle(v0, v1, 0, 3); + v01 = wasm_v128_and(v01, vmask); + const v128_t v2 = wasm_v128_load64_splat(w2); + const v128_t v3 = wasm_v128_load64_splat(w3); + v128_t v23 = wasm_i64x2_shuffle(v2, v3, 0, 3); + v23 = wasm_v128_and(v23, vmask); + const v128_t v4 = wasm_v128_load64_splat(w4); + const v128_t v5 = wasm_v128_load64_splat(w5); + v128_t v45 = wasm_i64x2_shuffle(v4, v5, 0, 3); + v45 = wasm_v128_and(v45, vmask); + const v128_t v6 = wasm_v128_load64_splat(w6); + const v128_t v7 = wasm_v128_load64_splat(w7); + v128_t v67 = wasm_i64x2_shuffle(v6, v7, 0, 3); + v67 = wasm_v128_and(v67, vmask); + + vacc01 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v01, vone, vacc01); + vacc23 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v23, vone, vacc23); + vacc45 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v45, vone, vacc45); + vacc67 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v67, vone, vacc67); + + wasm_v128_store(out + 0, v01); + wasm_v128_store(out + 16, v23); + wasm_v128_store(out + 32, v45); + wasm_v128_store(out + 48, v67); + + out += 64; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/qs8-requantization/qs8-requantization-rndna-neon.c b/src/qs8-requantization/qs8-requantization-rndna-neon.c deleted file mode 100644 index 4a1ac199959..00000000000 --- 
a/src/qs8-requantization/qs8-requantization-rndna-neon.c +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__neon( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000); - const int32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - -#if defined(__aarch64__) - const int32x4_t vmultiplier = vdupq_n_s32(multiplier); -#else - const int32x2_t vmultiplier = vdup_n_s32(multiplier); -#endif - const int16x8_t vzero_point = vdupq_n_s16((int16_t) zero_point); - const int64x2_t vshift = vdupq_n_s64(-shift); - const int8x16_t vqmin = vdupq_n_s8(qmin); - const int8x16_t vqmax = vdupq_n_s8(qmax); - for (; n != 0; n -= 16) { - const int32x4_t x = vld1q_s32(input); - const int32x4_t y = vld1q_s32(input + 4); - const int32x4_t z = vld1q_s32(input + 8); - const int32x4_t w = vld1q_s32(input + 12); - input += 16; - - const uint32x4_t x_neg_mask = vcltq_s32(x, vmovq_n_s32(0)); - const uint32x4_t y_neg_mask = vcltq_s32(y, vmovq_n_s32(0)); - const uint32x4_t z_neg_mask = vcltq_s32(z, vmovq_n_s32(0)); - const uint32x4_t w_neg_mask = vcltq_s32(w, vmovq_n_s32(0)); - -#if defined(__aarch64__) - const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vget_low_s32(vmultiplier)); - const int64x2_t x23_product = vmull_high_s32(x, vmultiplier); - const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vget_low_s32(vmultiplier)); - const int64x2_t y23_product = vmull_high_s32(y, vmultiplier); - const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vget_low_s32(vmultiplier)); - const int64x2_t z23_product = vmull_high_s32(z, vmultiplier); - const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vget_low_s32(vmultiplier)); - const int64x2_t w23_product = vmull_high_s32(w, vmultiplier); -#else - const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vmultiplier); - const int64x2_t x23_product = vmull_s32(vget_high_s32(x), vmultiplier); - const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vmultiplier); - const int64x2_t y23_product = vmull_s32(vget_high_s32(y), vmultiplier); - const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vmultiplier); - const int64x2_t z23_product = vmull_s32(vget_high_s32(z), vmultiplier); - const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vmultiplier); - const int64x2_t w23_product = vmull_s32(vget_high_s32(w), vmultiplier); -#endif - -#if defined(__aarch64__) - const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask))); - const int64x2_t x23_adjusted_product = vaddw_high_s32(x23_product, vreinterpretq_s32_u32(x_neg_mask)); - const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask))); - const int64x2_t y23_adjusted_product = vaddw_high_s32(y23_product, vreinterpretq_s32_u32(y_neg_mask)); - const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, 
vreinterpret_s32_u32(vget_low_u32(z_neg_mask))); - const int64x2_t z23_adjusted_product = vaddw_high_s32(z23_product, vreinterpretq_s32_u32(z_neg_mask)); - const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask))); - const int64x2_t w23_adjusted_product = vaddw_high_s32(w23_product, vreinterpretq_s32_u32(w_neg_mask)); -#else - const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask))); - const int64x2_t x23_adjusted_product = vaddw_s32(x23_product, vreinterpret_s32_u32(vget_high_u32(x_neg_mask))); - const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask))); - const int64x2_t y23_adjusted_product = vaddw_s32(y23_product, vreinterpret_s32_u32(vget_high_u32(y_neg_mask))); - const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, vreinterpret_s32_u32(vget_low_u32(z_neg_mask))); - const int64x2_t z23_adjusted_product = vaddw_s32(z23_product, vreinterpret_s32_u32(vget_high_u32(z_neg_mask))); - const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask))); - const int64x2_t w23_adjusted_product = vaddw_s32(w23_product, vreinterpret_s32_u32(vget_high_u32(w_neg_mask))); -#endif - - const int64x2_t x01_scaled = vrshlq_s64(x01_adjusted_product, vshift); - const int64x2_t x23_scaled = vrshlq_s64(x23_adjusted_product, vshift); - const int64x2_t y01_scaled = vrshlq_s64(y01_adjusted_product, vshift); - const int64x2_t y23_scaled = vrshlq_s64(y23_adjusted_product, vshift); - const int64x2_t z01_scaled = vrshlq_s64(z01_adjusted_product, vshift); - const int64x2_t z23_scaled = vrshlq_s64(z23_adjusted_product, vshift); - const int64x2_t w01_scaled = vrshlq_s64(w01_adjusted_product, vshift); - const int64x2_t w23_scaled = vrshlq_s64(w23_adjusted_product, vshift); - -#ifdef __aarch64__ - const int32x4_t x_scaled = vuzp1q_s32(vreinterpretq_s32_s64(x01_scaled), vreinterpretq_s32_s64(x23_scaled)); - const int32x4_t y_scaled = vuzp1q_s32(vreinterpretq_s32_s64(y01_scaled), vreinterpretq_s32_s64(y23_scaled)); - const int32x4_t z_scaled = vuzp1q_s32(vreinterpretq_s32_s64(z01_scaled), vreinterpretq_s32_s64(z23_scaled)); - const int32x4_t w_scaled = vuzp1q_s32(vreinterpretq_s32_s64(w01_scaled), vreinterpretq_s32_s64(w23_scaled)); - - const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_scaled), y_scaled), vzero_point); - const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_scaled), w_scaled), vzero_point); - const int8x16_t xyzw_packed = vqmovn_high_s16(vqmovn_s16(xy_packed), zw_packed); -#else - const int32x4_t x_scaled = vcombine_s32(vmovn_s64(x01_scaled), vmovn_s64(x23_scaled)); - const int32x4_t y_scaled = vcombine_s32(vmovn_s64(y01_scaled), vmovn_s64(y23_scaled)); - const int32x4_t z_scaled = vcombine_s32(vmovn_s64(z01_scaled), vmovn_s64(z23_scaled)); - const int32x4_t w_scaled = vcombine_s32(vmovn_s64(w01_scaled), vmovn_s64(w23_scaled)); - - const int16x8_t xy_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(x_scaled), vqmovn_s32(y_scaled)), vzero_point); - const int16x8_t zw_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(z_scaled), vqmovn_s32(w_scaled)), vzero_point); - const int8x16_t xyzw_packed = vcombine_s8(vqmovn_s16(xy_packed), vqmovn_s16(zw_packed)); -#endif - - const int8x16_t xyzw_clamped = vmaxq_s8(vminq_s8(xyzw_packed, vqmax), vqmin); - - // AArch32 version: - // 4x VCLT.S32 Qd, Qm, #0 - // 8x VMULL.S32 Qd, Dm, Dn - // 8x VADDW.S32 Qd, Qm, Dn - // 8x VRSHL.S32 Qd, Qm, Qn - 
// 8x VMOVN.S64 Dd, Qm - // 4x VQMOVN.S32 Dd, Qm - // 2x VQADD.S16 Qd, Qm, Qn - // 2x VQMOVUN.S16 Dd, Qm - // 1x VMAX.U8 Qd, Qm, Qn - // 1x VMIN.U8 Qd, Qm, Qn - // --------------------- - // 46 instructions total - // - // AArch64 version: - // 4x CMLT Vd.4S, Vn.4S, #0 - // 4x SMULL Vd.2D, Vn.2S, Vm.2S - // 4x SMULL2 Vd.2D, Vn.4S, Vm.4S - // 4x SADDW Vd.2D, Vn.2D, Vm.2S - // 4x SADDW2 Vd.2D, Vn.2D, Vm.4S - // 8x SRSHL Vd.2D, Vn.2D, Vm.2D - // 4x UZP1 Vd.4S, Vn.4S, Vm.4S - // 2x SQXTN Vd.4H, Vn.4S - // 2x SQXTN2 Vd.8H, Vn.4S - // 2x SQADD Vd.8H, Vn.8H, Vm.8H - // 1x SQXTN Vd.8B, Vn.8H - // 1x SQXTN2 Vd.16B, Vn.8H - // 1x SMIN Vd.16B, Vn.16B, Vm.16B - // 1x SMAX Vd.16B, Vn.16B, Vm.16B - // --------------------- - // 42 instructions total - - vst1q_s8(output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c b/src/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c deleted file mode 100644 index bc190737b57..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__scalar_signed64( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - - const int64_t rounding = INT64_C(1) << (shift - 1); - const int32_t smin = (int32_t) qmin - (int32_t) zero_point; - const int32_t smax = (int32_t) qmax - (int32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute full 64-bit product of signed 32-bit factors. - // - // Note: multiplier can be treated as either signed or unsigned. - const int64_t x_product = (int64_t) x * (int64_t) multiplier; - const int64_t y_product = (int64_t) y * (int64_t) multiplier; - const int64_t z_product = (int64_t) z * (int64_t) multiplier; - const int64_t w_product = (int64_t) w * (int64_t) multiplier; - - // Adjust product before subsequent shift with rounding up to simulate shift with rounding away from zero. - const int64_t x_adjusted_product = x_product - (int64_t)(x < 0); - const int64_t y_adjusted_product = y_product - (int64_t)(y < 0); - const int64_t z_adjusted_product = z_product - (int64_t)(z < 0); - const int64_t w_adjusted_product = w_product - (int64_t)(w < 0); - - // Arithmetically shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up. - // - // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit - // "right shift with rounding" instruction each line below can be represented by just one such instruction - // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD). 
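// A minimal standalone sketch (not taken from the kernels in this diff) of the
// rounding-away-from-zero ("rndna") requantization that the deleted scalar
// code implements; the fixed-point decomposition and the -1 adjustment for
// negative products are the same. It assumes an arithmetic right shift for
// negative int64_t values, which the library obtains via math_asr_s64().
#include <assert.h>
#include <stdint.h>
#include <string.h>

static int32_t rndna_requantize_one(int32_t x, float scale) {
  // Reinterpret the float: scale == multiplier * 2**-shift with
  // multiplier in [2**23, 2**24) and, for 0x1.0p-32f <= scale < 1.0f,
  // shift in [24, 56).
  uint32_t scale_bits;
  memcpy(&scale_bits, &scale, sizeof(scale_bits));
  const int64_t multiplier = (int64_t) ((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000));
  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
  assert(shift >= 24 && shift < 56);
  const int64_t rounding = INT64_C(1) << (shift - 1);
  // Subtracting 1 from negative products turns the round-half-up shift below
  // into round-half-away-from-zero.
  const int64_t product = (int64_t) x * multiplier - (int64_t) (x < 0);
  return (int32_t) ((product + rounding) >> shift);
}
// Example: x = 100, scale = 0.25f gives multiplier = 0x800000, shift = 25,
// and a result of 25; x = -2 gives -0.5, which rounds away from zero to -1.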
- const int32_t x_scaled = (int32_t) math_asr_s64(x_adjusted_product + rounding, shift); - const int32_t y_scaled = (int32_t) math_asr_s64(y_adjusted_product + rounding, shift); - const int32_t z_scaled = (int32_t) math_asr_s64(z_adjusted_product + rounding, shift); - const int32_t w_scaled = (int32_t) math_asr_s64(w_adjusted_product + rounding, shift); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 127) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (int8_t) x_biased; - output[1] = (int8_t) y_biased; - output[2] = (int8_t) z_biased; - output[3] = (int8_t) w_biased; - output += 4; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c b/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c deleted file mode 100644 index 0d8c27109b8..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__scalar_unsigned32( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000); - const uint32_t shift = 127 + 31 - (scale_bits >> 23); - assert(shift >= 32); - assert(shift < 64); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const uint32_t rounding_hi = (uint32_t)(rounding >> 32); - const uint32_t rounding_lo = (uint32_t) rounding; - const uint32_t shift_minus_32 = shift - 32; - const int32_t smin = (int32_t) qmin - (int32_t) zero_point; - const int32_t smax = (int32_t) qmax - (int32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute absolute value of input as unsigned 32-bit int. - // All further computations will work with unsigned values to avoid undefined behaviour on signed operations. - const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x; - const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y; - const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z; - const uint32_t w_abs = (w >= 0) ? 
(uint32_t) w : -(uint32_t) w; - - // Compute full 64-bit product of 32-bit factors. - const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier; - const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier; - const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier; - const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier; - - // Shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero). - // - // Generally, this operation requires both 64-bit addition and 64-bit shift, but we use two tricks to replace - // 64-bit operations with 32-bit operations. - // - // To avoid full 64-bit addition we make use of three facts: - // - 64-bit rounding value added before the shift is a power of 2, and thus has only one bit set. - // - When 0x1.0p-32f <= scale < 0x1.0p-31f, then the non-zero bit in rounding is in the low 32 bits, and - // rounding is exactly 0x80000000 (2**31), because rounding is 2**(scale-1) and scale >= 32. In this case, - // addition of rounding can affect high 32 bits of the product only through overflow, which happens if - // low 32-bit part of the product equals or exceeds 0x80000000. We can reformulate the latter condition - // as low 32-bit part of the product has the bit 31 set, and then overflow happens if both the low 32-bit part - // of the product and the low 32-bit part of the rounding value have bit 31 set. Since 32-bit numbers with the - // bit 31 set are negative when interpreted as signed integers, we can check the overflow condition as - // (int32_t) (LOW(product) & LOW(rounding)) < 0 - // - When 0x1.0p-31f <= scale < 1.0f, then the non-zero bit is in the high 32 bits of rounding. We just need - // to do 32-bit addition of high 32 bits of rounding and high 32 bits of product. This addition never - // overflows because product <= 0x80000000 * 0xFFFFFF00 < 2**63 and rounding = 2**(scale-1) <= 2**62. - // - // To avoid full 64-bit shift, we leverage the fact that shift >= 32, and do it in two steps: - // - Shift by 32, which can be implemented by extacting the high 32-bit word on 32-bit systems. - // - Shift by (shift - 32), which can be implemented as a 32-bit shift of high word of addition result. - const uint32_t x_carry_lo = (uint32_t) ((int32_t) ((uint32_t) x_product & rounding_lo) < 0); - const uint32_t y_carry_lo = (uint32_t) ((int32_t) ((uint32_t) y_product & rounding_lo) < 0); - const uint32_t z_carry_lo = (uint32_t) ((int32_t) ((uint32_t) z_product & rounding_lo) < 0); - const uint32_t w_carry_lo = (uint32_t) ((int32_t) ((uint32_t) w_product & rounding_lo) < 0); - - const uint32_t x_product_hi = (uint32_t) (x_product >> 32); - const uint32_t y_product_hi = (uint32_t) (y_product >> 32); - const uint32_t z_product_hi = (uint32_t) (z_product >> 32); - const uint32_t w_product_hi = (uint32_t) (w_product >> 32); - - const uint32_t x_abs_scaled = (uint32_t) (x_product_hi + rounding_hi + x_carry_lo) >> shift_minus_32; - const uint32_t y_abs_scaled = (uint32_t) (y_product_hi + rounding_hi + y_carry_lo) >> shift_minus_32; - const uint32_t z_abs_scaled = (uint32_t) (z_product_hi + rounding_hi + z_carry_lo) >> shift_minus_32; - const uint32_t w_abs_scaled = (uint32_t) (w_product_hi + rounding_hi + w_carry_lo) >> shift_minus_32; - - // Copy the sign of input to scaled absolute input value. - const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled); - const int32_t y_scaled = (int32_t) (y >= 0 ? 
y_abs_scaled : -y_abs_scaled); - const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled); - const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 127) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (int8_t) x_biased; - output[1] = (int8_t) y_biased; - output[2] = (int8_t) z_biased; - output[3] = (int8_t) w_biased; - output += 4; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c b/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c deleted file mode 100644 index 71d3a0a1f65..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__scalar_unsigned64( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const int32_t smin = (int32_t) qmin - (int32_t) zero_point; - const int32_t smax = (int32_t) qmax - (int32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute absolute value of input as unsigned 32-bit int. - // All further computations will work with unsigned values to avoid undefined behaviour on signed operations. - const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x; - const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y; - const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z; - const uint32_t w_abs = (w >= 0) ? (uint32_t) w : -(uint32_t) w; - - // Compute full 64-bit product of 32-bit factors. 
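// Note on the sign-magnitude handling in these scalar kernels: the absolute
// value is taken as "(x >= 0) ? (uint32_t) x : -(uint32_t) x" rather than with
// abs()/-x because the negation is performed on an unsigned value. Unsigned
// negation is defined modulo 2**32, so INT32_MIN maps to 2**31 (its true
// magnitude), whereas negating INT32_MIN as a signed int would be undefined
// behaviour. The multiply/shift steps then stay in unsigned arithmetic and the
// sign is copied back only at the end.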
- const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier; - const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier; - const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier; - const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier; - - // Shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero). - // - // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit - // "right shift with rounding" instruction each line below can be represented by just one such instruction - // (e.g. VRSHL.U64 on ARM NEON, URSHL in ARM64 Advanced SIMD). - const uint32_t x_abs_scaled = (uint32_t) ((x_product + rounding) >> shift); - const uint32_t y_abs_scaled = (uint32_t) ((y_product + rounding) >> shift); - const uint32_t z_abs_scaled = (uint32_t) ((z_product + rounding) >> shift); - const uint32_t w_abs_scaled = (uint32_t) ((w_product + rounding) >> shift); - - // Copy the sign of input to scaled absolute input value. - // - // On x86 processors with SSSE3 instruction set, this operation nicely maps to PSIGND instruction. - const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled); - const int32_t y_scaled = (int32_t) (y >= 0 ? y_abs_scaled : -y_abs_scaled); - const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled); - const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 127) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (int8_t) x_biased; - output[1] = (int8_t) y_biased; - output[2] = (int8_t) z_biased; - output[3] = (int8_t) w_biased; - output += 4; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-sse2.c b/src/qs8-requantization/qs8-requantization-rndna-sse2.c deleted file mode 100644 index a9158b17659..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-sse2.c +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
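// Note on the SSE2 kernel that follows: SSE2 has neither PABSD nor PSIGND, so
// absolute values are formed and signs restored with the xor/subtract
// identity, shown here in scalar form:
//   mask = (x < 0) ? -1 : 0;       // _mm_cmpgt_epi32(_mm_setzero_si128(), x)
//   abs  = (x ^ mask) - mask;      // xor flips the bits, subtracting -1 adds 1
//   res  = (scaled ^ mask) - mask; // the same identity re-applies the sign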
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__sse2( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) zero_point); - const __m128i vqmin = _mm_set1_epi8((short) qmin); - const __m128i vqmax = _mm_set1_epi8((short) qmax); - const __m128i vshift = _mm_cvtsi32_si128((int) shift); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x); - const __m128i y_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), y); - const __m128i z_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), z); - const __m128i w_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), w); - - const __m128i x_abs0123 = _mm_sub_epi32(_mm_xor_si128(x, x_neg_mask), x_neg_mask); - const __m128i y_abs0123 = _mm_sub_epi32(_mm_xor_si128(y, y_neg_mask), y_neg_mask); - const __m128i z_abs0123 = _mm_sub_epi32(_mm_xor_si128(z, z_neg_mask), z_neg_mask); - const __m128i w_abs0123 = _mm_sub_epi32(_mm_xor_si128(w, w_neg_mask), w_neg_mask); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier); - - const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift); - const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift); - const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift); - const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift); - const __m128i w_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(w_absmul13, 
vrounding), vshift); - - const __m128i x_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i y_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i z_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i w_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - - const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - - const __m128i x_scaled = _mm_sub_epi32(_mm_xor_si128(x_abs_scaled, x_neg_mask), x_neg_mask); - const __m128i y_scaled = _mm_sub_epi32(_mm_xor_si128(y_abs_scaled, y_neg_mask), y_neg_mask); - const __m128i z_scaled = _mm_sub_epi32(_mm_xor_si128(z_abs_scaled, z_neg_mask), z_neg_mask); - const __m128i w_scaled = _mm_sub_epi32(_mm_xor_si128(w_abs_scaled, w_neg_mask), w_neg_mask); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin); - const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin); - const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped); - - // 4x PXOR (setzero) - // 8x PSUBD - // 8x PXOR - // 8x PSHUFD - // 8x PMULUDQ - // 8x PSRLQ - // 8x PADDQ - // 4x SHUFPS - // 2x PACKSSDW - // 2x PADDSW - // 2x PMAXSW - // 2x PMINSW - // 1x PACKSSWB - // --------------------- - // 63 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-sse41.c b/src/qs8-requantization/qs8-requantization-rndna-sse41.c deleted file mode 100644 index 1801e3642ca..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-sse41.c +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
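// Note on the SSE4.1 kernel that follows: it uses the 32-bit-multiplier form
// of the same decomposition, multiplier = (scale_bits << 8) | 0x80000000 with
// shift in [32, 64), and replaces the xor/subtract sign handling with
// PABSD/PSIGND (_mm_abs_epi32/_mm_sign_epi32).
// Worked example: scale = 0.25f has scale_bits = 0x3E800000, so
// multiplier = 0x80000000, shift = 127 + 31 - 125 = 33, and indeed
// 0x80000000 * 2**-33 == 0.25.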
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__sse41( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000); - const uint32_t shift = 127 + 31 - (scale_bits >> 23); - assert(shift >= 32); - assert(shift < 64); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) zero_point); - const __m128i vqmin = _mm_set1_epi8((char) qmin); - const __m128i vqmax = _mm_set1_epi8((char) qmax); - const __m128i vshiftlo = _mm_cvtsi32_si128((int) shift); - const __m128i vshifthi = _mm_cvtsi32_si128((int) shift - 32); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_abs0123 = _mm_abs_epi32(x); - const __m128i y_abs0123 = _mm_abs_epi32(y); - const __m128i z_abs0123 = _mm_abs_epi32(z); - const __m128i w_abs0123 = _mm_abs_epi32(w); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier); - - const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshiftlo); - const __m128i x_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(x_absmul13, vrounding), vshifthi); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshiftlo); - const __m128i y_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(y_absmul13, vrounding), vshifthi); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshiftlo); - const __m128i z_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(z_absmul13, vrounding), vshifthi); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshiftlo); - const __m128i w_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(w_absmul13, vrounding), vshifthi); - - const __m128i x_abs_scaled = _mm_blend_epi16(x_abs_scaled02, x_abs_scaled13, 0xCC); - const __m128i y_abs_scaled = _mm_blend_epi16(y_abs_scaled02, y_abs_scaled13, 0xCC); - const __m128i z_abs_scaled = _mm_blend_epi16(z_abs_scaled02, z_abs_scaled13, 0xCC); - const __m128i w_abs_scaled = _mm_blend_epi16(w_abs_scaled02, w_abs_scaled13, 0xCC); - - const 
__m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x); - const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y); - const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z); - const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xyzw_packed = _mm_packs_epi16(xy_packed, zw_packed); - const __m128i xyzw_clamped = _mm_max_epi8(_mm_min_epi8(xyzw_packed, vqmax), vqmin); - - // 4x PABSD - // 4x PSHUFD - // 8x PMULUDQ - // 4x PSRLQ - // 4x PSRLD - // 8x PADDQ - // 4x PBLENDW - // 4x PSIGND - // 2x PACKSSDW - // 2x PADDSW - // 1x PACKSSWB - // 1x PMAXSB - // 1x PMINSB - // --------------------- - // 47 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qs8-requantization/qs8-requantization-rndna-ssse3.c b/src/qs8-requantization/qs8-requantization-rndna-ssse3.c deleted file mode 100644 index 7ef62fca878..00000000000 --- a/src/qs8-requantization/qs8-requantization-rndna-ssse3.c +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qs8_requantize_rndna__ssse3( - size_t n, - const int32_t* input, - float scale, - int8_t zero_point, - int8_t qmin, - int8_t qmax, - int8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) zero_point); - const __m128i vqmin = _mm_set1_epi8((char) qmin); - const __m128i vqmax = _mm_set1_epi8((char) qmax); - const __m128i vshift = _mm_cvtsi32_si128((int) shift); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_abs0123 = _mm_abs_epi32(x); - const __m128i y_abs0123 = _mm_abs_epi32(y); - const __m128i z_abs0123 = _mm_abs_epi32(z); - const __m128i w_abs0123 = _mm_abs_epi32(w); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier); - - const 
__m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift); - const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift); - const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift); - const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift); - const __m128i w_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(w_absmul13, vrounding), vshift); - - const __m128i x_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i y_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i z_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i w_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - - const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - - const __m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x); - const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y); - const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z); - const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin); - const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin); - const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped); - - // 4x PABSD - // 8x PSHUFD - // 8x PMULUDQ - // 8x PSRLQ - // 8x PADDQ - // 4x SHUFPS - // 4x PSIGND - // 2x PACKSSDW - // 2x PADDSW - // 2x PMAXSW - // 2x PMINSW - // 1x PACKSSWB - // --------------------- - // 53 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qs8-rsum/avxvnni.c.in b/src/qs8-rsum/avxvnni.c.in index a9360704950..3144dadd551 100644 --- a/src/qs8-rsum/avxvnni.c.in +++ b/src/qs8-rsum/avxvnni.c.in @@ -22,7 +22,7 @@ void xnn_qs8_rsum_ukernel__${ISA}_u${CHANNEL_TILE}${ACC_SUFFIX}( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); 
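// Note on the qs8-rsum hunks above and below: the only change is appending the
// XNN_OOB_READS annotation to each ukernel definition. In XNNPACK this
// annotation marks kernels that may read (but never write) a few bytes past
// the end of their input buffers, which lets sanitizer builds tolerate the
// wide vector loads used for the tail of the reduction; the generated per-ISA
// files below simply pick up the annotation from the avxvnni.c.in template.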
diff --git a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc2.c b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc2.c index 1966a24f5a4..cad57ae03ae 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc2.c +++ b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc2.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avx256vnni_u128_acc2( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc4.c b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc4.c index bd0696d9a69..b15d29b403c 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc4.c +++ b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u128-acc4.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avx256vnni_u128_acc4( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u32.c b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u32.c index 40d68d7348f..875122eaebe 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u32.c +++ b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u32.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avx256vnni_u32( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u64-acc2.c b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u64-acc2.c index 0bd11e29652..b14679e6fb7 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u64-acc2.c +++ b/src/qs8-rsum/gen/qs8-rsum-avx256vnni-u64-acc2.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avx256vnni_u64_acc2( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc2.c b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc2.c index 36ab9b51333..12c1106f1a1 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc2.c +++ b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc2.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avxvnni_u128_acc2( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc4.c b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc4.c index 95eaa1d0152..8f8f83a184f 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc4.c +++ b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u128-acc4.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avxvnni_u128_acc4( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u32.c b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u32.c index 
9c9dbcc02d8..5f941b2e3aa 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u32.c +++ b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u32.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avxvnni_u32( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u64-acc2.c b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u64-acc2.c index 49c1e249ead..b4da645e710 100644 --- a/src/qs8-rsum/gen/qs8-rsum-avxvnni-u64-acc2.c +++ b/src/qs8-rsum/gen/qs8-rsum-avxvnni-u64-acc2.c @@ -19,7 +19,7 @@ void xnn_qs8_rsum_ukernel__avxvnni_u64_acc2( size_t batch, const int8_t* input, int32_t* output, - const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) + const struct xnn_qs8_rsum_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(input != NULL); diff --git a/src/qs8-vadd/qs8-vadd-minmax.h b/src/qs8-vadd/qs8-vadd-minmax.h index 7dc637cb967..b7733a26eac 100644 --- a/src/qs8-vadd/qs8-vadd-minmax.h +++ b/src/qs8-vadd/qs8-vadd-minmax.h @@ -50,9 +50,12 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vadd_minmax_ukernel__avx2_mul XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_u16, 16, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_u24, 24, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_u32, 32, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16, 16, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u32, 32, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vadd_minmax_ukernel__wasmsimd_u8, 8, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) diff --git a/src/qs8-vaddc/qs8-vaddc-minmax.h b/src/qs8-vaddc/qs8-vaddc-minmax.h index 4b355506b4a..ad1d3378c1a 100644 --- a/src/qs8-vaddc/qs8-vaddc-minmax.h +++ b/src/qs8-vaddc/qs8-vaddc-minmax.h @@ -50,9 +50,12 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vaddc_minmax_ukernel__avx2_mu XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16, 16, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u24, 24, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u32, 32, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) +#endif // XNN_ARCH_X86 || 
XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16, 16, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u32, 32, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vaddc_minmax_ukernel__wasmsimd_u8, 8, false, int8_t, struct xnn_qs8_add_minmax_params, xnn_init_qs8_add_minmax_scalar_params) diff --git a/src/qs8-vhswish/gen/qs8-vhswish-avx-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-avx-u16.c index 07502a89c30..db71888fb26 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-avx-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-avx-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__avx_u16( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-avx-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-avx-u32.c index 320d57cd955..678b0673069 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-avx-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-avx-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__avx_u32( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-avx-u8.c b/src/qs8-vhswish/gen/qs8-vhswish-avx-u8.c index 6cf502bc837..b3283a53465 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-avx-u8.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-avx-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__avx_u8( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-neon-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-neon-u16.c index 8bf521a0793..424b59aba47 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-neon-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-neon-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__neon_u16( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-neon-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-neon-u32.c index 0147208accc..f1e85ba2102 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-neon-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-neon-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__neon_u32( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-neon-u8.c b/src/qs8-vhswish/gen/qs8-vhswish-neon-u8.c index ff85fb858b9..ab486679d21 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-neon-u8.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-neon-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__neon_u8( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u1.c b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u1.c index 73a73e3011d..69cb0456dc5 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u1.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include 
"xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__scalar_u1( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u2.c b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u2.c index 47bad8d6d64..133c27dec2f 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u2.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__scalar_u2( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u4.c b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u4.c index 4f1bf9f7c36..9bd75968885 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-scalar-u4.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-scalar-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__scalar_u4( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-sse2-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-sse2-u16.c index a2e32679b81..9ccf642d7e9 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-sse2-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-sse2-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/gen/qs8-vhswish-sse2-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-sse2-u32.c index 08496a330d4..0fdc1eb6304 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-sse2-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-sse2-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u16.c index 217ce4dd5f2..3ede288bf8b 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__sse41_u16( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u32.c index bf1eb5a0ee8..35530dbc19a 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__sse41_u32( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u8.c b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u8.c index ab4d2aa9975..20469e949df 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-sse41-u8.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-sse41-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__sse41_u8( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u16.c index e68869b4c45..edd712abac1 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u32.c index cc12ba79780..9aff74aaaaf 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-ssse3-u32.c @@ -12,7 +12,7 @@ #include #include 
"xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u16.c b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u16.c index ad4147be867..ee083d92a84 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u16.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__wasmsimd_u16( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u32.c b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u32.c index 551309852f1..6181ed75c96 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u32.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__wasmsimd_u32( diff --git a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u8.c b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u8.c index 20a8a9e6487..7ccd5946192 100644 --- a/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u8.c +++ b/src/qs8-vhswish/gen/qs8-vhswish-wasmsimd-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qs8_vhswish_ukernel__wasmsimd_u8( diff --git a/src/qs8-vhswish/neon.c.in b/src/qs8-vhswish/neon.c.in index 5002bd892b9..f0efc5e3578 100644 --- a/src/qs8-vhswish/neon.c.in +++ b/src/qs8-vhswish/neon.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vhswish/scalar.c.in b/src/qs8-vhswish/scalar.c.in index 1e3cae3e2ec..fdc1e6d7c94 100644 --- a/src/qs8-vhswish/scalar.c.in +++ b/src/qs8-vhswish/scalar.c.in @@ -9,7 +9,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vhswish/sse2.c.in b/src/qs8-vhswish/sse2.c.in index bc6a64e18cb..926e7a04dbe 100644 --- a/src/qs8-vhswish/sse2.c.in +++ b/src/qs8-vhswish/sse2.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/sse4.c.in b/src/qs8-vhswish/sse4.c.in index 61c44b9f562..dff7aa0d1e3 100644 --- a/src/qs8-vhswish/sse4.c.in +++ b/src/qs8-vhswish/sse4.c.in @@ -13,7 +13,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vhswish/ssse3.c.in b/src/qs8-vhswish/ssse3.c.in index 62b9015227f..75307832f3b 100644 --- a/src/qs8-vhswish/ssse3.c.in +++ b/src/qs8-vhswish/ssse3.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vhswish/wasmsimd.c.in b/src/qs8-vhswish/wasmsimd.c.in index 3ad456c9ff2..adf3174a928 100644 --- a/src/qs8-vhswish/wasmsimd.c.in +++ b/src/qs8-vhswish/wasmsimd.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include 
"xnnpack/vhswish.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/armsimd32.c.in b/src/qs8-vlrelu/armsimd32.c.in index 19489f37365..65c520a527e 100644 --- a/src/qs8-vlrelu/armsimd32.c.in +++ b/src/qs8-vlrelu/armsimd32.c.in @@ -14,7 +14,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/math.h" #include "xnnpack/unaligned.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/avx2.c.in b/src/qs8-vlrelu/avx2.c.in index ff38529403a..2fe1c86a099 100644 --- a/src/qs8-vlrelu/avx2.c.in +++ b/src/qs8-vlrelu/avx2.c.in @@ -13,7 +13,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u4.c b/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u4.c index 2efa8844130..0c92404a11a 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u4.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u4.c @@ -14,7 +14,7 @@ #include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/math.h" #include "xnnpack/unaligned.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__armsimd32_u4( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u8.c b/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u8.c index edb86301ce5..6149d36f0c7 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u8.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-u8.c @@ -14,7 +14,7 @@ #include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/math.h" #include "xnnpack/unaligned.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__armsimd32_u8( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u16.c index 4d48f8973ad..b414b773dbc 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx_u16( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u32.c index 6b6a000d02f..029d5369874 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx_u32( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u8.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u8.c index 3086f3b11e9..24345b6bc9a 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u8.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx_u8( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u16.c index bf1c50718c6..4bdef541773 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx2_u16( diff --git 
a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c index 103a671075e..a691a97eb87 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx2_u32( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u64.c b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u64.c index e5e93e65d66..d20d0fc3ec9 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u64.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-u64.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__avx2_u64( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u16.c index fd89575a7e7..8ddd5a73ad9 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__neon_u16( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u32.c index ad5ad976c69..2d001633e9e 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__neon_u32( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u8.c b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u8.c index c053ac1624e..bd2035afcd8 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u8.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-neon-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__neon_u8( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u1v.c b/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u1v.c index 7e9159c6b5f..2abe0424efb 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u1v.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u1v.c @@ -11,7 +11,7 @@ #include -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__rvv_u1v( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u2v.c b/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u2v.c index 1e14268b149..a51e57991f4 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u2v.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u2v.c @@ -11,7 +11,7 @@ #include -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__rvv_u2v( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u1.c b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u1.c index ebf633a0fe0..5c6e2d46695 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u1.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_andxor_u1( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u2.c b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u2.c index d2e392cf879..df20df9c56c 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u2.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_andxor_u2( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u4.c 
b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u4.c index 4344baf8141..d905e5bee90 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u4.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_andxor_u4( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u1.c b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u1.c index 471d652eb7f..5f9c21e99f2 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u1.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_select_u1( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u2.c b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u2.c index 6ae6793d00f..529450bd9b5 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u2.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_select_u2( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c index 2b283b3abd7..04f73cb758d 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__scalar_select_u4( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u16.c index 520bba77505..1ac03c29fee 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u32.c index 8305b457d0b..e25383cc0a0 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u16.c index be708252465..a45744d6230 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__sse41_u16( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u32.c index 5fb36ef81c2..25d8161e4e5 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__sse41_u32( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u8.c b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u8.c index 4aa4856d5bb..a93d4d11e32 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u8.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include 
"xnnpack/vunary.h" void xnn_qs8_vlrelu_ukernel__sse41_u8( diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u16.c b/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u16.c index 735a71bf082..acad3770453 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u16.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u32.c b/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u32.c index 4980cf680fb..6d5741569c4 100644 --- a/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u32.c +++ b/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vlrelu/neon.c.in b/src/qs8-vlrelu/neon.c.in index 9ad887f9972..7f17862634b 100644 --- a/src/qs8-vlrelu/neon.c.in +++ b/src/qs8-vlrelu/neon.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/qs8-vlrelu.h b/src/qs8-vlrelu/qs8-vlrelu.h index c384e6b0d73..c0d5c3e3618 100644 --- a/src/qs8-vlrelu/qs8-vlrelu.h +++ b/src/qs8-vlrelu/qs8-vlrelu.h @@ -17,59 +17,59 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qs8_vlrelu_ukernel__neon_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vlrelu_ukernel__rvv_u1v, 1, true, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vlrelu_ukernel__rvv_u2v, 2, true, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vlrelu_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qs8_vlrelu_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__sse2_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__sse2_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) 
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qs8_vlrelu_ukernel__ssse3_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qs8_vlrelu_ukernel__ssse3_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u64, 64, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__sse2_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__sse2_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qs8_vlrelu_ukernel__ssse3_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qs8_vlrelu_ukernel__ssse3_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qs8_vlrelu_ukernel__sse41_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qs8_vlrelu_ukernel__avx_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, 
xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qs8_vlrelu_ukernel__avx2_u64, 64, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_arm_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_arm_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_arm_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_arm_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u16, 16, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u32, 32, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u16, 16, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) 
+XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u32, 32, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_ARM -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qs8_vlrelu_ukernel__armsimd32_u4, 4, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qs8_vlrelu_ukernel__armsimd32_u8, 8, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qs8_vlrelu_ukernel__armsimd32_u4, 4, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qs8_vlrelu_ukernel__armsimd32_u8, 8, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #endif // XNN_ARCH_ARM -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u1, 1, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u2, 2, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u4, 4, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u1, 1, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u2, 2, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u4, 4, false, int8_t, union xnn_qs8_lrelu_minmax_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u1, 1, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u2, 2, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_select_u4, 4, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u1, 1, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u2, 2, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qs8_vlrelu_ukernel__scalar_andxor_u4, 4, false, int8_t, struct xnn_qs8_lrelu_params, xnn_init_qs8_lrelu_scalar_params) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/qs8-vlrelu/rvv.c.in b/src/qs8-vlrelu/rvv.c.in index fee935ed324..2d0bf1470a9 100755 --- a/src/qs8-vlrelu/rvv.c.in +++ b/src/qs8-vlrelu/rvv.c.in @@ -9,7 +9,7 @@ $assert DATATYPE in ["QS8", "QU8"] #include -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/scalar-andxor.c.in b/src/qs8-vlrelu/scalar-andxor.c.in index 3518adb4e23..6ce4a95e4a4 100644 --- a/src/qs8-vlrelu/scalar-andxor.c.in +++ b/src/qs8-vlrelu/scalar-andxor.c.in @@ -9,7 +9,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": 
"uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/scalar-select.c.in b/src/qs8-vlrelu/scalar-select.c.in index efe0dcd684f..3786a811f50 100644 --- a/src/qs8-vlrelu/scalar-select.c.in +++ b/src/qs8-vlrelu/scalar-select.c.in @@ -9,7 +9,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/sse2.c.in b/src/qs8-vlrelu/sse2.c.in index e2460673953..f825a6e0ac3 100644 --- a/src/qs8-vlrelu/sse2.c.in +++ b/src/qs8-vlrelu/sse2.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qs8-vlrelu/sse4.c.in b/src/qs8-vlrelu/sse4.c.in index 1b8a415bca2..5cec07fd300 100644 --- a/src/qs8-vlrelu/sse4.c.in +++ b/src/qs8-vlrelu/sse4.c.in @@ -13,7 +13,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] diff --git a/src/qs8-vlrelu/ssse3.c.in b/src/qs8-vlrelu/ssse3.c.in index b71c09d05b3..5788f59c8e5 100644 --- a/src/qs8-vlrelu/ssse3.c.in +++ b/src/qs8-vlrelu/ssse3.c.in @@ -12,7 +12,7 @@ $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c index 86d44a0222c..e75dcc32f21 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-imagic.c index 7e948c97a95..a5b70578988 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c index 025ae55081f..97bf0989f8e 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_lrintf( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c index 9c11113fa64..7eb023b47a4 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__wasm_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c index bb929b1573e..c3f5251b63c 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-imagic.c index bbaf53878aa..902ed641c6c 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_imagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c index 445d754bc5a..15985a3ba14 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c index 5c67ddc0476..c19ba42c44f 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c index bd19f753b58..b0f3c4a748a 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_fmagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-imagic.c index 14d66ab5298..f46751a6172 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_imagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c index aaa3b70875f..8bbdd03a3ea 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__scalar_lrintf( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c index c93594a988e..7ae102c01ce 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_25p4c__wasm_fmagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c index 70de9952f2c..3884c87a98d 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c index 0e04aba98b0..f73c56c7f4f 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_imagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c index 89a648ae835..a0d0b8ac541 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_lrintf( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c index d39d8f0d504..c64e8a6f206 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__wasm_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c index fcaf6b1ce78..e00f88a6900 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_rndnu_ukernel_9p1c__scalar( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c index 7336f5547fa..21fa3e4bf87 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c index 5a660c13bf6..fd70457629b 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c index 0baaf4ed0e2..bdc10caea70 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c index 40f39ee703e..f36f84fad11 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-rndnu-scalar.c b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-rndnu-scalar.c index cadf0b45273..69f7b747400 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-rndnu-scalar.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" #include "xnnpack/unaligned.h" void xnn_qu8_dwconv_minmax_rndnu_ukernel_9p2c__scalar( diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c index 9a586d062c6..a9a229e4c09 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_fmagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-imagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-imagic.c index 89d1eb17be7..076dd656fde 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-imagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-imagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. 
#include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_imagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c index 643e55a7d55..37a0b640173 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c @@ -9,9 +9,13 @@ #include #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__scalar_lrintf( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c index cde61f2cafc..710c51f5c74 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_fp32_ukernel_9p4c__wasm_fmagic( size_t channels, diff --git a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-rndnu-scalar.c b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-rndnu-scalar.c index 87ca4d6b3fd..0a2edcc6e52 100644 --- a/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-rndnu-scalar.c +++ b/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-rndnu-scalar.c @@ -8,9 +8,13 @@ // LICENSE file in the root directory of this source tree. #include +#include +#include +#include "xnnpack/common.h" #include "xnnpack/dwconv.h" #include "xnnpack/math.h" +#include "xnnpack/microparams.h" void xnn_qu8_dwconv_minmax_rndnu_ukernel_9p4c__scalar( size_t channels, diff --git a/src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h b/src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h index 667780c8ef7..25b35d8b4a4 100644 --- a/src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h +++ b/src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h @@ -61,13 +61,16 @@ XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9 XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c4s4r__avx_mul32, 8, 8, 9, 16, 4, 4, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c8s8r__avx2_mul32, 8, 8, 9, 16, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c8s8r__avx2_mul32, 8, 8, 9, 32, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l16c16s1r__avx512skx_mul32, 5, 5, 5, 16, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l32c16s1r__avx512skx_mul32, 5, 5, 5, 32, 16, 1, uint8_t, void, int32_t, union 
xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l16c16s1r__avx512skx_mul32, 6, 6, 7, 16, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_6f6m7l32c16s1r__avx512skx_mul32, 6, 6, 7, 32, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l16c16s1r__avx512skx_mul32, 8, 8, 9, 16, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_MULTIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_8f8m9l32c16s1r__avx512skx_mul32, 8, 8, 9, 32, 16, 1, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_MULTIPASS(0, xnn_qu8_dwconv_minmax_fp32_ukernel_5f5m5l8c8s8r__wasmsimd_mul16, 5, 5, 5, 8, 8, 8, uint8_t, void, int32_t, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) diff --git a/src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h b/src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h index 0a0cd7e224c..8db3b279bf8 100644 --- a/src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h +++ b/src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h @@ -48,11 +48,14 @@ XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__ XNN_DWCONV_UNIPASS(xnn_arch_x86_avx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul32, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx2, xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx2_mul32, 32, false, 32, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx512skx_mul32, 16, false, 16, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32, 32, false, 32, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx512skx_mul32, 16, false, 16, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) XNN_DWCONV_UNIPASS(xnn_arch_x86_avx512skx, xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32, 32, false, 32, 25, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_DWCONV_UNIPASS(0, 
xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16, 8, false, 8, 9, uint8_t, void, union xnn_qu8_conv_minmax_params, xnn_init_qu8_conv_minmax_fp32_scalar_params) diff --git a/src/qu8-f32-vcvt/qu8-f32-vcvt.h b/src/qu8-f32-vcvt/qu8-f32-vcvt.h index 3bcbc008f05..628eed7ec1e 100644 --- a/src/qu8-f32-vcvt/qu8-f32-vcvt.h +++ b/src/qu8-f32-vcvt/qu8-f32-vcvt.h @@ -40,11 +40,14 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_f32_vcvt_ukernel__avx2_u8 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_f32_vcvt_ukernel__avx2_u16, 16, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_f32_vcvt_ukernel__avx2_u24, 24, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_f32_vcvt_ukernel__avx2_u32, 32, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_f32_vcvt_ukernel__avx512skx_u16, 16, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_f32_vcvt_ukernel__avx512skx_u32, 32, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_f32_vcvt_ukernel__avx512skx_u48, 48, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_f32_vcvt_ukernel__avx512skx_u64, 64, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_f32_vcvt_ukernel__wasmsimd_u8, 8, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) diff --git a/src/qu8-requantization/qu8-requantization-rndna-neon.c b/src/qu8-requantization/qu8-requantization-rndna-neon.c deleted file mode 100644 index 1178b1ff5cb..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-neon.c +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__neon( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000); - const int32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - -#if defined(__aarch64__) - const int32x4_t vmultiplier = vdupq_n_s32(multiplier); -#else - const int32x2_t vmultiplier = vdup_n_s32(multiplier); -#endif - const int16x8_t vzero_point = vdupq_n_s16((int16_t)(uint16_t) zero_point); - const int64x2_t vshift = vdupq_n_s64(-shift); - const uint8x16_t vqmin = vdupq_n_u8(qmin); - const uint8x16_t vqmax = vdupq_n_u8(qmax); - for (; n != 0; n -= 16) { - const int32x4_t x = vld1q_s32(input); - const int32x4_t y = vld1q_s32(input + 4); - const int32x4_t z = vld1q_s32(input + 8); - const int32x4_t w = vld1q_s32(input + 12); - input += 16; - - const uint32x4_t x_neg_mask = vcltq_s32(x, vmovq_n_s32(0)); - const uint32x4_t y_neg_mask = vcltq_s32(y, vmovq_n_s32(0)); - const uint32x4_t z_neg_mask = vcltq_s32(z, vmovq_n_s32(0)); - const uint32x4_t w_neg_mask = vcltq_s32(w, vmovq_n_s32(0)); - -#if defined(__aarch64__) - const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vget_low_s32(vmultiplier)); - const int64x2_t x23_product = vmull_high_s32(x, vmultiplier); - const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vget_low_s32(vmultiplier)); - const int64x2_t y23_product = vmull_high_s32(y, vmultiplier); - const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vget_low_s32(vmultiplier)); - const int64x2_t z23_product = vmull_high_s32(z, vmultiplier); - const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vget_low_s32(vmultiplier)); - const int64x2_t w23_product = vmull_high_s32(w, vmultiplier); -#else - const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vmultiplier); - const int64x2_t x23_product = vmull_s32(vget_high_s32(x), vmultiplier); - const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vmultiplier); - const int64x2_t y23_product = vmull_s32(vget_high_s32(y), vmultiplier); - const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vmultiplier); - const int64x2_t z23_product = vmull_s32(vget_high_s32(z), vmultiplier); - const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vmultiplier); - const int64x2_t w23_product = vmull_s32(vget_high_s32(w), vmultiplier); -#endif - -#if defined(__aarch64__) - const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask))); - const int64x2_t x23_adjusted_product = vaddw_high_s32(x23_product, vreinterpretq_s32_u32(x_neg_mask)); - const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask))); - const int64x2_t y23_adjusted_product = vaddw_high_s32(y23_product, vreinterpretq_s32_u32(y_neg_mask)); - const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, vreinterpret_s32_u32(vget_low_u32(z_neg_mask))); - const int64x2_t z23_adjusted_product = vaddw_high_s32(z23_product, vreinterpretq_s32_u32(z_neg_mask)); - const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask))); - const int64x2_t w23_adjusted_product = vaddw_high_s32(w23_product, 
vreinterpretq_s32_u32(w_neg_mask)); -#else - const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask))); - const int64x2_t x23_adjusted_product = vaddw_s32(x23_product, vreinterpret_s32_u32(vget_high_u32(x_neg_mask))); - const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask))); - const int64x2_t y23_adjusted_product = vaddw_s32(y23_product, vreinterpret_s32_u32(vget_high_u32(y_neg_mask))); - const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, vreinterpret_s32_u32(vget_low_u32(z_neg_mask))); - const int64x2_t z23_adjusted_product = vaddw_s32(z23_product, vreinterpret_s32_u32(vget_high_u32(z_neg_mask))); - const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask))); - const int64x2_t w23_adjusted_product = vaddw_s32(w23_product, vreinterpret_s32_u32(vget_high_u32(w_neg_mask))); -#endif - - const int64x2_t x01_scaled = vrshlq_s64(x01_adjusted_product, vshift); - const int64x2_t x23_scaled = vrshlq_s64(x23_adjusted_product, vshift); - const int64x2_t y01_scaled = vrshlq_s64(y01_adjusted_product, vshift); - const int64x2_t y23_scaled = vrshlq_s64(y23_adjusted_product, vshift); - const int64x2_t z01_scaled = vrshlq_s64(z01_adjusted_product, vshift); - const int64x2_t z23_scaled = vrshlq_s64(z23_adjusted_product, vshift); - const int64x2_t w01_scaled = vrshlq_s64(w01_adjusted_product, vshift); - const int64x2_t w23_scaled = vrshlq_s64(w23_adjusted_product, vshift); - -#ifdef __aarch64__ - const int32x4_t x_scaled = vuzp1q_s32(vreinterpretq_s32_s64(x01_scaled), vreinterpretq_s32_s64(x23_scaled)); - const int32x4_t y_scaled = vuzp1q_s32(vreinterpretq_s32_s64(y01_scaled), vreinterpretq_s32_s64(y23_scaled)); - const int32x4_t z_scaled = vuzp1q_s32(vreinterpretq_s32_s64(z01_scaled), vreinterpretq_s32_s64(z23_scaled)); - const int32x4_t w_scaled = vuzp1q_s32(vreinterpretq_s32_s64(w01_scaled), vreinterpretq_s32_s64(w23_scaled)); - - const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_scaled), y_scaled), vzero_point); - const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_scaled), w_scaled), vzero_point); - const uint8x16_t xyzw_packed = vqmovun_high_s16(vqmovun_s16(xy_packed), zw_packed); -#else - const int32x4_t x_scaled = vcombine_s32(vmovn_s64(x01_scaled), vmovn_s64(x23_scaled)); - const int32x4_t y_scaled = vcombine_s32(vmovn_s64(y01_scaled), vmovn_s64(y23_scaled)); - const int32x4_t z_scaled = vcombine_s32(vmovn_s64(z01_scaled), vmovn_s64(z23_scaled)); - const int32x4_t w_scaled = vcombine_s32(vmovn_s64(w01_scaled), vmovn_s64(w23_scaled)); - - const int16x8_t xy_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(x_scaled), vqmovn_s32(y_scaled)), vzero_point); - const int16x8_t zw_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(z_scaled), vqmovn_s32(w_scaled)), vzero_point); - const uint8x16_t xyzw_packed = vcombine_u8(vqmovun_s16(xy_packed), vqmovun_s16(zw_packed)); -#endif - - const uint8x16_t xyzw_clamped = vmaxq_u8(vminq_u8(xyzw_packed, vqmax), vqmin); - - // AArch32 version: - // 4x VCLT.S32 Qd, Qm, #0 - // 8x VMULL.S32 Qd, Dm, Dn - // 8x VADDW.S32 Qd, Qm, Dn - // 8x VRSHL.S32 Qd, Qm, Qn - // 8x VMOVN.S64 Dd, Qm - // 4x VQMOVN.S32 Dd, Qm - // 2x VQADD.S16 Qd, Qm, Qn - // 2x VQMOVUN.S16 Dd, Qm - // 1x VMAX.U8 Qd, Qm, Qn - // 1x VMIN.U8 Qd, Qm, Qn - // --------------------- - // 46 instructions total - // - // AArch64 version: - // 4x CMLT Vd.4S, Vn.4S, #0 - // 4x SMULL Vd.2D, Vn.2S, Vm.2S - // 4x SMULL2 Vd.2D, 
Vn.4S, Vm.4S - // 4x SADDW Vd.2D, Vn.2D, Vm.2S - // 4x SADDW2 Vd.2D, Vn.2D, Vm.4S - // 8x SRSHL Vd.2D, Vn.2D, Vm.2D - // 4x UZP1 Vd.4S, Vn.4S, Vm.4S - // 2x SQXTN Vd.4H, Vn.4S - // 2x SQXTN2 Vd.8H, Vn.4S - // 2x SQADD Vd.8H, Vn.8H, Vm.8H - // 1x SQXTUN Vd.8B, Vn.8H - // 1x SQXTUN2 Vd.16B, Vn.8H - // 1x UMIN Vd.16B, Vn.16B, Vm.16B - // 1x UMAX Vd.16B, Vn.16B, Vm.16B - // --------------------- - // 42 instructions total - - vst1q_u8(output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c b/src/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c deleted file mode 100644 index 2a744620fde..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__scalar_signed64( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - - const int64_t rounding = INT64_C(1) << (shift - 1); - const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point; - const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute full 64-bit product of signed 32-bit factors. - // - // Note: multiplier can be treated as either signed or unsigned. - const int64_t x_product = (int64_t) x * (int64_t) multiplier; - const int64_t y_product = (int64_t) y * (int64_t) multiplier; - const int64_t z_product = (int64_t) z * (int64_t) multiplier; - const int64_t w_product = (int64_t) w * (int64_t) multiplier; - - // Adjust product before subsequent shift with rounding up to simulate shift with rounding away from zero. - const int64_t x_adjusted_product = x_product - (int64_t) (x < 0); - const int64_t y_adjusted_product = y_product - (int64_t) (y < 0); - const int64_t z_adjusted_product = z_product - (int64_t) (z < 0); - const int64_t w_adjusted_product = w_product - (int64_t) (w < 0); - - // Arithmetically shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up. - // - // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit - // "right shift with rounding" instruction each line below can be represented by just one such instruction - // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD). 
- const int32_t x_scaled = (int32_t) math_asr_s64(x_adjusted_product + rounding, shift); - const int32_t y_scaled = (int32_t) math_asr_s64(y_adjusted_product + rounding, shift); - const int32_t z_scaled = (int32_t) math_asr_s64(z_adjusted_product + rounding, shift); - const int32_t w_scaled = (int32_t) math_asr_s64(w_adjusted_product + rounding, shift); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (uint8_t) x_biased; - output[1] = (uint8_t) y_biased; - output[2] = (uint8_t) z_biased; - output[3] = (uint8_t) w_biased; - output += 4; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c b/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c deleted file mode 100644 index 3c803e3b45c..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__scalar_unsigned32( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000); - const uint32_t shift = 127 + 31 - (scale_bits >> 23); - assert(shift >= 32); - assert(shift < 64); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const uint32_t rounding_hi = (uint32_t) (rounding >> 32); - const uint32_t rounding_lo = (uint32_t) rounding; - const uint32_t shift_minus_32 = shift - 32; - const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point; - const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute absolute value of input as unsigned 32-bit int. - // All further computations will work with unsigned values to avoid undefined behaviour on signed operations. - const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x; - const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y; - const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z; - const uint32_t w_abs = (w >= 0) ? 
(uint32_t) w : -(uint32_t) w; - - // Compute full 64-bit product of 32-bit factors. - const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier; - const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier; - const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier; - const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier; - - // Shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero). - // - // Generally, this operation requires both 64-bit addition and 64-bit shift, but we use two tricks to replace - // 64-bit operations with 32-bit operations. - // - // To avoid full 64-bit addition we make use of three facts: - // - 64-bit rounding value added before the shift is a power of 2, and thus has only one bit set. - // - When 0x1.0p-32f <= scale < 0x1.0p-31f, then the non-zero bit in rounding is in the low 32 bits, and - // rounding is exactly 0x80000000 (2**31), because rounding is 2**(scale-1) and scale >= 32. In this case, - // addition of rounding can affect high 32 bits of the product only through overflow, which happens if - // low 32-bit part of the product equals or exceeds 0x80000000. We can reformulate the latter condition - // as low 32-bit part of the product has the bit 31 set, and then overflow happens if both the low 32-bit part - // of the product and the low 32-bit part of the rounding value have bit 31 set. Since 32-bit numbers with the - // bit 31 set are negative when interpreted as signed integers, we can check the overflow condition as - // (int32_t) (LOW(product) & LOW(rounding)) < 0 - // - When 0x1.0p-31f <= scale < 1.0f, then the non-zero bit is in the high 32 bits of rounding. We just need - // to do 32-bit addition of high 32 bits of rounding and high 32 bits of product. This addition never - // overflows because product <= 0x80000000 * 0xFFFFFF00 < 2**63 and rounding = 2**(scale-1) <= 2**62. - // - // To avoid full 64-bit shift, we leverage the fact that shift >= 32, and do it in two steps: - // - Shift by 32, which can be implemented by extacting the high 32-bit word on 32-bit systems. - // - Shift by (shift - 32), which can be implemented as a 32-bit shift of high word of addition result. - const uint32_t x_carry_lo = (uint32_t) ((int32_t)((uint32_t) x_product & rounding_lo) < 0); - const uint32_t y_carry_lo = (uint32_t) ((int32_t)((uint32_t) y_product & rounding_lo) < 0); - const uint32_t z_carry_lo = (uint32_t) ((int32_t)((uint32_t) z_product & rounding_lo) < 0); - const uint32_t w_carry_lo = (uint32_t) ((int32_t)((uint32_t) w_product & rounding_lo) < 0); - - const uint32_t x_product_hi = (uint32_t) (x_product >> 32); - const uint32_t y_product_hi = (uint32_t) (y_product >> 32); - const uint32_t z_product_hi = (uint32_t) (z_product >> 32); - const uint32_t w_product_hi = (uint32_t) (w_product >> 32); - - const uint32_t x_abs_scaled = (uint32_t) (x_product_hi + rounding_hi + x_carry_lo) >> shift_minus_32; - const uint32_t y_abs_scaled = (uint32_t) (y_product_hi + rounding_hi + y_carry_lo) >> shift_minus_32; - const uint32_t z_abs_scaled = (uint32_t) (z_product_hi + rounding_hi + z_carry_lo) >> shift_minus_32; - const uint32_t w_abs_scaled = (uint32_t) (w_product_hi + rounding_hi + w_carry_lo) >> shift_minus_32; - - // Copy the sign of input to scaled absolute input value. - const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled); - const int32_t y_scaled = (int32_t) (y >= 0 ? 
y_abs_scaled : -y_abs_scaled); - const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled); - const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (uint8_t) x_biased; - output[1] = (uint8_t) y_biased; - output[2] = (uint8_t) z_biased; - output[3] = (uint8_t) w_biased; - output += 4; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c b/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c deleted file mode 100644 index cb6e9fd50c7..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__scalar_unsigned64( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 4 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point; - const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point; - for (; n != 0; n -= 4) { - const int32_t x = input[0]; - const int32_t y = input[1]; - const int32_t z = input[2]; - const int32_t w = input[3]; - input += 4; - - // Compute absolute value of input as unsigned 32-bit int. - // All further computations will work with unsigned values to avoid undefined behaviour on signed operations. - const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x; - const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y; - const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z; - const uint32_t w_abs = (w >= 0) ? (uint32_t) w : -(uint32_t) w; - - // Compute full 64-bit product of 32-bit factors. 
- const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier; - const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier; - const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier; - const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier; - - // Shift the full 64-bit product right with rounding. - // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero). - // - // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit - // "right shift with rounding" instruction each line below can be represented by just one such instruction - // (e.g. VRSHL.U64 on ARM NEON, URSHL in ARM64 Advanced SIMD). - const uint32_t x_abs_scaled = (uint32_t) ((x_product + rounding) >> shift); - const uint32_t y_abs_scaled = (uint32_t) ((y_product + rounding) >> shift); - const uint32_t z_abs_scaled = (uint32_t) ((z_product + rounding) >> shift); - const uint32_t w_abs_scaled = (uint32_t) ((w_product + rounding) >> shift); - - // Copy the sign of input to scaled absolute input value. - // - // On x86 processors with SSSE3 instruction set, this operation nicely maps to PSIGND instruction. - const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled); - const int32_t y_scaled = (int32_t) (y >= 0 ? y_abs_scaled : -y_abs_scaled); - const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled); - const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled); - - // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). - const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); - const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax); - const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax); - const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax); - - // Add zero point to clamped value. - // The result is guaranteed to be in [qmin, qmax] range. - // - // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519] - // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer. - const int32_t x_biased = x_clamped + zero_point; - const int32_t y_biased = y_clamped + zero_point; - const int32_t z_biased = z_clamped + zero_point; - const int32_t w_biased = w_clamped + zero_point; - - output[0] = (uint8_t) x_biased; - output[1] = (uint8_t) y_biased; - output[2] = (uint8_t) z_biased; - output[3] = (uint8_t) w_biased; - output += 4; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-sse2.c b/src/qu8-requantization/qu8-requantization-rndna-sse2.c deleted file mode 100644 index c4bf952d9ee..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-sse2.c +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
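A minimal sketch of the sign-magnitude workaround that the SSE2 kernel relies on: SSE2 lacks PABSD and PSIGND, so the absolute value is formed as (x ^ mask) - mask from a signed compare mask, and the same identity re-applies the recorded sign after the unsigned multiply-and-shift. The helper names below are hypothetical and an SSE2-capable x86 target is assumed.

#include <emmintrin.h>

// |x| per 32-bit lane without PABSD: mask is -1 where x < 0, so (x ^ -1) - (-1) == ~x + 1 == -x
// for negative lanes and x unchanged elsewhere.
static inline __m128i sse2_abs_epi32(__m128i x) {
  const __m128i neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x);
  return _mm_sub_epi32(_mm_xor_si128(x, neg_mask), neg_mask);
}

// Re-apply the sign recorded in neg_mask: identity for non-negative lanes, negation otherwise.
static inline __m128i sse2_copysign_epi32(__m128i magnitude, __m128i neg_mask) {
  return _mm_sub_epi32(_mm_xor_si128(magnitude, neg_mask), neg_mask);
}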
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__sse2( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) (uint16_t) zero_point); - const __m128i vqmin = _mm_set1_epi8((char) qmin); - const __m128i vqmax = _mm_set1_epi8((char) qmax); - const __m128i vshift = _mm_cvtsi32_si128((int) shift); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x); - const __m128i y_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), y); - const __m128i z_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), z); - const __m128i w_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), w); - - const __m128i x_abs0123 = _mm_sub_epi32(_mm_xor_si128(x, x_neg_mask), x_neg_mask); - const __m128i y_abs0123 = _mm_sub_epi32(_mm_xor_si128(y, y_neg_mask), y_neg_mask); - const __m128i z_abs0123 = _mm_sub_epi32(_mm_xor_si128(z, z_neg_mask), z_neg_mask); - const __m128i w_abs0123 = _mm_sub_epi32(_mm_xor_si128(w, w_neg_mask), w_neg_mask); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier); - - const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift); - const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift); - const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift); - const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift); - const __m128i w_abs_scaled13 = 
_mm_srl_epi64(_mm_add_epi64(w_absmul13, vrounding), vshift); - - const __m128i x_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i y_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i z_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i w_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - - const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - - const __m128i x_scaled = _mm_sub_epi32(_mm_xor_si128(x_abs_scaled, x_neg_mask), x_neg_mask); - const __m128i y_scaled = _mm_sub_epi32(_mm_xor_si128(y_abs_scaled, y_neg_mask), y_neg_mask); - const __m128i z_scaled = _mm_sub_epi32(_mm_xor_si128(z_abs_scaled, z_neg_mask), z_neg_mask); - const __m128i w_scaled = _mm_sub_epi32(_mm_xor_si128(w_abs_scaled, w_neg_mask), w_neg_mask); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed); - const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin); - - // 4x PXOR (setzero) - // 8x PSUBD - // 8x PXOR - // 8x PSHUFD - // 8x PMULUDQ - // 8x PSRLQ - // 8x PADDQ - // 4x SHUFPS - // 2x PACKSSDW - // 1x PACKUSWB - // 2x PADDW - // 1x PMAXUB - // 1x PMINUB - // --------------------- - // 63 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-sse41.c b/src/qu8-requantization/qu8-requantization-rndna-sse41.c deleted file mode 100644 index 044725bfe70..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-sse41.c +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__sse41( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000); - const uint32_t shift = 127 + 31 - (scale_bits >> 23); - assert(shift >= 32); - assert(shift < 64); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) (uint16_t) zero_point); - const __m128i vqmin = _mm_set1_epi8((char) qmin); - const __m128i vqmax = _mm_set1_epi8((char) qmax); - const __m128i vshiftlo = _mm_cvtsi32_si128((int) shift); - const __m128i vshifthi = _mm_cvtsi32_si128((int) shift - 32); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_abs0123 = _mm_abs_epi32(x); - const __m128i y_abs0123 = _mm_abs_epi32(y); - const __m128i z_abs0123 = _mm_abs_epi32(z); - const __m128i w_abs0123 = _mm_abs_epi32(w); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier); - - const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshiftlo); - const __m128i x_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(x_absmul13, vrounding), vshifthi); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshiftlo); - const __m128i y_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(y_absmul13, vrounding), vshifthi); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshiftlo); - const __m128i z_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(z_absmul13, vrounding), vshifthi); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshiftlo); - const __m128i w_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(w_absmul13, vrounding), vshifthi); - - const __m128i x_abs_scaled = _mm_blend_epi16(x_abs_scaled02, x_abs_scaled13, 0xCC); - const __m128i y_abs_scaled = _mm_blend_epi16(y_abs_scaled02, y_abs_scaled13, 0xCC); - const __m128i z_abs_scaled = _mm_blend_epi16(z_abs_scaled02, z_abs_scaled13, 0xCC); - const __m128i w_abs_scaled = _mm_blend_epi16(w_abs_scaled02, w_abs_scaled13, 
0xCC); - - const __m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x); - const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y); - const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z); - const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed); - const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin); - - // 4x PABSD - // 4x PSHUFD - // 8x PMULUDQ - // 4x PSRLQ - // 4x PSRLD - // 8x PADDQ - // 4x PBLENDW - // 4x PSIGND - // 2x PACKSSDW - // 1x PACKUSWB - // 2x PADDW - // 1x PMAXUB - // 1x PMINUB - // --------------------- - // 47 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qu8-requantization/qu8-requantization-rndna-ssse3.c b/src/qu8-requantization/qu8-requantization-rndna-ssse3.c deleted file mode 100644 index 564a4155c44..00000000000 --- a/src/qu8-requantization/qu8-requantization-rndna-ssse3.c +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include - -#include "xnnpack/math.h" -#include "xnnpack/requantization-stubs.h" - - -void xnn_qu8_requantize_rndna__ssse3( - size_t n, - const int32_t* input, - float scale, - uint8_t zero_point, - uint8_t qmin, - uint8_t qmax, - uint8_t* output) -{ - assert(n % 16 == 0); - assert(scale < 1.0f); - assert(scale >= 0x1.0p-32f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 24); - assert(shift < 56); - const uint64_t rounding = UINT64_C(1) << (shift - 1); - - const __m128i vmultiplier = _mm_set1_epi32(multiplier); - const __m128i vzero_point = _mm_set1_epi16((short) (uint16_t) zero_point); - const __m128i vqmin = _mm_set1_epi8((char) qmin); - const __m128i vqmax = _mm_set1_epi8((char) qmax); - const __m128i vshift = _mm_cvtsi32_si128((int) shift); - const __m128i vrounding = _mm_set1_epi64x(rounding); - for (; n != 0; n -= 16) { - const __m128i x = _mm_loadu_si128((const __m128i*) input); - const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4)); - const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8)); - const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12)); - input += 16; - - const __m128i x_abs0123 = _mm_abs_epi32(x); - const __m128i y_abs0123 = _mm_abs_epi32(y); - const __m128i z_abs0123 = _mm_abs_epi32(z); - const __m128i w_abs0123 = _mm_abs_epi32(w); - - const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1)); - - const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier); - const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier); - const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier); - const __m128i w_absmul02 = 
_mm_mul_epu32(w_abs0123, vmultiplier); - - const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier); - const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier); - const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier); - const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier); - - const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift); - const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift); - const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift); - const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift); - const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift); - const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift); - const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift); - const __m128i w_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(w_absmul13, vrounding), vshift); - - const __m128i x_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i y_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i z_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - const __m128i w_abs_scaled0213 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0))); - - const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0)); - - const __m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x); - const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y); - const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z); - const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w); - - const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); - const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point); - const __m128i xyzw_packed = _mm_packus_epi16(xy_packed, zw_packed); - const __m128i xyzw_clamped = _mm_max_epu8(_mm_min_epu8(xyzw_packed, vqmax), vqmin); - - // 4x PABSD - // 8x PSHUFD - // 8x PMULUDQ - // 8x PSRLQ - // 8x PADDQ - // 4x SHUFPS - // 4x PSIGND - // 2x PACKSSDW - // 1x PACKUSWB - // 2x PADDW - // 1x PMAXUB - // 1x PMINUB - // --------------------- - // 51 instructions total - - _mm_storeu_si128((__m128i*) output, xyzw_clamped); - output += 16; - } -} diff --git a/src/qu8-vadd/qu8-vadd-minmax.h b/src/qu8-vadd/qu8-vadd-minmax.h index 2e9658f7fe9..2cc23f195c0 100644 --- a/src/qu8-vadd/qu8-vadd-minmax.h +++ b/src/qu8-vadd/qu8-vadd-minmax.h @@ -36,9 +36,12 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vadd_minmax_ukernel__avx_mul32 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, 
xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_u8, 8, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u32, 32, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vadd_minmax_ukernel__wasmsimd_u8, 8, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) diff --git a/src/qu8-vaddc/qu8-vaddc-minmax.h b/src/qu8-vaddc/qu8-vaddc-minmax.h index 08c23dd2a97..1d15a4d9774 100644 --- a/src/qu8-vaddc/qu8-vaddc-minmax.h +++ b/src/qu8-vaddc/qu8-vaddc-minmax.h @@ -36,9 +36,12 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vaddc_minmax_ukernel__avx_mul3 XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u8, 8, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16, 16, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u32, 32, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vaddc_minmax_ukernel__wasmsimd_u8, 8, false, uint8_t, struct xnn_qu8_add_minmax_params, xnn_init_qu8_add_minmax_scalar_params) diff --git a/src/qu8-vhswish/gen/qu8-vhswish-avx-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-avx-u16.c index d7a5347b72e..632ce7bba2d 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-avx-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-avx-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__avx_u16( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-avx-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-avx-u32.c index a08379aebc7..cf2a42d9d9c 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-avx-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-avx-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include 
"xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__avx_u32( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-avx-u8.c b/src/qu8-vhswish/gen/qu8-vhswish-avx-u8.c index ea6d75990cf..f9d8bcebdec 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-avx-u8.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-avx-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__avx_u8( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-neon-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-neon-u16.c index 5b704e6f655..b689a77d76b 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-neon-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-neon-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__neon_u16( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-neon-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-neon-u32.c index 9115b9ac4f3..22d6538fee4 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-neon-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-neon-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__neon_u32( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-neon-u8.c b/src/qu8-vhswish/gen/qu8-vhswish-neon-u8.c index 52f922b5f97..613c460c9c2 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-neon-u8.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-neon-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__neon_u8( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u1.c b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u1.c index 5db6f72ece8..be9de0e3542 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u1.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__scalar_u1( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u2.c b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u2.c index 3fec85a7850..c4fc01757d3 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u2.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__scalar_u2( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u4.c b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u4.c index 5215d81cb6b..a77516d39a6 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-scalar-u4.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-scalar-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__scalar_u4( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-sse2-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-sse2-u16.c index ea3eb20ab72..d80e34b285c 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-sse2-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-sse2-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vhswish/gen/qu8-vhswish-sse2-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-sse2-u32.c index 7f0dfff8124..a797aa7f4d9 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-sse2-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-sse2-u32.c @@ -12,7 +12,7 @@ #include #include 
"xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u16.c index 0f61f80cdb4..ab6c9fd4703 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__sse41_u16( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u32.c index d77c2daffde..77abc2d5c2f 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__sse41_u32( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u8.c b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u8.c index 3a9274eb06f..f10beb22a06 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-sse41-u8.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-sse41-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__sse41_u8( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c index 5a6009627f1..c5ab8780096 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c index 31d676297f5..ac6c3707eef 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-ssse3-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u16.c b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u16.c index ec8a6ad1f7a..87a80df6c00 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u16.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__wasmsimd_u16( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u32.c b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u32.c index ae26ff0d007..14e701ba098 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u32.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__wasmsimd_u32( diff --git a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u8.c b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u8.c index 82c6e0140b1..b582ef3a129 100644 --- a/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u8.c +++ b/src/qu8-vhswish/gen/qu8-vhswish-wasmsimd-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/vunary.h" void xnn_qu8_vhswish_ukernel__wasmsimd_u8( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u4.c b/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u4.c index 2c90579f979..749c1e52685 100644 --- 
a/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u4.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u4.c @@ -14,7 +14,7 @@ #include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/math.h" #include "xnnpack/unaligned.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__armsimd32_u4( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u8.c b/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u8.c index d217b4cd995..f3307de68f7 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u8.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-u8.c @@ -14,7 +14,7 @@ #include "xnnpack/intrinsics-polyfill.h" #include "xnnpack/math.h" #include "xnnpack/unaligned.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__armsimd32_u8( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u16.c index e74c9e5efc0..368dd44b9c1 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx_u16( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u32.c index 95992be39d9..e5066d5ac84 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx_u32( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u8.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u8.c index fc7c4ec3841..7245be8b177 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u8.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx_u8( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u16.c index 2a3e00ef751..de0700bdd11 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx2_u16( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c index 4495449c4fc..3a1f2aa6f34 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx2_u32( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c index c462e8f27b2..87e650b173d 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-u64.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__avx2_u64( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u16.c index a7a7ab3c052..ee89ae561b7 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" 
+#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__neon_u16( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u32.c index 4c7d391689d..74085803790 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__neon_u32( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u8.c b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u8.c index aedd5a1c8df..a5f21080e73 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u8.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-neon-u8.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__neon_u8( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c b/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c index 8b8ede5ebc8..d37375fbc44 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u1v.c @@ -11,7 +11,7 @@ #include -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__rvv_u1v( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c b/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c index 188e17eb953..4222ad94212 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c @@ -11,7 +11,7 @@ #include -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__rvv_u2v( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u1.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u1.c index 0c977ced365..bcc7e1784a1 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u1.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_andxor_u1( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u2.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u2.c index 9c84e95cd25..6ee964d7599 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u2.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_andxor_u2( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u4.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u4.c index b2c3dac2396..a8dcc85dfe4 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u4.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_andxor_u4( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u1.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u1.c index 07bda31539f..ee2f92a2b32 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u1.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u1.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_select_u1( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u2.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u2.c index b5b8175e641..b384959fa1a 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u2.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u2.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include 
"xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_select_u2( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c index 5a7060ca906..2271daee282 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-u4.c @@ -10,7 +10,7 @@ #include #include "xnnpack/math.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__scalar_select_u4( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u16.c index cda0c4f5583..495c1403e8f 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u32.c index 650a3e1450c..82c69bc34e2 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u16.c index 16192fa2f24..b0a0d4b3f41 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u16.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__sse41_u16( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u32.c index 4e75c18f848..079f62071be 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u32.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__sse41_u32( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u8.c b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u8.c index ffb29eeb65a..60461abe767 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u8.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-u8.c @@ -13,7 +13,7 @@ #include "xnnpack/common.h" #include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" void xnn_qu8_vlrelu_ukernel__sse41_u8( diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u16.c b/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u16.c index 55fa1989df8..e963944edac 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u16.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u16.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u32.c b/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u32.c index 9dc7daa7070..472713d9c3d 100644 --- a/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u32.c +++ b/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-u32.c @@ -12,7 +12,7 @@ #include #include "xnnpack/common.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "xnnpack/unaligned.h" diff --git a/src/qu8-vlrelu/qu8-vlrelu.h b/src/qu8-vlrelu/qu8-vlrelu.h index 5a404ada04f..ded476d9daf 100644 --- a/src/qu8-vlrelu/qu8-vlrelu.h +++ b/src/qu8-vlrelu/qu8-vlrelu.h @@ -17,59 +17,59 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u8, 8, false, 
uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_qu8_vlrelu_ukernel__neon_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vlrelu_ukernel__rvv_u1v, 1, true, int8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vlrelu_ukernel__rvv_u2v, 2, true, int8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vlrelu_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_qu8_vlrelu_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__sse2_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__sse2_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qu8_vlrelu_ukernel__ssse3_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qu8_vlrelu_ukernel__ssse3_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u8, 8, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u8, 8, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) 
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u64, 64, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__sse2_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__sse2_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qu8_vlrelu_ukernel__ssse3_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_ssse3, xnn_qu8_vlrelu_ukernel__ssse3_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_qu8_vlrelu_ukernel__sse41_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_qu8_vlrelu_ukernel__avx_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_qu8_vlrelu_ukernel__avx2_u64, 64, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_arm_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_arm_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u8, 8, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_arm_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_arm_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, 
xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u8, 8, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u16, 16, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u32, 32, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u16, 16, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u32, 32, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_ARM -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qu8_vlrelu_ukernel__armsimd32_u4, 4, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qu8_vlrelu_ukernel__armsimd32_u8, 8, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qu8_vlrelu_ukernel__armsimd32_u4, 4, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_v6, xnn_qu8_vlrelu_ukernel__armsimd32_u8, 8, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #endif // XNN_ARCH_ARM -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u1, 1, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u2, 2, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u4, 4, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u1, 1, false, uint8_t, union xnn_qu8_lrelu_minmax_params, 
xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u2, 2, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u4, 4, false, uint8_t, union xnn_qu8_lrelu_minmax_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u1, 1, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u2, 2, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_select_u4, 4, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u1, 1, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u2, 2, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_qu8_vlrelu_ukernel__scalar_andxor_u4, 4, false, uint8_t, struct xnn_qu8_lrelu_params, xnn_init_qu8_lrelu_scalar_params) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/s32-f32-vcvt/s32-f32-vcvt.h b/src/s32-f32-vcvt/s32-f32-vcvt.h index 8d7c1d11a3e..b73438208ca 100644 --- a/src/s32-f32-vcvt/s32-f32-vcvt.h +++ b/src/s32-f32-vcvt/s32-f32-vcvt.h @@ -28,6 +28,9 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_f32_vcvt_ukernel__avx2_u8 XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_f32_vcvt_ukernel__avx2_u16, 16, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_f32_vcvt_ukernel__avx2_u24, 24, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_f32_vcvt_ukernel__avx2_u32, 32, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_f32_vcvt_ukernel__avx512f_u16, 16, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_f32_vcvt_ukernel__avx512f_u32, 32, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_f32_vcvt_ukernel__avx512f_u48, 48, false, int32_t, float, struct xnn_s32_f32_cvt_params, xnn_init_s32_f32_cvt_scalar_params) diff --git a/src/s32-vmul/s32-vmul.h b/src/s32-vmul/s32-vmul.h index 6ccbf827ec4..be5f1618418 100644 --- a/src/s32-vmul/s32-vmul.h +++ b/src/s32-vmul/s32-vmul.h @@ -29,11 +29,14 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmul_ukernel__avx2_u8, 8, fal XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmul_ukernel__avx2_u16, 16, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmul_ukernel__avx2_u24, 24, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmul_ukernel__avx2_u32, 32, false, int32_t, struct xnn_s32_default_params, 
((xnn_init_s32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmul_ukernel__avx512f_u16, 16, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmul_ukernel__avx512f_u32, 32, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmul_ukernel__avx512f_u48, 48, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmul_ukernel__avx512f_u64, 64, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_s32_vmul_ukernel__wasmsimd_u4, 4, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) diff --git a/src/s32-vmul/s32-vmulc.h b/src/s32-vmul/s32-vmulc.h index b478f7323e5..13e6f76ad99 100644 --- a/src/s32-vmul/s32-vmulc.h +++ b/src/s32-vmul/s32-vmulc.h @@ -29,6 +29,9 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmulc_ukernel__avx2_u8, 8, fa XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmulc_ukernel__avx2_u16, 16, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmulc_ukernel__avx2_u24, 24, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s32_vmulc_ukernel__avx2_u32, 32, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmulc_ukernel__avx512f_u16, 16, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmulc_ukernel__avx512f_u32, 32, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_s32_vmulc_ukernel__avx512f_u48, 48, false, int32_t, struct xnn_s32_default_params, ((xnn_init_s32_default_params_fn) NULL)) diff --git a/src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c b/src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c new file mode 100644 index 00000000000..ce574eee512 --- /dev/null +++ b/src/s8-vclamp/gen/s8-vclamp-rvv-u1v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
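+//
+// int8 clamp micro-kernel using the RISC-V Vector extension with LMUL=1:
+// each __riscv_vsetvl_e8m1() call picks the largest vector length that fits
+// the remaining batch, so a single strip-mined loop covers both the main
+// body and the tail without a separate remainder path.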
+ + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__rvv_u1v( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int8_t vmin = params->scalar.min; + const int8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m1(batch); + vint8m1_t vacc = __riscv_vle8_v_i8m1(input, n); + vacc = __riscv_vmax_vx_i8m1(vacc, vmin, n); + vacc = __riscv_vmin_vx_i8m1(vacc, vmax, n); + __riscv_vse8_v_i8m1(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c b/src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c new file mode 100644 index 00000000000..7b3d979c852 --- /dev/null +++ b/src/s8-vclamp/gen/s8-vclamp-rvv-u2v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__rvv_u2v( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int8_t vmin = params->scalar.min; + const int8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m2(batch); + vint8m2_t vacc = __riscv_vle8_v_i8m2(input, n); + vacc = __riscv_vmax_vx_i8m2(vacc, vmin, n); + vacc = __riscv_vmin_vx_i8m2(vacc, vmax, n); + __riscv_vse8_v_i8m2(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c b/src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c new file mode 100644 index 00000000000..7da079a2e0d --- /dev/null +++ b/src/s8-vclamp/gen/s8-vclamp-rvv-u4v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__rvv_u4v( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int8_t vmin = params->scalar.min; + const int8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m4(batch); + vint8m4_t vacc = __riscv_vle8_v_i8m4(input, n); + vacc = __riscv_vmax_vx_i8m4(vacc, vmin, n); + vacc = __riscv_vmin_vx_i8m4(vacc, vmax, n); + __riscv_vse8_v_i8m4(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c b/src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c new file mode 100644 index 00000000000..4ba23c5333a --- /dev/null +++ b/src/s8-vclamp/gen/s8-vclamp-rvv-u8v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__rvv_u8v( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const int8_t vmin = params->scalar.min; + const int8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m8(batch); + vint8m8_t vacc = __riscv_vle8_v_i8m8(input, n); + vacc = __riscv_vmax_vx_i8m8(vacc, vmin, n); + vacc = __riscv_vmin_vx_i8m8(vacc, vmax, n); + __riscv_vse8_v_i8m8(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/s8-vclamp/rvv.c.in b/src/s8-vclamp/rvv.c.in new file mode 100644 index 00000000000..512b531de57 --- /dev/null +++ b/src/s8-vclamp/rvv.c.in @@ -0,0 +1,49 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
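+//
+// xngen template shared by the S8 and U8 clamp micro-kernels: the same body
+// is instantiated for LMUL = 1, 2, 4 and 8, and DATATYPE selects signed
+// (__riscv_vmax/vmin) or unsigned (__riscv_vmaxu/vminu) intrinsics.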
+ +$assert LMUL in [1, 2, 4, 8] +$assert DATATYPE in ["S8", "U8"] + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + +$XINT8_T = {"S8": "int8_t", "U8": "uint8_t"}[DATATYPE] + +void xnn_${DATATYPE.lower()}_vclamp_ukernel__rvv_u${LMUL}v( + size_t batch, + const ${XINT8_T}* input, + ${XINT8_T}* output, + const struct xnn_${DATATYPE.lower()}_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(${XINT8_T}) == 0); + assert(input != NULL); + assert(output != NULL); + + const ${XINT8_T} vmin = params->scalar.min; + const ${XINT8_T} vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m${LMUL}(batch); + $if DATATYPE == "S8": + vint8m${LMUL}_t vacc = __riscv_vle8_v_i8m${LMUL}(input, n); + vacc = __riscv_vmax_vx_i8m${LMUL}(vacc, vmin, n); + vacc = __riscv_vmin_vx_i8m${LMUL}(vacc, vmax, n); + __riscv_vse8_v_i8m${LMUL}(output, vacc, n); + $else: + vuint8m${LMUL}_t vacc = __riscv_vle8_v_u8m${LMUL}(input, n); + vacc = __riscv_vmaxu_vx_u8m${LMUL}(vacc, vmin, n); + vacc = __riscv_vminu_vx_u8m${LMUL}(vacc, vmax, n); + __riscv_vse8_v_u8m${LMUL}(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/s8-vclamp/s8-vclamp-avx2-u128.c b/src/s8-vclamp/s8-vclamp-avx2-u128.c new file mode 100644 index 00000000000..263a81d020c --- /dev/null +++ b/src/s8-vclamp/s8-vclamp-avx2-u128.c @@ -0,0 +1,104 @@ +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/unaligned.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__avx2_u128( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const __m256i voutput_min = _mm256_set1_epi8(params->scalar.min); + const __m256i voutput_max = _mm256_set1_epi8(params->scalar.max); + XNN_FORCE_REALIZATION(voutput_min); + XNN_FORCE_REALIZATION(voutput_max); + + for (; batch >= 128; batch -= 128) { + __m256i vacc0 = _mm256_loadu_si256((const __m256i*) input); + __m256i vacc1 = _mm256_loadu_si256((const __m256i*) input + 1); + __m256i vacc2 = _mm256_loadu_si256((const __m256i*) input + 2); + __m256i vacc3 = _mm256_loadu_si256((const __m256i*) input + 3); + input += 128; + + vacc0 = _mm256_max_epi8(vacc0, voutput_min); + vacc1 = _mm256_max_epi8(vacc1, voutput_min); + vacc2 = _mm256_max_epi8(vacc2, voutput_min); + vacc3 = _mm256_max_epi8(vacc3, voutput_min); + + vacc0 = _mm256_min_epi8(vacc0, voutput_max); + vacc1 = _mm256_min_epi8(vacc1, voutput_max); + vacc2 = _mm256_min_epi8(vacc2, voutput_max); + vacc3 = _mm256_min_epi8(vacc3, voutput_max); + + _mm256_storeu_si256((__m256i*) output, vacc0); + _mm256_storeu_si256((__m256i*) output + 1, vacc1); + _mm256_storeu_si256((__m256i*) output + 2, vacc2); + _mm256_storeu_si256((__m256i*) output + 3, vacc3); + output += 128; + } + for (; batch >= 32; batch -= 32) { + __m256i vacc = _mm256_loadu_si256((const __m256i*) input); + input += 32; + + vacc = _mm256_min_epi8(vacc, voutput_max); + vacc = _mm256_max_epi8(vacc, voutput_min); + + _mm256_storeu_si256((__m256i*) output, vacc); + output += 32; + } + if 
(batch >= 16) { + __m128i vacc = _mm_loadu_si128((const __m128i*) input); + input += 16; + + vacc = _mm_min_epi8(vacc, _mm256_castsi256_si128(voutput_max)); + vacc = _mm_max_epi8(vacc, _mm256_castsi256_si128(voutput_min)); + + _mm_storeu_si128((__m128i*) output, vacc); + output += 16; + batch -= 16; + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 && batch <= 15); + __m128i vacc = _mm_loadu_si128((const __m128i*) input); + vacc = _mm_min_epi8(vacc, _mm256_castsi256_si128(voutput_max)); + vacc = _mm_max_epi8(vacc, _mm256_castsi256_si128(voutput_min)); + + if (batch & 8) { + _mm_storel_epi64((__m128i*) output, vacc); + output += 8; + vacc = _mm_unpackhi_epi64(vacc, vacc); + } + if (batch & 4) { + unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vacc)); + output += 4; + vacc = _mm_srli_epi64(vacc, 32); + } + if (batch & 2) { + unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vacc)); + output += 2; + vacc = _mm_srli_epi32(vacc, 16); + } + if (batch & 1) { + *output = (int8_t) _mm_cvtsi128_si32(vacc); + } + } +} diff --git a/src/s8-vclamp/s8-vclamp-avx512skx-u256.c b/src/s8-vclamp/s8-vclamp-avx512skx-u256.c new file mode 100644 index 00000000000..abea2644840 --- /dev/null +++ b/src/s8-vclamp/s8-vclamp-avx512skx-u256.c @@ -0,0 +1,76 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/unaligned.h" +#include "xnnpack/vunary.h" + + +void xnn_s8_vclamp_ukernel__avx512skx_u256( + size_t batch, + const int8_t* input, + int8_t* output, + const struct xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const __m512i voutput_min = _mm512_set1_epi8(params->scalar.min); + const __m512i voutput_max = _mm512_set1_epi8(params->scalar.max); + + for (; batch >= 256; batch -= 256) { + __m512i vacc0 = _mm512_loadu_si512((const __m512i*) input); + __m512i vacc1 = _mm512_loadu_si512((const __m512i*) input + 1); + __m512i vacc2 = _mm512_loadu_si512((const __m512i*) input + 2); + __m512i vacc3 = _mm512_loadu_si512((const __m512i*) input + 3); + input += 256; + + vacc0 = _mm512_max_epi8(vacc0, voutput_min); + vacc1 = _mm512_max_epi8(vacc1, voutput_min); + vacc2 = _mm512_max_epi8(vacc2, voutput_min); + vacc3 = _mm512_max_epi8(vacc3, voutput_min); + + vacc0 = _mm512_min_epi8(vacc0, voutput_max); + vacc1 = _mm512_min_epi8(vacc1, voutput_max); + vacc2 = _mm512_min_epi8(vacc2, voutput_max); + vacc3 = _mm512_min_epi8(vacc3, voutput_max); + + _mm512_storeu_si512((__m512i*) output, vacc0); + _mm512_storeu_si512((__m512i*) output + 1, vacc1); + _mm512_storeu_si512((__m512i*) output + 2, vacc2); + _mm512_storeu_si512((__m512i*) output + 3, vacc3); + output += 256; + } + for (; batch >= 64; batch -= 64) { + __m512i vacc = _mm512_loadu_si512((const __m512i*) input); + input += 64; + + vacc = _mm512_min_epi8(vacc, voutput_max); + vacc = _mm512_max_epi8(vacc, voutput_min); + + _mm512_storeu_si512((__m512i*) output, vacc); + output += 64; + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 && batch <= 63); + const __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT64_C(1) << batch) - UINT64_C(1))); + __m512i vacc = _mm512_maskz_loadu_epi8(vmask, input); + + vacc = _mm512_min_epi8(vacc, voutput_max); + vacc = _mm512_max_epi8(vacc, 
voutput_min); + + _mm512_mask_storeu_epi8(output, vmask, vacc); + } +} diff --git a/src/s8-vclamp/s8-vclamp.h b/src/s8-vclamp/s8-vclamp.h index d4b601b627b..a7aa50b642f 100644 --- a/src/s8-vclamp/s8-vclamp.h +++ b/src/s8-vclamp/s8-vclamp.h @@ -23,8 +23,20 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_s8_vclamp_ukernel__neon_u64, 64, #if XNN_ARCH_X86 || XNN_ARCH_X86_64 XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__sse2_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_s8_vclamp_ukernel__sse41_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s8_vclamp_ukernel__avx2_u128, 128, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_s8_vclamp_ukernel__avx512skx_u256, 256, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u4v, 4, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u8v, 8, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__wasmsimd_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD diff --git a/src/subgraph.c b/src/subgraph.c index 1c6769b3bbf..cc26fbf4d0c 100644 --- a/src/subgraph.c +++ b/src/subgraph.c @@ -515,15 +515,6 @@ uint32_t xnn_check_nchw_compatibility(xnn_subgraph_t subgraph, struct xnn_node* case xnn_node_type_floor: case xnn_node_type_hardswish: case xnn_node_type_leaky_relu: - case xnn_node_type_static_mean: - case xnn_node_type_static_sum: - if (subgraph->values[node->inputs[0]].shape.num_dims == 4) { - return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW; - } else { - xnn_log_info("Node %s inputs shape is incompatible with sparse inference", - xnn_node_type_to_string(node->type)); - return 0; - } case xnn_node_type_negate: case xnn_node_type_sigmoid: case xnn_node_type_square: @@ -536,6 +527,15 @@ uint32_t xnn_check_nchw_compatibility(xnn_subgraph_t subgraph, struct xnn_node* xnn_node_type_to_string(node->type)); return 0; } + case xnn_node_type_static_mean: + case xnn_node_type_static_sum: + if (subgraph->values[node->inputs[0]].shape.num_dims == 4) { + return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC; + } else { + xnn_log_info("Node %s inputs shape is incompatible with sparse inference", + xnn_node_type_to_string(node->type)); + return 0; + } default: return false; } @@ -1433,6 +1433,8 @@ enum xnn_node_type xnn_binary_operator_to_node_type(enum 
xnn_binary_operator typ return xnn_node_type_copysign; case xnn_binary_squared_difference: return xnn_node_type_squared_difference; + case xnn_binary_prelu: + return xnn_node_type_prelu; case xnn_binary_minimum: return xnn_node_type_minimum2; case xnn_binary_maximum: @@ -1457,6 +1459,8 @@ enum xnn_binary_operator xnn_node_type_to_binary_operator(enum xnn_node_type typ return xnn_binary_copysign; case xnn_node_type_squared_difference: return xnn_binary_squared_difference; + case xnn_node_type_prelu: + return xnn_binary_prelu; case xnn_node_type_minimum2: return xnn_binary_minimum; case xnn_node_type_maximum2: diff --git a/src/subgraph/convert.c b/src/subgraph/convert.c index 16ccb7bd015..b722dee5e4b 100644 --- a/src/subgraph/convert.c +++ b/src/subgraph/convert.c @@ -64,7 +64,6 @@ static enum xnn_status create_convert_operator( status = xnn_create_convert_nc_f32_qs8( output_value->quantization.scale, (int8_t) output_value->quantization.zero_point, - INT8_MIN, INT8_MAX, node->flags, &opdata->operator_objects[0]); break; @@ -72,7 +71,6 @@ static enum xnn_status create_convert_operator( status = xnn_create_convert_nc_f32_qu8( output_value->quantization.scale, (uint8_t) output_value->quantization.zero_point, - 0, UINT8_MAX, node->flags, &opdata->operator_objects[0]); break; diff --git a/src/subgraph/deprecated.c b/src/subgraph/deprecated.c index 9c086313ddc..b5165d691cd 100644 --- a/src/subgraph/deprecated.c +++ b/src/subgraph/deprecated.c @@ -77,6 +77,13 @@ enum xnn_status xnn_define_copysign(xnn_subgraph_t subgraph, uint32_t input1_id, input2_id, output_id, flags); } +enum xnn_status xnn_define_prelu(xnn_subgraph_t subgraph, uint32_t input1_id, + uint32_t input2_id, uint32_t output_id, + uint32_t flags) { + return xnn_define_binary(subgraph, xnn_binary_prelu, NULL, + input1_id, input2_id, output_id, flags); +} + enum xnn_status xnn_define_static_mean(xnn_subgraph_t subgraph, size_t num_reduction_axes, const size_t* reduction_axes, diff --git a/src/subgraph/prelu.c b/src/subgraph/prelu.c deleted file mode 100644 index d4688735f5b..00000000000 --- a/src/subgraph/prelu.c +++ /dev/null @@ -1,285 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_prelu_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 2); - const uint32_t input_id = node->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t slope_id = node->inputs[1]; - assert(slope_id != XNN_INVALID_VALUE_ID); - assert(slope_id < num_values); - - const void* slope_data = values[slope_id].fp32_data != NULL ? values[slope_id].fp32_data : values[slope_id].data; - assert(slope_data != NULL); - - assert(node->num_outputs == 1); - - const size_t num_slope_dims = values[slope_id].shape.num_dims; - const size_t slope_channels = num_slope_dims == 0 ? 
1 : values[slope_id].shape.dim[num_slope_dims - 1]; - - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t input_channels = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1]; - - const uint32_t input1_id = node->inputs[0]; - assert(input_id < num_values); - const struct xnn_value *input1_value = &values[input1_id]; - enum xnn_status status; - switch (input1_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_prelu_nc_f16( - input_channels, - slope_channels, - /*input_stride=*/input_channels, - /*output_stride=*/input_channels, - /*negative_slope=*/slope_data, - node->flags | XNN_FLAG_FP32_STATIC_WEIGHTS, - code_cache, - weights_cache, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_prelu_nc_f32( - input_channels, - slope_channels, - /*input_stride=*/input_channels, - /*output_stride=*/input_channels, - /*negative_slope=*/slope_data, - node->flags, - code_cache, - weights_cache, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_prelu_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const struct xnn_value* input_value = values + input_id; - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&input_value->shape); - - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_prelu_nc_f16: - status = xnn_reshape_prelu_nc_f16( - opdata->operator_objects[0], - batch_size, - threadpool); - break; - case xnn_operator_type_prelu_nc_f32: - status = xnn_reshape_prelu_nc_f32( - opdata->operator_objects[0], - batch_size, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id < num_values); - struct xnn_value* output_value = values + output_id; - - memcpy(output_value->shape.dim, input_value->shape.dim, input_value->shape.num_dims * sizeof(size_t)); - const size_t new_size = xnn_tensor_get_size(output_value); - if (new_size > output_value->size || opdata->workspace_size > old_workspace_size) { - output_value->size = new_size; - return xnn_status_reallocation_required; - } - - return xnn_status_success; - -} - -static enum xnn_status setup_prelu_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_prelu_nc_f16: - return xnn_setup_prelu_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_prelu_nc_f32: - return xnn_setup_prelu_nc_f32( - opdata->operator_objects[0], - input_data, - 
output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_prelu( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t slope_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_prelu)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_prelu, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_prelu, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_prelu), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - if (slope_id >= subgraph->num_values) { - xnn_log_error( - "failed to define %s operator with slope ID #%" PRIu32 ": invalid Value ID", - xnn_node_type_to_string(xnn_node_type_prelu), slope_id); - return xnn_status_invalid_parameter; - } - - const struct xnn_value* slope_value = &subgraph->values[slope_id]; - if (slope_value->type != xnn_value_type_dense_tensor) { - xnn_log_error( - "failed to define %s operator with slope ID #%" PRIu32 ": unsupported Value type %d (expected dense tensor)", - xnn_node_type_to_string(xnn_node_type_prelu), slope_id, slope_value->type); - return xnn_status_invalid_parameter; - } - - if (slope_value->data == NULL) { - xnn_log_error( - "failed to define %s operator with slope ID #%" PRIu32 ": non-static Value", - xnn_node_type_to_string(xnn_node_type_prelu), slope_id); - return xnn_status_invalid_parameter; - } - - switch (slope_value->datatype) { - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with slope ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_prelu), slope_id, - xnn_datatype_to_string(slope_value->datatype), slope_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_prelu, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_prelu, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_prelu), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_prelu; - node->compute_type = compute_type; - node->num_inputs = 2; - node->inputs[0] = input_id; - 
node->inputs[1] = slope_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_prelu_operator; - node->reshape = reshape_prelu_operator; - node->setup = setup_prelu_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/rope.c b/src/subgraph/rope.c index 8d424b79981..3f6e3b64121 100644 --- a/src/subgraph/rope.c +++ b/src/subgraph/rope.c @@ -36,13 +36,11 @@ static enum xnn_status create_rope_operator( switch (input_value->datatype) { case xnn_datatype_fp16: status = xnn_create_rope_nthc_f16( - node->params.rope.max_tokens, /*flags=*/0, &opdata->operator_objects[0]); break; case xnn_datatype_fp32: status = xnn_create_rope_nthc_f32( - node->params.rope.max_tokens, /*flags=*/0, &opdata->operator_objects[0]); break; @@ -170,13 +168,6 @@ enum xnn_status xnn_define_rope( return status; } - if (max_tokens == 0) { - xnn_log_error( - "failed to define %s operator with %zu max tokens: maximum number of tokens must be non-zero", - xnn_node_type_to_string(xnn_node_type_rope), max_tokens); - return xnn_status_invalid_parameter; - } - status = xnn_subgraph_check_input_node_id(xnn_node_type_rope, input_id, subgraph->num_values); if (status != xnn_status_success) { return status; @@ -262,7 +253,6 @@ enum xnn_status xnn_define_rope( node->type = xnn_node_type_rope; node->compute_type = compute_type; - node->params.rope.max_tokens = max_tokens; node->num_inputs = 2; node->inputs[0] = input_id; node->inputs[1] = weights_id; diff --git a/src/subgraph/static-reduce.c b/src/subgraph/static-reduce.c index 0de434f5465..230b97297c4 100644 --- a/src/subgraph/static-reduce.c +++ b/src/subgraph/static-reduce.c @@ -326,15 +326,6 @@ enum xnn_status xnn_define_static_reduce( return xnn_status_invalid_parameter; } - for (size_t i = 0; i < num_reduction_axes; i++) { - if (reduction_axes[i] > input_value->shape.num_dims) { - xnn_log_error( - "failed to define %s operator with #%zu reduction axis of %zu: the index is out of bounds for a %zuD input shape", - xnn_node_type_to_string(node_type), i, reduction_axes[i], input_value->shape.num_dims); - return xnn_status_invalid_parameter; - } - } - for (size_t i = 1; i < num_reduction_axes; i++) { if (reduction_axes[i] <= reduction_axes[i - 1]) { xnn_log_error( diff --git a/src/u32-f32-vcvt/u32-f32-vcvt.h b/src/u32-f32-vcvt/u32-f32-vcvt.h index 1d9267fa4a2..2bb599a4377 100644 --- a/src/u32-f32-vcvt/u32-f32-vcvt.h +++ b/src/u32-f32-vcvt/u32-f32-vcvt.h @@ -31,6 +31,9 @@ XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_u32_f32_vcvt_ukernel__avx2_u3 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_u32_f32_vcvt_ukernel__avx512f_u16, 16, false, uint32_t, float, struct xnn_u32_f32_cvt_params, xnn_init_u32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_u32_f32_vcvt_ukernel__avx512f_u32, 32, false, uint32_t, float, struct xnn_u32_f32_cvt_params, xnn_init_u32_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_u32_f32_vcvt_ukernel__avx512f_u48, 48, false, uint32_t, float, struct xnn_u32_f32_cvt_params, xnn_init_u32_f32_cvt_scalar_params) diff --git a/src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c b/src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c new file mode 100644 index 00000000000..2d291a78f40 --- /dev/null +++ b/src/u8-vclamp/gen/u8-vclamp-rvv-u1v.c @@ -0,0 +1,44 @@ +// Auto-generated file. 
Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__rvv_u1v( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const uint8_t vmin = params->scalar.min; + const uint8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m1(batch); + vuint8m1_t vacc = __riscv_vle8_v_u8m1(input, n); + vacc = __riscv_vmaxu_vx_u8m1(vacc, vmin, n); + vacc = __riscv_vminu_vx_u8m1(vacc, vmax, n); + __riscv_vse8_v_u8m1(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c b/src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c new file mode 100644 index 00000000000..9a91840c067 --- /dev/null +++ b/src/u8-vclamp/gen/u8-vclamp-rvv-u2v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__rvv_u2v( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const uint8_t vmin = params->scalar.min; + const uint8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m2(batch); + vuint8m2_t vacc = __riscv_vle8_v_u8m2(input, n); + vacc = __riscv_vmaxu_vx_u8m2(vacc, vmin, n); + vacc = __riscv_vminu_vx_u8m2(vacc, vmax, n); + __riscv_vse8_v_u8m2(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c b/src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c new file mode 100644 index 00000000000..011146ec219 --- /dev/null +++ b/src/u8-vclamp/gen/u8-vclamp-rvv-u4v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__rvv_u4v( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const uint8_t vmin = params->scalar.min; + const uint8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m4(batch); + vuint8m4_t vacc = __riscv_vle8_v_u8m4(input, n); + vacc = __riscv_vmaxu_vx_u8m4(vacc, vmin, n); + vacc = __riscv_vminu_vx_u8m4(vacc, vmax, n); + __riscv_vse8_v_u8m4(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c b/src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c new file mode 100644 index 00000000000..5c782d823ff --- /dev/null +++ b/src/u8-vclamp/gen/u8-vclamp-rvv-u8v.c @@ -0,0 +1,44 @@ +// Auto-generated file. Do not edit! +// Template: src/s8-vclamp/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__rvv_u8v( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const uint8_t vmin = params->scalar.min; + const uint8_t vmax = params->scalar.max; + + do { + const size_t n = __riscv_vsetvl_e8m8(batch); + vuint8m8_t vacc = __riscv_vle8_v_u8m8(input, n); + vacc = __riscv_vmaxu_vx_u8m8(vacc, vmin, n); + vacc = __riscv_vminu_vx_u8m8(vacc, vmax, n); + __riscv_vse8_v_u8m8(output, vacc, n); + input += n; + output += n; + batch -= n; + } while (batch != 0); +} diff --git a/src/u8-vclamp/u8-vclamp-avx2-u128.c b/src/u8-vclamp/u8-vclamp-avx2-u128.c new file mode 100644 index 00000000000..0448807359a --- /dev/null +++ b/src/u8-vclamp/u8-vclamp-avx2-u128.c @@ -0,0 +1,104 @@ +// Copyright 2021 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
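+//
+// uint8 clamp micro-kernel for AVX2: a 128-byte unrolled main loop, then
+// 32-byte and 16-byte loops, and a 1..15-byte tail that loads a full 16-byte
+// vector (hence XNN_OOB_READS) and stores it back in 8/4/2/1-byte pieces.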
+ +#include +#include +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/unaligned.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__avx2_u128( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const __m256i voutput_min = _mm256_set1_epi8(params->scalar.min); + const __m256i voutput_max = _mm256_set1_epi8(params->scalar.max); + XNN_FORCE_REALIZATION(voutput_min); + XNN_FORCE_REALIZATION(voutput_max); + + for (; batch >= 128; batch -= 128) { + __m256i vacc0 = _mm256_loadu_si256((const __m256i*) input); + __m256i vacc1 = _mm256_loadu_si256((const __m256i*) input + 1); + __m256i vacc2 = _mm256_loadu_si256((const __m256i*) input + 2); + __m256i vacc3 = _mm256_loadu_si256((const __m256i*) input + 3); + input += 128; + + vacc0 = _mm256_max_epu8(vacc0, voutput_min); + vacc1 = _mm256_max_epu8(vacc1, voutput_min); + vacc2 = _mm256_max_epu8(vacc2, voutput_min); + vacc3 = _mm256_max_epu8(vacc3, voutput_min); + + vacc0 = _mm256_min_epu8(vacc0, voutput_max); + vacc1 = _mm256_min_epu8(vacc1, voutput_max); + vacc2 = _mm256_min_epu8(vacc2, voutput_max); + vacc3 = _mm256_min_epu8(vacc3, voutput_max); + + _mm256_storeu_si256((__m256i*) output, vacc0); + _mm256_storeu_si256((__m256i*) output + 1, vacc1); + _mm256_storeu_si256((__m256i*) output + 2, vacc2); + _mm256_storeu_si256((__m256i*) output + 3, vacc3); + output += 128; + } + for (; batch >= 32; batch -= 32) { + __m256i vacc = _mm256_loadu_si256((const __m256i*) input); + input += 32; + + vacc = _mm256_max_epu8(vacc, voutput_min); + vacc = _mm256_min_epu8(vacc, voutput_max); + + _mm256_storeu_si256((__m256i*) output, vacc); + output += 32; + } + if (batch >= 16) { + __m128i vacc = _mm_loadu_si128((const __m128i*) input); + input += 16; + + vacc = _mm_max_epu8(vacc, _mm256_castsi256_si128(voutput_min)); + vacc = _mm_min_epu8(vacc, _mm256_castsi256_si128(voutput_max)); + + _mm_storeu_si128((__m128i*) output, vacc); + output += 16; + batch -= 16; + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 && batch <= 15); + __m128i vacc = _mm_loadu_si128((const __m128i*) input); + vacc = _mm_max_epu8(vacc, _mm256_castsi256_si128(voutput_min)); + vacc = _mm_min_epu8(vacc, _mm256_castsi256_si128(voutput_max)); + + if (batch & 8) { + _mm_storel_epi64((__m128i*) output, vacc); + output += 8; + vacc = _mm_unpackhi_epi64(vacc, vacc); + } + if (batch & 4) { + unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vacc)); + output += 4; + vacc = _mm_srli_epi64(vacc, 32); + } + if (batch & 2) { + unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vacc)); + output += 2; + vacc = _mm_srli_epi32(vacc, 16); + } + if (batch & 1) { + *output = (uint8_t) _mm_cvtsi128_si32(vacc); + } + } +} diff --git a/src/u8-vclamp/u8-vclamp-avx512skx-u256.c b/src/u8-vclamp/u8-vclamp-avx512skx-u256.c new file mode 100644 index 00000000000..fcd9c0afeb6 --- /dev/null +++ b/src/u8-vclamp/u8-vclamp-avx512skx-u256.c @@ -0,0 +1,76 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
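+//
+// uint8 clamp micro-kernel for AVX512SKX: 256-byte and 64-byte vector loops,
+// with the 1..63-byte tail handled by a single masked load/store whose lane
+// mask is built from the remaining batch size via _cvtu64_mask64().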
+ +#include +#include +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/microparams.h" +#include "xnnpack/unaligned.h" +#include "xnnpack/vunary.h" + + +void xnn_u8_vclamp_ukernel__avx512skx_u256( + size_t batch, + const uint8_t* input, + uint8_t* output, + const struct xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + const __m512i voutput_min = _mm512_set1_epi8(params->scalar.min); + const __m512i voutput_max = _mm512_set1_epi8(params->scalar.max); + + for (; batch >= 256; batch -= 256) { + __m512i vacc0 = _mm512_loadu_si512((const __m512i*) input); + __m512i vacc1 = _mm512_loadu_si512((const __m512i*) input + 1); + __m512i vacc2 = _mm512_loadu_si512((const __m512i*) input + 2); + __m512i vacc3 = _mm512_loadu_si512((const __m512i*) input + 3); + input += 256; + + vacc0 = _mm512_max_epu8(vacc0, voutput_min); + vacc1 = _mm512_max_epu8(vacc1, voutput_min); + vacc2 = _mm512_max_epu8(vacc2, voutput_min); + vacc3 = _mm512_max_epu8(vacc3, voutput_min); + + vacc0 = _mm512_min_epu8(vacc0, voutput_max); + vacc1 = _mm512_min_epu8(vacc1, voutput_max); + vacc2 = _mm512_min_epu8(vacc2, voutput_max); + vacc3 = _mm512_min_epu8(vacc3, voutput_max); + + _mm512_storeu_si512((__m512i*) output, vacc0); + _mm512_storeu_si512((__m512i*) output + 1, vacc1); + _mm512_storeu_si512((__m512i*) output + 2, vacc2); + _mm512_storeu_si512((__m512i*) output + 3, vacc3); + output += 256; + } + for (; batch >= 64; batch -= 64) { + __m512i vacc = _mm512_loadu_si512((const __m512i*) input); + input += 64; + + vacc = _mm512_min_epu8(vacc, voutput_max); + vacc = _mm512_max_epu8(vacc, voutput_min); + + _mm512_storeu_si512((__m512i*) output, vacc); + output += 64; + } + + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 && batch <= 63); + const __mmask64 vmask = _cvtu64_mask64((uint64_t) ((UINT64_C(1) << batch) - UINT64_C(1))); + __m512i vacc = _mm512_maskz_loadu_epi8(vmask, input); + + vacc = _mm512_min_epu8(vacc, voutput_max); + vacc = _mm512_max_epu8(vacc, voutput_min); + + _mm512_mask_storeu_epi8(output, vmask, vacc); + } +} diff --git a/src/u8-vclamp/u8-vclamp.h b/src/u8-vclamp/u8-vclamp.h index 0aceaebd69a..90b0cf43be3 100644 --- a/src/u8-vclamp/u8-vclamp.h +++ b/src/u8-vclamp/u8-vclamp.h @@ -22,8 +22,20 @@ XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_u8_vclamp_ukernel__neon_u64, 64, #if XNN_ARCH_X86 || XNN_ARCH_X86_64 XNN_UKERNEL_WITH_PARAMS(0, xnn_u8_vclamp_ukernel__sse2_u64, 64, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_u8_vclamp_ukernel__avx2_u128, 128, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_u8_vclamp_ukernel__avx512skx_u256, 256, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, 
xnn_u8_vclamp_ukernel__rvv_u4v, 4, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u8v, 8, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD XNN_UKERNEL_WITH_PARAMS(0, xnn_u8_vclamp_ukernel__wasmsimd_u64, 64, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD diff --git a/src/x32-packw/x32-packw.h b/src/x32-packw/x32-packw.h index 5517782d3d1..2a962d5f5d9 100644 --- a/src/x32-packw/x32-packw.h +++ b/src/x32-packw/x32-packw.h @@ -65,6 +65,9 @@ XNN_UKERNEL(xnn_arch_x86_avx, xnn_x32_packw_gemm_goi_ukernel_x16__avx_u4, 16, 1, XNN_UKERNEL(xnn_arch_x86_avx, xnn_x32_packw_gemm_goi_ukernel_x16__avx_u4_prfm, 16, 1, 1, 4, 1) XNN_UKERNEL(xnn_arch_x86_avx, xnn_x32_packw_gemm_goi_ukernel_x16s4__avx_u4, 16, 1, 4, 4, 1) XNN_UKERNEL(xnn_arch_x86_avx, xnn_x32_packw_gemm_goi_ukernel_x16s4__avx_u4_prfm, 16, 1, 4, 4, 1) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_x32_packw_gemm_goi_ukernel_x16__avx512f_u4, 16, 1, 1, 4, 1) XNN_UKERNEL(xnn_arch_x86_avx512f, xnn_x32_packw_gemm_goi_ukernel_x16__avx512f_u4_prfm, 16, 1, 1, 4, 1) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/x8-packw/kr-avxvnni.c.in b/src/x8-packw/kr-avxvnni.c.in index 7300efd09e8..685a9cd5c7e 100644 --- a/src/x8-packw/kr-avxvnni.c.in +++ b/src/x8-packw/kr-avxvnni.c.in @@ -36,7 +36,7 @@ void xnn_qs8${"_to_qu8" if IZP == 128 else ""}_packw_gemm_goi_ukernel_x${NR}c${K const void* scale, ${WTYPE}* packed_weights, size_t extra_bytes, - const void* params) + const void* params) XNN_OOB_READS { assert(g != 0); assert(nc != 0); @@ -107,69 +107,18 @@ void xnn_qs8${"_to_qu8" if IZP == 128 else ""}_packw_gemm_goi_ukernel_x${NR}c${K // KC remainder of 1..${KR-1} if (k != 0) { assert(k >= 1 && k <= ${KR-1}); - $for N in range(0, NR, 4): - __m256i v${N} = _mm256_setzero_si256(); - - if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N}, 0); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+1}, 2); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+2}, 4); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+3}, 6); - $for N in range(NR): - w${N} += 4; - } - if (k & 2) { - if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N}, 2); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+1}, 6); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+2}, 10); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+3}, 14); - } else { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N}, 0); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+1}, 4); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+2}, 8); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+3}, 12); - } - $for N in range(NR): - w${N} += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 6); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 14); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 22); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 30); - } 
- else if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 4); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 12); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 20); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 28); - } - else if (k & 2) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 2); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 10); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 18); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 26); - } - else { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 0); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 8); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 16); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 24); - } + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); - $for N in range(NR): - w${N} += 1; - } + $for N in range(0, NR, 4): + __m256i v${N} = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N})); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+1})), 0x0C); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+2})), 0x30); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+3})), 0xC0); + v${N} = _mm256_and_si256(v${N}, vmask); + + $for N in range(NR): + w${N} += k; $for N in range(0, NR, 4): vacc${N} = ${_MM256_DPBUSD_EPI32}(vacc${N}, vone, v${N}); @@ -259,69 +208,18 @@ void xnn_qs8${"_to_qu8" if IZP == 128 else ""}_packw_gemm_goi_ukernel_x${NR}c${K // KC remainder of 1..${KR-1} if (k != 0) { assert(k >= 1 && k <= ${KR-1}); - $for N in range(0, NR, 4): - __m256i v${N} = _mm256_setzero_si256(); - - if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N}, 0); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+1}, 2); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+2}, 4); - v${N} = _mm256_insert_epi32(v${N}, *(const int32_t *)w${N+3}, 6); - $for N in range(NR): - w${N} += 4; - } - if (k & 2) { - if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N}, 2); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+1}, 6); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+2}, 10); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+3}, 14); - } else { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N}, 0); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+1}, 4); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+2}, 8); - v${N} = _mm256_insert_epi16(v${N}, *(const int16_t *)w${N+3}, 12); - } - $for N in range(NR): - w${N} += 2; - } - if (k & 1) { - if ((k & 4) && (k & 2)) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 6); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 14); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 22); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 30); - } - else if (k & 4) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 4); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 12); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 20); - v${N} = 
_mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 28); - } - else if (k & 2) { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 2); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 10); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 18); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 26); - } - else { - $for N in range(0, NR, 4): - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N}, 0); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+1}, 8); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+2}, 16); - v${N} = _mm256_insert_epi8(v${N}, *(const int8_t *)w${N+3}, 24); - } + __m256i vmask = _mm256_srli_epi64(_mm256_set1_epi32(-1), (8 - k) * sizeof(int8_t) * 8); - $for N in range(NR): - w${N} += 1; - } + $for N in range(0, NR, 4): + __m256i v${N} = _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N})); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+1})), 0x0C); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+2})), 0x30); + v${N} = _mm256_blend_epi32(v${N}, _mm256_set1_epi64x((int64_t) unaligned_load_u64(w${N+3})), 0xC0); + v${N} = _mm256_and_si256(v${N}, vmask); + + $for N in range(NR): + w${N} += k; $for N in range(0, NR, 4): vacc${N} = ${_MM256_DPBUSD_EPI32}(vacc${N}, vone, v${N}); diff --git a/src/x8-packw/kr-wasmdot.c.in b/src/x8-packw/kr-wasmdot.c.in new file mode 100644 index 00000000000..4b5bd727730 --- /dev/null +++ b/src/x8-packw/kr-wasmdot.c.in @@ -0,0 +1,247 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert NR == 8 +$assert KR == 8 +$assert TYPE in ["int8_t"] +$assert IZP in [0, 128] + +#include + +#include + +#include "xnnpack/packw.h" + + +$ABC = "012345678" +$BTYPE = {"int8_t": "uint32_t"}[TYPE] +$WTYPE = {"int8_t": "int8_t"}[TYPE] +void xnn_qs8${"_to_qu8" if IZP == 128 else ""}_packw_gemm_goi_ukernel_x${NR}c${KR}__wasmrelaxedsimd( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + const ${WTYPE}* weights, + const int32_t* bias, + const void* scale, + ${WTYPE}* packed_weights, + size_t extra_bytes, + const void* params) XNN_OOB_READS +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == ${NR}); + assert(kr == ${KR}); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + const v128_t vone = wasm_i8x16_splat(1); + const v128_t vzero = wasm_i32x4_splat(0); + XNN_FORCE_REALIZATION(vone); + XNN_FORCE_REALIZATION(vzero); + ${TYPE}* out = (${TYPE}*) packed_weights; + const ${BTYPE}* b = (const ${BTYPE}*) bias; + const uint32_t izp = (uint32_t) (params ? 
(((const struct xnn_qs8_packw_params*) params)->input_zero_point + ${IZP}): ${IZP}); + v128_t vzeropoint = wasm_i32x4_splat((int32_t) izp); + + do { + // NC main loop multiple of ${NR} + const ${TYPE}* w0 = (const ${TYPE}*) weights; + size_t n = nc; + for (;n >= ${NR}; n -= ${NR}) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + $for N in range(0, NR, 4): + const v128_t vb${N>>2} = wasm_v128_load(b + ${N}); + wasm_v128_store(out + ${N * 4}, vb${N>>2}); + b += ${NR}; + } else { + $for N in range(0, NR, 4): + wasm_v128_store(out + ${N * 4}, vzero); + } + out += ${NR} * sizeof(${BTYPE}); + + $for N in range(1, NR): + const ${TYPE}* w${N} = w${N-1} + kc; + + $for N in range(0, NR, 2): + v128_t vacc${ABC[N:N+2]} = wasm_i32x4_splat(0); + + // KC main loop multiple of ${NR}x${KR} + size_t k = kc; + for (; k >= ${2 * KR}; k -= ${2 * KR}) { + $for N in range(NR): + v128_t v${N}_01 = wasm_v128_load(w${N}); + + $for N in range(0, NR, 2): + v128_t v${ABC[N:N+2]}_0 = wasm_i64x2_shuffle(v${N}_01, v${N+1}_01, 0, 2); + v128_t v${ABC[N:N+2]}_1 = wasm_i64x2_shuffle(v${N}_01, v${N+1}_01, 1, 3); + + $for N in range(0, NR, 2): + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}_0, vone, vacc${ABC[N:N+2]}); + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}_1, vone, vacc${ABC[N:N+2]}); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${N * KR}, v${ABC[N:N+2]}_0); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${(N + 8) * KR}, v${ABC[N:N+2]}_1); + + $for N in range(NR): + w${N} += ${2 * KR}; + out += ${2*NR*KR}; + } + + for (; k >= ${KR}; k -= ${KR}) { + $for N in range(0, NR, 2): + const v128_t v${N} = wasm_v128_load64_splat(w${N}); + const v128_t v${N+1} = wasm_v128_load64_splat(w${N+1}); + const v128_t v${ABC[N:N+2]} = wasm_i64x2_shuffle(v${N}, v${N+1}, 0, 3); + + $for N in range(0, NR, 2): + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}, vone, vacc${ABC[N:N+2]}); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${N * KR}, v${ABC[N:N+2]}); + + $for N in range(NR): + w${N} += ${KR}; + out += ${NR*KR}; + } + + // KC remainder 1..KR-1 + if (k != 0) { + assert(k >= 1 && k <= ${KR-1}); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (${KR} - k) * sizeof(${WTYPE}) * 8); + + $for N in range(0, NR, 2): + const v128_t v${N} = wasm_v128_load64_splat(w${N}); + const v128_t v${N+1} = wasm_v128_load64_splat(w${N+1}); + v128_t v${ABC[N:N+2]} = wasm_i64x2_shuffle(v${N}, v${N+1}, 0, 3); + v${ABC[N:N+2]} = wasm_v128_and(v${ABC[N:N+2]}, vmask); + + $for N in range(0, NR, 2): + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}, vone, vacc${ABC[N:N+2]}); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${N * KR}, v${ABC[N:N+2]}); + + $for N in range(NR): + w${N} += k; + out += ${NR*KR}; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (${TYPE}*) ((uintptr_t) out + extra_bytes); + w0 = w${NR-1}; + } 
+ + // NC remainder (1..${NR-1}) + if XNN_UNLIKELY(n != 0) { + assert(n >= 1 && n <= ${NR-1}); + + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((${BTYPE}*) out) = *b++; + out += sizeof(${BTYPE}); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((${BTYPE}*) out) = 0; + out += sizeof(${BTYPE}); + } while (--nb != 0); + } + out += (${NR} - n) * sizeof(${BTYPE}); + + $for N in range(1, NR): + const ${TYPE}* w${N} = w${N-1} + kc; + $if N % 2 == 0: + if XNN_UNPREDICTABLE(n <= ${N}) { + w${N} = w${N-1}; + } + $else: + if XNN_UNPREDICTABLE(n < ${N+1}) { + w${N} = w${N-1}; + } + + $for N in range(0, NR, 2): + v128_t vacc${ABC[N:N+2]} = wasm_i32x4_splat(0); + + // KC main loop multiple of ${NR}x${KR} + size_t k = kc; + for (; k >= ${KR}; k -= ${KR}) { + $for N in range(0, NR, 2): + const v128_t v${N} = wasm_v128_load64_splat(w${N}); + const v128_t v${N+1} = wasm_v128_load64_splat(w${N+1}); + const v128_t v${ABC[N:N+2]} = wasm_i64x2_shuffle(v${N}, v${N+1}, 0, 3); + + $for N in range(0, NR, 2): + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}, vone, vacc${ABC[N:N+2]}); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${N * KR}, v${ABC[N:N+2]}); + + $for N in range(NR): + w${N} += ${KR}; + out += ${NR*KR}; + } + + // KC remainder of 1..${KR-1} + if (k != 0) { + assert(k >= 1 && k <= ${KR-1}); + + const v128_t vmask = wasm_u64x2_shr(wasm_i32x4_splat(-1), (${KR} - k) * sizeof(${WTYPE}) * 8); + + $for N in range(0, NR, 2): + const v128_t v${N} = wasm_v128_load64_splat(w${N}); + const v128_t v${N+1} = wasm_v128_load64_splat(w${N+1}); + v128_t v${ABC[N:N+2]} = wasm_i64x2_shuffle(v${N}, v${N+1}, 0, 3); + v${ABC[N:N+2]} = wasm_v128_and(v${ABC[N:N+2]}, vmask); + + $for N in range(0, NR, 2): + vacc${ABC[N:N+2]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v${ABC[N:N+2]}, vone, vacc${ABC[N:N+2]}); + + $for N in range(0, NR, 2): + wasm_v128_store(out + ${N * KR}, v${ABC[N:N+2]}); + + out += ${NR*KR}; + } + + v128_t vksum0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc01, vacc23, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc01, vacc23, 1, 3, 5, 7)); + v128_t vksum4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc45, vacc67, 0, 2, 4, 6), wasm_v32x4_shuffle(vacc45, vacc67, 1, 3, 5, 7)); + + vksum0123 = wasm_i32x4_mul(vksum0123, vzeropoint); + vksum4567 = wasm_i32x4_mul(vksum4567, vzeropoint); + + v128_t vpack0123 = wasm_v128_load(packed_b); + v128_t vpack4567 = wasm_v128_load(packed_b + 4); + + wasm_v128_store(packed_b, wasm_i32x4_sub(vpack0123, vksum0123)); + wasm_v128_store(packed_b + 4, wasm_i32x4_sub(vpack4567, vksum4567)); + + out = (${TYPE}*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/xnnpack/avgpool.h b/src/xnnpack/avgpool.h index 55508bec2d9..68a8c005a4a 100644 --- a/src/xnnpack/avgpool.h +++ b/src/xnnpack/avgpool.h @@ -46,7 +46,7 @@ extern "C" { size_t output_increment, \ const struct xnn_f16_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-avgpool/f16-avgpool-minmax.h" +#include "f16-avgpool/f16-avgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS @@ -78,7 +78,7 @@ extern "C" { size_t output_increment, \ const struct xnn_f32_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f32-avgpool/f32-avgpool-minmax.h" +#include "f32-avgpool/f32-avgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS @@ -111,7 +111,7 @@ extern "C" { size_t output_increment, \ const union 
xnn_qu8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qu8-avgpool/qu8-avgpool-minmax.h" +#include "qu8-avgpool/qu8-avgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h index 18eb152e72c..7a257df9034 100644 --- a/src/xnnpack/compute.h +++ b/src/xnnpack/compute.h @@ -1464,23 +1464,6 @@ struct reduce_context { size_t output2_block_size); #endif -struct prelu_context { - size_t n; - const void* x; - size_t x_stride; - const void* w; - void* y; - size_t y_stride; - xnn_prelu_ukernel_fn ukernel; -}; - -#ifndef __cplusplus - XNN_PRIVATE void xnn_compute_prelu( - const struct prelu_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t batch_start, - size_t batch_range); -#endif - struct vmulcaddc_context { size_t n; const void* x; diff --git a/src/xnnpack/config-types.h b/src/xnnpack/config-types.h index 7abec0bccb4..3d9e5f9b309 100644 --- a/src/xnnpack/config-types.h +++ b/src/xnnpack/config-types.h @@ -236,6 +236,8 @@ struct xnn_dwconv_config { uint8_t last_tile; }; +// Bilinear interpolation (2D). + struct xnn_ibilinear_config { xnn_ibilinear_ukernel_fn ukernel; // Number of output pixels in a tile. @@ -243,7 +245,7 @@ struct xnn_ibilinear_config { uint8_t pixel_tile; }; -// Bilinear interpolation (2D). +// Bilinear interpolation (2D) in CHW layout. struct xnn_ibilinear_chw_config { xnn_ibilinear_chw_ukernel_fn ukernel; @@ -252,18 +254,6 @@ struct xnn_ibilinear_chw_config { uint8_t channel_tile; }; -// Bilinear interpolation (2D) in CHW layout. - -struct xnn_prelu_config { - xnn_prelu_ukernel_fn ukernel; - // Number of rows in a tile. - // For best efficiency, micro-kernel must process a multiple of this number of rows in each call. - uint16_t row_tile; - // Number of channels in a tile. - // For best efficiency, micro-kernel must process a multiple of this number of channels in each call. 
- uint16_t channel_tile; -}; - struct xnn_gemm_config { struct gemm_fused_ukernels minmax; struct gemm_fused_ukernels relu; diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index 59eaf94ced5..67c7ee16bc5 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -30,6 +30,7 @@ XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vdiv_confi XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vmax_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vmin_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vmul_config(); +XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vprelu_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vsub_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vsqrdiff_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vadd_config(); @@ -39,6 +40,7 @@ XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vdiv_confi XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vmax_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vmin_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vmul_config(); +XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vprelu_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vsub_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f32_vsqrdiff_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_qs8_vadd_config(); @@ -155,9 +157,6 @@ XNN_INTERNAL const struct xnn_ibilinear_config* xnn_init_u8_ibilinear_config(); XNN_INTERNAL const struct xnn_ibilinear_chw_config* xnn_init_f16_ibilinear_chw_config(); XNN_INTERNAL const struct xnn_ibilinear_chw_config* xnn_init_f32_ibilinear_chw_config(); -XNN_INTERNAL const struct xnn_prelu_config* xnn_init_f16_prelu_config(); -XNN_INTERNAL const struct xnn_prelu_config* xnn_init_f32_prelu_config(); - static inline struct xnn_hmp_dqgemm_ukernel xnn_init_hmp_dqgemm_ukernel( xnn_dqgemm_ukernel_fn function) { struct xnn_hmp_dqgemm_ukernel ukernel = {{ function }}; diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h index a4c23e345cb..5ecaaec7d69 100644 --- a/src/xnnpack/dwconv.h +++ b/src/xnnpack/dwconv.h @@ -31,14 +31,14 @@ extern "C" { size_t input_offset, \ const datatype* zero, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-dwconv/f16-dwconv-minmax-unipass.h" -#include "src/f32-dwconv/f32-dwconv-minmax-unipass.h" -#include "src/f32-dwconv/f32-dwconv-unipass.h" -#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h" -#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h" -#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h" -#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h" -#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h" +#include "f16-dwconv/f16-dwconv-minmax-unipass.h" +#include "f32-dwconv/f32-dwconv-minmax-unipass.h" +#include "f32-dwconv/f32-dwconv-unipass.h" +#include "qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h" +#include "qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h" +#include "qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h" #undef XNN_DWCONV_UNIPASS @@ -56,14 +56,14 @@ extern "C" { size_t kernel_size, \ 
buffer_type* buffer, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-dwconv/f16-dwconv-minmax-multipass.h" -#include "src/f32-dwconv/f32-dwconv-minmax-multipass.h" -#include "src/f32-dwconv/f32-dwconv-multipass.h" -#include "src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h" -#include "src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h" -#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h" -#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h" -#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h" +#include "f16-dwconv/f16-dwconv-minmax-multipass.h" +#include "f32-dwconv/f32-dwconv-minmax-multipass.h" +#include "f32-dwconv/f32-dwconv-multipass.h" +#include "qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h" +#include "qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h" +#include "qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h" #undef XNN_DWCONV_MULTIPASS #define DECLARE_F32_DWCONV2D_CHW_MINMAX_UKERNEL_FUNCTION(fn_name) \ diff --git a/src/xnnpack/fill.h b/src/xnnpack/fill.h index 2c807407866..2698d0c671e 100644 --- a/src/xnnpack/fill.h +++ b/src/xnnpack/fill.h @@ -22,7 +22,7 @@ extern "C" { void* output, \ size_t output_stride, \ const uint32_t fill_pattern); -#include "src/xx-fill/xx-fill.h" +#include "xx-fill/xx-fill.h" #undef XNN_FILL_UKERNEL #ifdef __cplusplus diff --git a/src/xnnpack/indirection.h b/src/xnnpack/indirection.h index 12ed000d3de..e5e71f852bb 100644 --- a/src/xnnpack/indirection.h +++ b/src/xnnpack/indirection.h @@ -13,6 +13,7 @@ #include "xnnpack.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microparams.h" #ifdef __cplusplus diff --git a/src/xnnpack/maxpool.h b/src/xnnpack/maxpool.h index d803fc50cbc..38100203cfa 100644 --- a/src/xnnpack/maxpool.h +++ b/src/xnnpack/maxpool.h @@ -31,7 +31,7 @@ extern "C" { size_t output_increment, \ const union xnn_f16_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-maxpool/f16-maxpool-minmax.h" +#include "f16-maxpool/f16-maxpool-minmax.h" #undef XNN_UKERNEL @@ -47,7 +47,7 @@ extern "C" { size_t output_increment, \ const union xnn_f32_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f32-maxpool/f32-maxpool-minmax.h" +#include "f32-maxpool/f32-maxpool-minmax.h" #undef XNN_UKERNEL @@ -64,7 +64,7 @@ extern "C" { const struct xnn_u8_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/u8-maxpool/u8-maxpool-minmax.h" +#include "u8-maxpool/u8-maxpool-minmax.h" #undef XNN_UKERNEL @@ -81,7 +81,7 @@ extern "C" { const struct xnn_s8_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/s8-maxpool/s8-maxpool-minmax.h" +#include "s8-maxpool/s8-maxpool-minmax.h" #undef XNN_UKERNEL diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index aacb40ff5ef..1705904ee9c 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -11,9 +11,9 @@ #include "xnnpack.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microparams.h" - /****************** Microkernel pointers for dense inference *****************/ // CONV-HWC: direct CONVolution in HWC layout @@ -790,35 +790,6 @@ typedef void (*xnn_f32_vmulcaddc_ukernel_fn)( size_t output_stride, const union xnn_f32_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -// PRELU: Parametric RELU - -typedef void (*xnn_prelu_ukernel_fn)( - size_t batch, - size_t 
channels, - const void* input, - size_t input_stride, - const void* weights, - void* output, - size_t output_stride); - -typedef void (*xnn_f16_prelu_ukernel_fn)( - size_t batch, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* weights, - xnn_float16* output, - size_t output_stride); - -typedef void (*xnn_f32_prelu_ukernel_fn)( - size_t batch, - size_t channels, - const float* input, - size_t input_stride, - const float* weights, - float* output, - size_t output_stride); - // IBILINEAR: Indirect BILINEAR interpolation typedef void (*xnn_ibilinear_ukernel_fn)( @@ -2528,16 +2499,12 @@ typedef size_t (*xnn_init_binary_params_fn)( typedef size_t (*xnn_init_f16_qs8_cvt_params_fn)( struct xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], xnn_float16 scale, - int8_t output_zero_point, - int8_t output_min, - int8_t output_max); + int8_t output_zero_point); typedef size_t (*xnn_init_f32_qs8_cvt_params_fn)( struct xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - int8_t output_zero_point, - int8_t output_min, - int8_t output_max); + int8_t output_zero_point); typedef size_t (*xnn_init_qs8_reduce_minmax_params_fn)( struct xnn_qs8_reduce_minmax_params params[XNN_MIN_ELEMENTS(1)], @@ -2556,9 +2523,7 @@ typedef size_t (*xnn_init_qu8_reduce_minmax_params_fn)( typedef size_t (*xnn_init_f32_qu8_cvt_params_fn)( struct xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - uint8_t output_zero_point, - uint8_t output_min, - uint8_t output_max); + uint8_t output_zero_point); typedef size_t (*xnn_init_s32_f32_cvt_params_fn)( struct xnn_s32_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], diff --git a/src/xnnpack/microparams-init.h b/src/xnnpack/microparams-init.h index ea04ccde182..7363f4064ce 100644 --- a/src/xnnpack/microparams-init.h +++ b/src/xnnpack/microparams-init.h @@ -10,9 +10,9 @@ #include "xnnpack.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microparams.h" - #ifdef __cplusplus extern "C" { #endif @@ -478,23 +478,17 @@ DECLARE_INIT_QS8_MUL_MINMAX_PARAMS_FUNCTION(xnn_init_qs8_mul_minmax_scalar_param XNN_INTERNAL size_t xnn_init_f16_qs8_cvt_scalar_params( struct xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], xnn_float16 scale, - int8_t zero_point, - int8_t output_min, - int8_t output_max); + int8_t zero_point); XNN_INTERNAL size_t xnn_init_f32_qs8_cvt_scalar_params( struct xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - int8_t zero_point, - int8_t output_min, - int8_t output_max); + int8_t zero_point); XNN_INTERNAL size_t xnn_init_f32_qu8_cvt_scalar_params( struct xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], float scale, - uint8_t zero_point, - uint8_t output_min, - uint8_t output_max); + uint8_t zero_point); XNN_INTERNAL size_t xnn_init_s32_f32_cvt_scalar_params( struct xnn_s32_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index d939f36694c..7ff41529af6 100644 --- a/src/xnnpack/microparams.h +++ b/src/xnnpack/microparams.h @@ -590,8 +590,6 @@ struct xnn_f16_qs8_cvt_params { struct { xnn_float16 scale; int16_t output_zero_point; - int8_t output_min; - int8_t output_max; } scalar; }; @@ -599,8 +597,6 @@ struct xnn_f32_qs8_cvt_params { struct { float scale; int16_t output_zero_point; - int8_t output_min; - int8_t output_max; } scalar; }; @@ -608,8 +604,6 @@ struct xnn_f32_qu8_cvt_params { struct { float scale; int16_t output_zero_point; - uint8_t output_min; - uint8_t output_max; } scalar; }; diff --git 
a/src/xnnpack/operator.h b/src/xnnpack/operator.h index 813cc03a8f7..0fbb6c36670 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -150,7 +150,6 @@ struct xnn_operator { size_t group_input_channels; size_t group_output_channels; size_t channels; - size_t max_tokens; uint32_t pad_value; @@ -333,7 +332,6 @@ struct xnn_operator { }; }; // For softmax operator. const struct xnn_maxpool_config* maxpool_config; - const struct xnn_prelu_config* prelu_config; const struct xnn_unpool_config* unpool_config; const struct xnn_zip_config* zip_config; struct { @@ -392,7 +390,6 @@ struct xnn_operator { struct max_pooling_context max_pooling; struct pad_context pad; struct pixelwise_average_pooling_context pixelwise_average_pooling; - struct prelu_context prelu; struct reduce_context reduce; struct { struct resize_bilinear_context resize_bilinear; diff --git a/src/xnnpack/pack.h b/src/xnnpack/pack.h index 5d50d12cb79..6f8f4e9277d 100644 --- a/src/xnnpack/pack.h +++ b/src/xnnpack/pack.h @@ -13,10 +13,10 @@ #include "xnnpack.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" - #ifdef __cplusplus extern "C" { #endif @@ -1358,31 +1358,6 @@ XNN_INTERNAL void xnn_pack_f32_to_f16_vmulcaddc_w( const void* params); -// Pack functions for prelu weights. -typedef void (*xnn_pack_prelu_w_fn)( - size_t input_channels, - size_t slope_channels, - const void* slope_data, - void* packed_weights); - -XNN_INTERNAL void xnn_pack_f32_prelu_w( - size_t input_channels, - size_t slope_channels, - const float* slope_data, - float* packed_weights); - -XNN_INTERNAL void xnn_pack_f16_prelu_w( - size_t input_channels, - size_t slope_channels, - const uint16_t* slope_data, - uint16_t* packed_weights); - -XNN_INTERNAL void xnn_pack_f32_to_f16_prelu_w( - size_t input_channels, - size_t slope_channels, - const float* slope_data, - xnn_float16* packed_weights); - // Sparse packing functions. 
struct xnn_spmm_packing_params { diff --git a/src/xnnpack/packb.h b/src/xnnpack/packb.h index 88a2d2896f7..b0b662f384a 100644 --- a/src/xnnpack/packb.h +++ b/src/xnnpack/packb.h @@ -25,7 +25,7 @@ extern "C" { size_t channel_subtile_stride, \ const struct xnn_x32_packb_params params [XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); \ -#include "src/x32-packb/x32-packb.h" +#include "x32-packb/x32-packb.h" #undef XNN_UKERNEL diff --git a/src/xnnpack/packq.h b/src/xnnpack/packq.h index abc93399eb6..07af36c7b1c 100644 --- a/src/xnnpack/packq.h +++ b/src/xnnpack/packq.h @@ -148,7 +148,7 @@ XNN_INLINE static float xnn_x8_packq_f32qp8_get_dequantized( const float* XNN_RESTRICT lhs, size_t lhs_stride, \ void* XNN_RESTRICT lhs_packed); -#include "src/x8-packq/x8-packq.h" +#include "x8-packq/x8-packq.h" #undef XNN_UKERNEL diff --git a/src/xnnpack/packw.h b/src/xnnpack/packw.h index 013295f593a..cf62f60c640 100644 --- a/src/xnnpack/packw.h +++ b/src/xnnpack/packw.h @@ -31,7 +31,7 @@ extern "C" { size_t extra_bytes, \ const void* params); -#include "src/x8-packw/x8-packw.h" +#include "x8-packw/x8-packw.h" #undef XNN_UKERNEL @@ -50,7 +50,7 @@ extern "C" { size_t extra_bytes, \ const void* params); -#include "src/qs8-packw/qs8-packw.h" +#include "qs8-packw/qs8-packw.h" #undef XNN_QS8_UKERNEL @@ -69,7 +69,7 @@ extern "C" { size_t extra_bytes, \ const void* params); \ -#include "src/x16-packw/x16-packw.h" +#include "x16-packw/x16-packw.h" #undef XNN_UKERNEL @@ -88,7 +88,7 @@ extern "C" { size_t extra_bytes, \ const void* params); \ -#include "src/x32-packw/x32-packw.h" +#include "x32-packw/x32-packw.h" #undef XNN_UKERNEL diff --git a/src/xnnpack/packx.h b/src/xnnpack/packx.h index c08428f4e1d..c6a24e09106 100644 --- a/src/xnnpack/packx.h +++ b/src/xnnpack/packx.h @@ -23,7 +23,7 @@ extern "C" { size_t x_stride, \ uint32_t* y); -#include "src/x32-packx/x32-packx.h" +#include "x32-packx/x32-packx.h" #undef XNN_UKERNEL diff --git a/src/xnnpack/pad.h b/src/xnnpack/pad.h index db0098ecedb..227c4ca75de 100644 --- a/src/xnnpack/pad.h +++ b/src/xnnpack/pad.h @@ -26,7 +26,7 @@ extern "C" { void* output, \ size_t output_stride, \ const uint32_t fill_pattern); -#include "src/xx-pad/xx-pad.h" +#include "xx-pad/xx-pad.h" #undef XNN_PAD_UKERNEL diff --git a/src/xnnpack/pavgpool.h b/src/xnnpack/pavgpool.h index e45595de9dd..c1433914ec5 100644 --- a/src/xnnpack/pavgpool.h +++ b/src/xnnpack/pavgpool.h @@ -45,7 +45,7 @@ extern "C" { size_t output_increment, \ const struct xnn_f16_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-pavgpool/f16-pavgpool-minmax.h" +#include "f16-pavgpool/f16-pavgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS @@ -80,7 +80,7 @@ extern "C" { size_t output_increment, \ const union xnn_f32_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f32-pavgpool/f32-pavgpool-minmax.h" +#include "f32-pavgpool/f32-pavgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS diff --git a/src/xnnpack/prelu.h b/src/xnnpack/prelu.h deleted file mode 100644 index 2057d4a2b1d..00000000000 --- a/src/xnnpack/prelu.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include - -#include "xnnpack/common.h" -#include "xnnpack/math.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define DECLARE_F16_PRELU_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const xnn_float16* input, \ - size_t input_stride, \ - const xnn_float16* weights, \ - xnn_float16* output, \ - size_t output_stride); - -DECLARE_F16_PRELU_UKERNEL_FUNCTION(xnn_f16_prelu_ukernel__neonfp16arith_2x8) -DECLARE_F16_PRELU_UKERNEL_FUNCTION(xnn_f16_prelu_ukernel__neonfp16arith_2x16) - -DECLARE_F16_PRELU_UKERNEL_FUNCTION(xnn_f16_prelu_ukernel__f16c_2x8) -DECLARE_F16_PRELU_UKERNEL_FUNCTION(xnn_f16_prelu_ukernel__f16c_2x16) - - -#define DECLARE_F32_PRELU_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const float* input, \ - size_t input_stride, \ - const float* weights, \ - float* output, \ - size_t output_stride); - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_1x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_1x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_1x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_2x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_4x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_4x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_4x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse_2x8) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse2_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse2_2x8) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse41_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse41_2x8) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__avx_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__avx_2x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__avx512f_2x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__avx512f_2x32) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16) 
-DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasm_2x1) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasm_2x4) - -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__scalar_2x1) -DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__scalar_2x4) - - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/requantization-stubs.h b/src/xnnpack/requantization-stubs.h index 02a47cc3ec6..3a23770f067 100644 --- a/src/xnnpack/requantization-stubs.h +++ b/src/xnnpack/requantization-stubs.h @@ -48,14 +48,6 @@ DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__sse41) DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__ssse3) DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__wasmsimd) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__neon) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__scalar_signed64) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__scalar_unsigned32) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__scalar_unsigned64) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__sse2) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__sse41) -DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_rndna__ssse3) - typedef void (*xnn_qs8_requantization_fn)( size_t n, @@ -90,14 +82,6 @@ DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_gemmlowp__sse41) DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_gemmlowp__ssse3) DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_gemmlowp__wasmsimd) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__neon) 
-DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__scalar_signed64) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__scalar_unsigned32) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__scalar_unsigned64) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__sse2) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__sse41) -DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndna__ssse3) - DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndnu__neon_mull) DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndnu__neon_qdmulh) DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_rndnu__scalar) diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h index 1cc63dbe652..cc2af0ec0bf 100644 --- a/src/xnnpack/requantization.h +++ b/src/xnnpack/requantization.h @@ -74,82 +74,6 @@ static inline uint8_t xnn_qu8_requantize_fp32( return (uint8_t) output; } -static inline int8_t xnn_qs8_requantize_rndna( - int32_t input, - float scale, - int8_t zero_point, - int8_t min, - int8_t max) -{ - assert(scale >= 1.0f / 4294967296.0f /* 0x1.0p-32f */); - assert(scale < 256.0f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 16); - assert(shift < 56); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const int32_t min_less_zero_point = (int32_t) min - (int32_t) zero_point; - const int32_t max_less_zero_point = (int32_t) max - (int32_t) zero_point; - - uint32_t abs_input = (uint32_t) input; - if (input < 0) { - abs_input = -abs_input; - } - - const uint64_t abs_prescaled_input = (uint64_t) abs_input * (uint64_t) multiplier; - const uint32_t abs_scaled_input = (uint32_t) ((abs_prescaled_input + rounding) >> shift); - - int32_t output = (int32_t) abs_scaled_input; - if (input < 0) { - output = -output; - } - - output = math_max_s32(output, min_less_zero_point); - output = math_min_s32(output, max_less_zero_point); - return (int8_t) (output + (int32_t) zero_point); -} - -static inline uint8_t xnn_qu8_requantize_rndna( - int32_t input, - float scale, - uint8_t zero_point, - uint8_t min, - uint8_t max) -{ - assert(scale >= 1.0f / 4294967296.0f /* 0x1.0p-32f */); - assert(scale < 256.0f); - - const uint32_t scale_bits = float_as_uint32(scale); - const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); - const uint32_t shift = 127 + 23 - (scale_bits >> 23); - assert(shift >= 16); - assert(shift < 56); - - const uint64_t rounding = UINT64_C(1) << (shift - 1); - const int32_t min_less_zero_point = (int32_t) min - (int32_t) zero_point; - const int32_t max_less_zero_point = (int32_t) max - (int32_t) zero_point; - - uint32_t abs_input = (uint32_t) input; - if (input < 0) { - abs_input = -abs_input; - } - - const uint64_t abs_prescaled_input = (uint64_t) abs_input * (uint64_t) multiplier; - const uint32_t abs_scaled_input = (uint32_t) ((abs_prescaled_input + rounding) >> shift); - - int32_t output = (int32_t) abs_scaled_input; - if (input < 0) { - output = -output; - } - - output = math_max_s32(output, min_less_zero_point); - output = math_min_s32(output, max_less_zero_point); - return (uint8_t) (output + (int32_t) zero_point); -} - static inline int8_t xnn_qs8_requantize_rndnu( int32_t input, float scale, diff --git a/src/xnnpack/subgraph.h b/src/xnnpack/subgraph.h index e8000c085b8..d60e34bad28 100644 --- 
a/src/xnnpack/subgraph.h +++ b/src/xnnpack/subgraph.h @@ -319,9 +319,6 @@ struct xnn_node { size_t new_height; size_t new_width; } static_resize; - struct { - size_t max_tokens; - } rope; struct { size_t num_dims; size_t offsets[XNN_MAX_TENSOR_DIMS]; diff --git a/src/xnnpack/transpose.h b/src/xnnpack/transpose.h index eebabcc4b50..a10c4b94ebc 100644 --- a/src/xnnpack/transpose.h +++ b/src/xnnpack/transpose.h @@ -25,7 +25,7 @@ extern "C" { size_t element_size, \ size_t block_width, \ size_t block_height); -#include "src/xx-transposev/xx-transposev.h" +#include "xx-transposev/xx-transposev.h" #undef XNN_TRANSPOSE_UKERNEL #define XNN_TRANSPOSE_UKERNEL(arch_flags, fn_name, datasize, datatype, ...) \ @@ -36,11 +36,11 @@ extern "C" { size_t output_stride, \ size_t block_width, \ size_t block_height); -#include "src/x8-transposec/x8-transposec.h" -#include "src/x16-transposec/x16-transposec.h" -#include "src/x24-transposec/x24-transposec.h" -#include "src/x32-transposec/x32-transposec.h" -#include "src/x64-transposec/x64-transposec.h" +#include "x8-transposec/x8-transposec.h" +#include "x16-transposec/x16-transposec.h" +#include "x24-transposec/x24-transposec.h" +#include "x32-transposec/x32-transposec.h" +#include "x64-transposec/x64-transposec.h" #undef XNN_TRANSPOSE_UKERNEL #ifdef __cplusplus diff --git a/src/xnnpack/vbinary.h b/src/xnnpack/vbinary.h index 2479e2e182c..8a9bdf065e8 100644 --- a/src/xnnpack/vbinary.h +++ b/src/xnnpack/vbinary.h @@ -24,26 +24,26 @@ extern "C" { XNN_INTERNAL void ukernel( \ size_t n, const xnn_float16* a, const xnn_float16* b, xnn_float16* y, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f16-vbinary/f16-vadd.h" -#include "src/f16-vbinary/f16-vaddc.h" -#include "src/f16-vbinary/f16-vcmul.h" -#include "src/f16-vbinary/f16-vdiv.h" -#include "src/f16-vbinary/f16-vdivc.h" -#include "src/f16-vbinary/f16-vmax.h" -#include "src/f16-vbinary/f16-vmaxc.h" -#include "src/f16-vbinary/f16-vmin.h" -#include "src/f16-vbinary/f16-vminc.h" -#include "src/f16-vbinary/f16-vmul.h" -#include "src/f16-vbinary/f16-vmulc.h" -#include "src/f16-vbinary/f16-vprelu.h" -#include "src/f16-vbinary/f16-vpreluc.h" -#include "src/f16-vbinary/f16-vrpreluc.h" -#include "src/f16-vbinary/f16-vrdivc.h" -#include "src/f16-vbinary/f16-vrsubc.h" -#include "src/f16-vbinary/f16-vsqrdiff.h" -#include "src/f16-vbinary/f16-vsqrdiffc.h" -#include "src/f16-vbinary/f16-vsub.h" -#include "src/f16-vbinary/f16-vsubc.h" +#include "f16-vbinary/f16-vadd.h" +#include "f16-vbinary/f16-vaddc.h" +#include "f16-vbinary/f16-vcmul.h" +#include "f16-vbinary/f16-vdiv.h" +#include "f16-vbinary/f16-vdivc.h" +#include "f16-vbinary/f16-vmax.h" +#include "f16-vbinary/f16-vmaxc.h" +#include "f16-vbinary/f16-vmin.h" +#include "f16-vbinary/f16-vminc.h" +#include "f16-vbinary/f16-vmul.h" +#include "f16-vbinary/f16-vmulc.h" +#include "f16-vbinary/f16-vprelu.h" +#include "f16-vbinary/f16-vpreluc.h" +#include "f16-vbinary/f16-vrpreluc.h" +#include "f16-vbinary/f16-vrdivc.h" +#include "f16-vbinary/f16-vrsubc.h" +#include "f16-vbinary/f16-vsqrdiff.h" +#include "f16-vbinary/f16-vsqrdiffc.h" +#include "f16-vbinary/f16-vsub.h" +#include "f16-vbinary/f16-vsubc.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ @@ -51,29 +51,29 @@ extern "C" { XNN_INTERNAL void ukernel( \ size_t n, const float* a, const float* b, float* y, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/f32-vbinary/f32-vadd.h" -#include 
"src/f32-vbinary/f32-vaddc.h" -#include "src/f32-vbinary/f32-vcopysign.h" -#include "src/f32-vbinary/f32-vcopysignc.h" -#include "src/f32-vbinary/f32-vcmul.h" -#include "src/f32-vbinary/f32-vdiv.h" -#include "src/f32-vbinary/f32-vdivc.h" -#include "src/f32-vbinary/f32-vmax.h" -#include "src/f32-vbinary/f32-vmaxc.h" -#include "src/f32-vbinary/f32-vmin.h" -#include "src/f32-vbinary/f32-vminc.h" -#include "src/f32-vbinary/f32-vmul.h" -#include "src/f32-vbinary/f32-vmulc.h" -#include "src/f32-vbinary/f32-vprelu.h" -#include "src/f32-vbinary/f32-vpreluc.h" -#include "src/f32-vbinary/f32-vrpreluc.h" -#include "src/f32-vbinary/f32-vrcopysignc.h" -#include "src/f32-vbinary/f32-vrdivc.h" -#include "src/f32-vbinary/f32-vrsubc.h" -#include "src/f32-vbinary/f32-vsqrdiff.h" -#include "src/f32-vbinary/f32-vsqrdiffc.h" -#include "src/f32-vbinary/f32-vsub.h" -#include "src/f32-vbinary/f32-vsubc.h" +#include "f32-vbinary/f32-vadd.h" +#include "f32-vbinary/f32-vaddc.h" +#include "f32-vbinary/f32-vcopysign.h" +#include "f32-vbinary/f32-vcopysignc.h" +#include "f32-vbinary/f32-vcmul.h" +#include "f32-vbinary/f32-vdiv.h" +#include "f32-vbinary/f32-vdivc.h" +#include "f32-vbinary/f32-vmax.h" +#include "f32-vbinary/f32-vmaxc.h" +#include "f32-vbinary/f32-vmin.h" +#include "f32-vbinary/f32-vminc.h" +#include "f32-vbinary/f32-vmul.h" +#include "f32-vbinary/f32-vmulc.h" +#include "f32-vbinary/f32-vprelu.h" +#include "f32-vbinary/f32-vpreluc.h" +#include "f32-vbinary/f32-vrpreluc.h" +#include "f32-vbinary/f32-vrcopysignc.h" +#include "f32-vbinary/f32-vrdivc.h" +#include "f32-vbinary/f32-vrsubc.h" +#include "f32-vbinary/f32-vsqrdiff.h" +#include "f32-vbinary/f32-vsqrdiffc.h" +#include "f32-vbinary/f32-vsub.h" +#include "f32-vbinary/f32-vsubc.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ @@ -82,12 +82,12 @@ extern "C" { size_t n, const uint8_t* input_a, const uint8_t* input_b, \ uint8_t* output, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qu8-vadd/qu8-vadd-minmax.h" -#include "src/qu8-vaddc/qu8-vaddc-minmax.h" -#include "src/qu8-vmul/qu8-vmul-minmax-fp32.h" -#include "src/qu8-vmul/qu8-vmul-minmax-rndnu.h" -#include "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h" -#include "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h" +#include "qu8-vadd/qu8-vadd-minmax.h" +#include "qu8-vaddc/qu8-vaddc-minmax.h" +#include "qu8-vmul/qu8-vmul-minmax-fp32.h" +#include "qu8-vmul/qu8-vmul-minmax-rndnu.h" +#include "qu8-vmulc/qu8-vmulc-minmax-fp32.h" +#include "qu8-vmulc/qu8-vmulc-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ @@ -95,12 +95,12 @@ extern "C" { XNN_INTERNAL void ukernel( \ size_t n, const int8_t* input_a, const int8_t* input_b, int8_t* output, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qs8-vadd/qs8-vadd-minmax.h" -#include "src/qs8-vaddc/qs8-vaddc-minmax.h" -#include "src/qs8-vmul/qs8-vmul-minmax-fp32.h" -#include "src/qs8-vmul/qs8-vmul-minmax-rndnu.h" -#include "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h" -#include "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h" +#include "qs8-vadd/qs8-vadd-minmax.h" +#include "qs8-vaddc/qs8-vaddc-minmax.h" +#include "qs8-vmul/qs8-vmul-minmax-fp32.h" +#include "qs8-vmul/qs8-vmul-minmax-rndnu.h" +#include "qs8-vmulc/qs8-vmulc-minmax-fp32.h" +#include "qs8-vmulc/qs8-vmulc-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ @@ 
-109,8 +109,8 @@ extern "C" { size_t n, const int32_t* input_a, const int32_t* input_b, \ int32_t* output, \ const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/s32-vmul/s32-vmul.h" -#include "src/s32-vmul/s32-vmulc.h" +#include "s32-vmul/s32-vmul.h" +#include "s32-vmul/s32-vmulc.h" #undef XNN_UKERNEL_WITH_PARAMS #ifdef __cplusplus diff --git a/src/xnnpack/vcvt.h b/src/xnnpack/vcvt.h index fab15282e61..3bd78dc57e3 100644 --- a/src/xnnpack/vcvt.h +++ b/src/xnnpack/vcvt.h @@ -17,19 +17,19 @@ extern "C" { #define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, type_in, type_out, params_type, init_params) \ XNN_INTERNAL void ukernel(size_t n, const type_in* input, type_out* output, const params_type* params); -#include "src/f16-f32-vcvt/f16-f32-vcvt.h" -#include "src/f16-qs8-vcvt/f16-qs8-vcvt.h" -#include "src/f32-f16-vcvt/f32-f16-vcvt.h" -#include "src/f32-qs8-vcvt/f32-qs8-vcvt.h" -#include "src/f32-qu8-vcvt/f32-qu8-vcvt.h" -#include "src/qs16-qs8-vcvt/qs16-qs8-vcvt.h" -#include "src/qs8-f16-vcvt/qs8-f16-vcvt.h" -#include "src/qs8-f32-vcvt/qs8-f32-vcvt.h" -#include "src/qu8-f32-vcvt/qu8-f32-vcvt.h" -#include "src/s32-f32-vcvt/s32-f32-vcvt.h" -#include "src/u32-f32-vcvt/u32-f32-vcvt.h" -#include "src/qs8-vcvt/qs8-vcvt.h" -#include "src/qu8-vcvt/qu8-vcvt.h" +#include "f16-f32-vcvt/f16-f32-vcvt.h" +#include "f16-qs8-vcvt/f16-qs8-vcvt.h" +#include "f32-f16-vcvt/f32-f16-vcvt.h" +#include "f32-qs8-vcvt/f32-qs8-vcvt.h" +#include "f32-qu8-vcvt/f32-qu8-vcvt.h" +#include "qs16-qs8-vcvt/qs16-qs8-vcvt.h" +#include "qs8-f16-vcvt/qs8-f16-vcvt.h" +#include "qs8-f32-vcvt/qs8-f32-vcvt.h" +#include "qu8-f32-vcvt/qu8-f32-vcvt.h" +#include "s32-f32-vcvt/s32-f32-vcvt.h" +#include "u32-f32-vcvt/u32-f32-vcvt.h" +#include "qs8-vcvt/qs8-vcvt.h" +#include "qu8-vcvt/qu8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS #ifdef __cplusplus diff --git a/src/xnnpack/vhswish.h b/src/xnnpack/vhswish.h deleted file mode 100644 index cc38437f079..00000000000 --- a/src/xnnpack/vhswish.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include - -#include "xnnpack/common.h" -#include "xnnpack/microparams.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ - datatype, params_type, init_params) \ - XNN_INTERNAL void fn_name( \ - size_t n, \ - const int8_t* input, \ - int8_t* output, \ - const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qs8-vhswish/qs8-vhswish.h" -#undef XNN_UKERNEL -#undef XNN_UKERNEL_WITH_PARAMS - -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ - datatype, params_type, init_params) \ - XNN_INTERNAL void fn_name( \ - size_t n, \ - const uint8_t* input, \ - uint8_t* output, \ - const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qu8-vhswish/qu8-vhswish.h" -#undef XNN_UKERNEL -#undef XNN_UKERNEL_WITH_PARAMS - - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/vlog.h b/src/xnnpack/vlog.h deleted file mode 100644 index 3788af83709..00000000000 --- a/src/xnnpack/vlog.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include - -#include "xnnpack/common.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define DECLARE_U32_VLOG_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t batch_size, \ - const uint32_t* input, \ - uint32_t input_lshift, \ - uint32_t output_scale, \ - uint16_t* output); - - -DECLARE_U32_VLOG_UKERNEL_FUNCTION(xnn_u32_vlog_ukernel__scalar_x1) -DECLARE_U32_VLOG_UKERNEL_FUNCTION(xnn_u32_vlog_ukernel__scalar_x2) -DECLARE_U32_VLOG_UKERNEL_FUNCTION(xnn_u32_vlog_ukernel__scalar_x3) -DECLARE_U32_VLOG_UKERNEL_FUNCTION(xnn_u32_vlog_ukernel__scalar_x4) - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/vlrelu.h b/src/xnnpack/vlrelu.h deleted file mode 100644 index 8ec23a938b9..00000000000 --- a/src/xnnpack/vlrelu.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include - -#include "xnnpack/common.h" -#include "xnnpack/microparams.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define XNN_UKERNEL(arch_flags, fn_name, batch_tile, vector_tile, datatype) \ - XNN_INTERNAL void fn_name(size_t n, const int8_t* input, int8_t* output, \ - const struct xnn_qs8_lrelu_params \ - params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qs8-vlrelu/qs8-vlrelu.h" -#undef XNN_UKERNEL - -#define XNN_UKERNEL(arch_flags, fn_name, batch_tile, vector_tile, datatype) \ - XNN_INTERNAL void fn_name(size_t n, const uint8_t* input, uint8_t* output, \ - const struct xnn_qu8_lrelu_params \ - params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/qu8-vlrelu/qu8-vlrelu.h" -#undef XNN_UKERNEL - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/vunary.h b/src/xnnpack/vunary.h index 5aefeceb748..2f3536fc687 100644 --- a/src/xnnpack/vunary.h +++ b/src/xnnpack/vunary.h @@ -41,14 +41,14 @@ extern "C" { XNN_INTERNAL void fn_name(size_t n, const int8_t* x, int8_t* y, \ const struct xnn_s8_minmax_params \ params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/s8-vclamp/s8-vclamp.h" +#include "s8-vclamp/s8-vclamp.h" #undef XNN_UKERNEL #define XNN_UKERNEL(arch_flags, fn_name, batch_tile, vector_tile, datatype) \ XNN_INTERNAL void fn_name(size_t n, const uint8_t* x, uint8_t* y, \ const struct xnn_u8_minmax_params \ params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#include "src/u8-vclamp/u8-vclamp.h" +#include "u8-vclamp/u8-vclamp.h" #undef XNN_UKERNEL #define XNN_UKERNEL(arch_flags, fn_name, batch_tile, vector_tile, datatype) \ @@ -56,7 +56,7 @@ extern "C" { #define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ datatype, params_type, init_params) \ DECLARE_BF16_UKERNEL_FUNCTION(fn_name, params_type); -#include "src/bf16-vabs/bf16-vabs.h" +#include "bf16-vabs/bf16-vabs.h" #undef XNN_UKERNEL #undef XNN_UKERNEL_WITH_PARAMS @@ -65,21 +65,21 @@ extern "C" { #define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ datatype, params_type, init_params) \ DECLARE_F16_UKERNEL_FUNCTION(fn_name, params_type); -#include "src/f16-vabs/f16-vabs.h" -#include "src/f16-vclamp/f16-vclamp.h" -#include "src/f16-velu/f16-velu.h" -#include "src/f16-vhswish/f16-vhswish.h" -#include "src/f16-vlrelu/f16-vlrelu.h" -#include "src/f16-vneg/f16-vneg.h" -#include "src/f16-vrnd/f16-vrndd.h" -#include "src/f16-vrnd/f16-vrndne.h" -#include "src/f16-vrnd/f16-vrndu.h" -#include "src/f16-vrnd/f16-vrndz.h" -#include "src/f16-vrsqrt/f16-vrsqrt.h" -#include 
"src/f16-vsigmoid/f16-vsigmoid.h" -#include "src/f16-vsqr/f16-vsqr.h" -#include "src/f16-vsqrt/f16-vsqrt.h" -#include "src/f16-vtanh/f16-vtanh.h" +#include "f16-vabs/f16-vabs.h" +#include "f16-vclamp/f16-vclamp.h" +#include "f16-velu/f16-velu.h" +#include "f16-vhswish/f16-vhswish.h" +#include "f16-vlrelu/f16-vlrelu.h" +#include "f16-vneg/f16-vneg.h" +#include "f16-vrnd/f16-vrndd.h" +#include "f16-vrnd/f16-vrndne.h" +#include "f16-vrnd/f16-vrndu.h" +#include "f16-vrnd/f16-vrndz.h" +#include "f16-vrsqrt/f16-vrsqrt.h" +#include "f16-vsigmoid/f16-vsigmoid.h" +#include "f16-vsqr/f16-vsqr.h" +#include "f16-vsqrt/f16-vsqrt.h" +#include "f16-vtanh/f16-vtanh.h" #undef XNN_UKERNEL #undef XNN_UKERNEL_WITH_PARAMS @@ -88,28 +88,45 @@ extern "C" { #define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ datatype, params_type, init_params) \ DECLARE_F32_UKERNEL_FUNCTION(fn_name, params_type); -#include "src/f32-vabs/f32-vabs.h" -#include "src/f32-vclamp/f32-vclamp.h" -#include "src/f32-velu/f32-velu.h" -#include "src/f32-vexp/f32-vexp.h" -#include "src/f32-vgelu/f32-vgelu.h" -#include "src/f32-vhswish/f32-vhswish.h" -#include "src/f32-vlog/f32-vlog.h" -#include "src/f32-vlrelu/f32-vlrelu.h" -#include "src/f32-vneg/f32-vneg.h" -#include "src/f32-vrelu/f32-vrelu.h" -#include "src/f32-vrnd/f32-vrndd.h" -#include "src/f32-vrnd/f32-vrndne.h" -#include "src/f32-vrnd/f32-vrndu.h" -#include "src/f32-vrnd/f32-vrndz.h" -#include "src/f32-vrsqrt/f32-vrsqrt.h" -#include "src/f32-vsigmoid/f32-vsigmoid.h" -#include "src/f32-vsqr/f32-vsqr.h" -#include "src/f32-vsqrt/f32-vsqrt.h" -#include "src/f32-vtanh/f32-vtanh.h" +#include "f32-vabs/f32-vabs.h" +#include "f32-vclamp/f32-vclamp.h" +#include "f32-velu/f32-velu.h" +#include "f32-vexp/f32-vexp.h" +#include "f32-vgelu/f32-vgelu.h" +#include "f32-vhswish/f32-vhswish.h" +#include "f32-vlog/f32-vlog.h" +#include "f32-vlrelu/f32-vlrelu.h" +#include "f32-vneg/f32-vneg.h" +#include "f32-vrelu/f32-vrelu.h" +#include "f32-vrnd/f32-vrndd.h" +#include "f32-vrnd/f32-vrndne.h" +#include "f32-vrnd/f32-vrndu.h" +#include "f32-vrnd/f32-vrndz.h" +#include "f32-vrsqrt/f32-vrsqrt.h" +#include "f32-vsigmoid/f32-vsigmoid.h" +#include "f32-vsqr/f32-vsqr.h" +#include "f32-vsqrt/f32-vsqrt.h" +#include "f32-vtanh/f32-vtanh.h" #undef XNN_UKERNEL #undef XNN_UKERNEL_WITH_PARAMS +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ + datatype, params_type, init_params) \ + XNN_INTERNAL void fn_name( \ + size_t n, const int8_t* input, int8_t* output, \ + const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); +#include "qs8-vhswish/qs8-vhswish.h" +#include "qs8-vlrelu/qs8-vlrelu.h" +#undef XNN_UKERNEL_WITH_PARAMS + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, fn_name, batch_tile, vector_tile, \ + datatype, params_type, init_params) \ + XNN_INTERNAL void fn_name( \ + size_t n, const uint8_t* input, uint8_t* output, \ + const params_type params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); +#include "qu8-vhswish/qu8-vhswish.h" +#include "qu8-vlrelu/qu8-vlrelu.h" +#undef XNN_UKERNEL_WITH_PARAMS #define DECLARE_XX_VUNARY_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name( \ diff --git a/src/xnnpack/zerob.h b/src/xnnpack/zerob.h index 0ffef479264..f37395ff750 100644 --- a/src/xnnpack/zerob.h +++ b/src/xnnpack/zerob.h @@ -24,7 +24,7 @@ extern "C" { size_t channel_subtile_stride, \ const struct xnn_x32_packb_params* params); \ -#include "src/x32-zerob/x32-zerob.h" +#include "x32-zerob/x32-zerob.h" #undef XNN_UKERNEL diff --git a/test/BUILD.bazel 
b/test/BUILD.bazel index bcbedbc084d..a9e9f4f6965 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -91,6 +91,13 @@ xnnpack_cxx_library( ], ) +xnnpack_cxx_library( + name = "tanh_operator_tester", + testonly = True, + hdrs = ["tanh-operator-tester.h"], + deps = OPERATOR_TEST_DEPS + xnnpack_test_deps_for_library(), +) + xnnpack_cxx_library( name = "unary_operator_tester", testonly = True, @@ -554,15 +561,6 @@ xnnpack_unit_test( ], ) -xnnpack_unit_test( - name = "f16_prelu_test", - srcs = [ - "f16-prelu.cc", - "prelu-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "f16_spmm_minmax_test", srcs = [ @@ -799,15 +797,6 @@ xnnpack_unit_test( ], ) -xnnpack_unit_test( - name = "f32_prelu_test", - srcs = [ - "f32-prelu.cc", - "prelu-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "f32_raddexpminusmax_test", srcs = [ @@ -1318,12 +1307,6 @@ xnnpack_unit_test( ########################## Size tests for the library ######################### -xnnpack_binary( - name = "operator_size_test", - srcs = ["operator-size.c"], - deps = ["//:XNNPACK"], -) - xnnpack_binary( name = "subgraph_size_test", srcs = ["subgraph-size.c"], @@ -1621,15 +1604,6 @@ xnnpack_unit_test( deps = OPERATOR_TEST_DEPS, ) -xnnpack_unit_test( - name = "prelu_nc_test", - srcs = [ - "prelu-nc.cc", - "prelu-operator-tester.h", - ], - deps = OPERATOR_TEST_DEPS, -) - xnnpack_unit_test( name = "resize_bilinear_nhwc_test", srcs = [ @@ -1778,6 +1752,7 @@ xnnpack_cxx_library( deps = [ ":replicable_random_device", ":subgraph_unary_tester", + ":tanh_operator_tester", "//:XNNPACK", "//:math", "//:node_type", @@ -1897,6 +1872,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:buffer", "//:common", + "//:math", "//:node_type", "//:operator_utils", "//:operators", @@ -1926,6 +1902,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:buffer", "//:common", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -1941,6 +1918,7 @@ xnnpack_unit_test( ":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -1963,6 +1941,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:buffer", "//:common", + "//:math", "//:node_type", "//:operator_utils", "//:operators", @@ -1982,6 +1961,7 @@ xnnpack_unit_test( ":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operator_utils", "//:operators", @@ -1999,6 +1979,7 @@ xnnpack_unit_test( ":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -2016,6 +1997,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:buffer", "//:common", + "//:math", "//:node_type", "//:operator_utils", "//:operators", @@ -2033,6 +2015,7 @@ xnnpack_unit_test( ":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -2075,6 +2058,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:aligned_allocator", "//:common", + "//:math", "//:node_type", "//:operators", "//:requantization", @@ -2092,6 +2076,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:aligned_allocator", "//:common", + "//:math", "//:node_type", "//:operators", "//:requantization", @@ -2109,6 +2094,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:aligned_allocator", "//:common", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -2125,6 +2111,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:aligned_allocator", "//:common", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -2140,6 +2127,7 @@ xnnpack_unit_test( 
":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operator_utils", "//:operators", @@ -2148,21 +2136,6 @@ xnnpack_unit_test( ], ) -xnnpack_unit_test( - name = "prelu_test", - srcs = [ - "prelu.cc", - ], - deps = [ - ":replicable_random_device", - "//:XNNPACK", - "//:buffer", - "//:node_type", - "//:operators", - "//:subgraph", - ], -) - xnnpack_unit_test( name = "rope_test", srcs = [ @@ -2189,6 +2162,7 @@ xnnpack_unit_test( "//:XNNPACK", "//:aligned_allocator", "//:common", + "//:math", "//:node_type", "//:subgraph", ], @@ -2234,6 +2208,7 @@ xnnpack_unit_test( ":replicable_random_device", "//:XNNPACK", "//:buffer", + "//:math", "//:node_type", "//:operators", "//:subgraph", @@ -2360,6 +2335,7 @@ xnnpack_unit_test( "//:allocation_type", "//:allocator", "//:buffer", + "//:math", "//:node_type", "//:params", "//:subgraph", diff --git a/test/abs.cc b/test/abs.cc index 8a037d79d61..1ce5c5ab860 100644 --- a/test/abs.cc +++ b/test/abs.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/average-pooling-2d.cc b/test/average-pooling-2d.cc index 6ad7c9ba4e7..9a7ef13ca89 100644 --- a/test/average-pooling-2d.cc +++ b/test/average-pooling-2d.cc @@ -14,12 +14,13 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator-utils.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template < diff --git a/test/avgpool-microkernel-tester.h b/test/avgpool-microkernel-tester.h index 101cd837852..3ac8b570286 100644 --- a/test/avgpool-microkernel-tester.h +++ b/test/avgpool-microkernel-tester.h @@ -21,10 +21,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/requantization.h" -#include "xnnpack/buffer.h" #include "next_prime.h" #include "replicable_random_device.h" diff --git a/test/avgpool-minmax.cc b/test/avgpool-minmax.cc index cf2f484d250..fe78497d95b 100644 --- a/test/avgpool-minmax.cc +++ b/test/avgpool-minmax.cc @@ -43,10 +43,10 @@ const XnnTestParam xnn_test_params[] = { #define XNN_UKERNEL_UNIPASS(arch_flags, ukernel, channel_tile, channel_scaled_tile, primary_tile, incremental_tile, init_params) \ { #ukernel, AvgPoolMicrokernelTester::Kernel{ukernel, init_params}, arch_flags, channel_tile, channel_scaled_tile, primary_tile, incremental_tile }, -#include "src/f16-avgpool/f16-avgpool-minmax.h" -#include "src/f16-pavgpool/f16-pavgpool-minmax.h" -#include "src/f32-avgpool/f32-avgpool-minmax.h" -#include "src/f32-pavgpool/f32-pavgpool-minmax.h" +#include "f16-avgpool/f16-avgpool-minmax.h" +#include "f16-pavgpool/f16-pavgpool-minmax.h" +#include "f32-avgpool/f32-avgpool-minmax.h" +#include "f32-pavgpool/f32-pavgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS @@ -57,7 +57,7 @@ const XnnTestParam xnn_test_params[] = { #define XNN_UKERNEL_UNIPASS(arch_flags, ukernel, requantize, channel_tile, channel_scaled_tile, primary_tile, incremental_tile, init_params) \ { #ukernel, AvgPoolMicrokernelTester::Kernel{ukernel, init_params, requantize}, arch_flags, channel_tile, channel_scaled_tile, primary_tile, incremental_tile }, -#include "src/qu8-avgpool/qu8-avgpool-minmax.h" +#include 
"qu8-avgpool/qu8-avgpool-minmax.h" #undef XNN_UKERNEL_MULTIPASS #undef XNN_UKERNEL_UNIPASS diff --git a/test/bankers-rounding.cc b/test/bankers-rounding.cc index 00cde38e0a3..77f508a8f44 100644 --- a/test/bankers-rounding.cc +++ b/test/bankers-rounding.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/batch-matrix-multiply.cc b/test/batch-matrix-multiply.cc index 39a2d49065b..c3f1dec9ad3 100644 --- a/test/batch-matrix-multiply.cc +++ b/test/batch-matrix-multiply.cc @@ -21,11 +21,12 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template diff --git a/test/bf16-vabs.cc b/test/bf16-vabs.cc index 09e65eb412c..9d925bb1332 100644 --- a/test/bf16-vabs.cc +++ b/test/bf16-vabs.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); -#include "src/bf16-vabs/bf16-vabs.h" +#include "bf16-vabs/bf16-vabs.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/binary-elementwise-nd.cc b/test/binary-elementwise-nd.cc index 874c80e9b7e..e4f26e82a26 100644 --- a/test/binary-elementwise-nd.cc +++ b/test/binary-elementwise-nd.cc @@ -77,6 +77,8 @@ class BinaryElementwiseOperatorTester { return "Minimum"; case xnn_binary_multiply: return "Multiply"; + case xnn_binary_prelu: + return "Prelu"; case xnn_binary_subtract: return "Subtract"; case xnn_binary_squared_difference: diff --git a/test/binary.cc b/test/binary.cc index 0a5da6b753d..7280dfd90f4 100644 --- a/test/binary.cc +++ b/test/binary.cc @@ -165,6 +165,8 @@ static const char* binary_operator_to_string( return "Minimum"; case xnn_binary_multiply: return "Multiply"; + case xnn_binary_prelu: + return "Prelu"; case xnn_binary_subtract: return "Subtract"; case xnn_binary_squared_difference: @@ -913,7 +915,8 @@ INSTANTIATE_TEST_SUITE_P(test, BinaryTestF16, testing::Values(xnn_binary_add, xnn_binary_subtract, xnn_binary_multiply, xnn_binary_divide, xnn_binary_maximum, xnn_binary_minimum, - xnn_binary_squared_difference), + xnn_binary_squared_difference, + xnn_binary_prelu), [](const auto& info) { return ToString(info.param); }); #endif INSTANTIATE_TEST_SUITE_P(test, BinaryTestF32, @@ -921,7 +924,8 @@ INSTANTIATE_TEST_SUITE_P(test, BinaryTestF32, xnn_binary_multiply, xnn_binary_divide, xnn_binary_maximum, xnn_binary_minimum, xnn_binary_copysign, - xnn_binary_squared_difference), + xnn_binary_squared_difference, + xnn_binary_prelu), [](const auto& info) { return ToString(info.param); }); INSTANTIATE_TEST_SUITE_P(test, BinaryTestS32, testing::Values(xnn_binary_multiply), diff --git a/test/ceiling.cc b/test/ceiling.cc index ee493fc69a5..13682a681b2 100644 --- a/test/ceiling.cc +++ b/test/ceiling.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/clamp.cc b/test/clamp.cc index df4d225920b..aa03eda65a7 100644 --- a/test/clamp.cc +++ b/test/clamp.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include 
"xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/concatenate2.cc b/test/concatenate2.cc index ff3ad0066d0..2a078aa1d69 100644 --- a/test/concatenate2.cc +++ b/test/concatenate2.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class Concatenate2Test : public ::testing::Test { diff --git a/test/concatenate3.cc b/test/concatenate3.cc index 3743e663bd9..5706b5df330 100644 --- a/test/concatenate3.cc +++ b/test/concatenate3.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class Concatenate3Test : public ::testing::Test { diff --git a/test/concatenate4.cc b/test/concatenate4.cc index d999a20d887..d37320daf9d 100644 --- a/test/concatenate4.cc +++ b/test/concatenate4.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class Concatenate4Test : public ::testing::Test { diff --git a/test/concatenate5.cc b/test/concatenate5.cc index 8546da9fb7d..e06af5b7887 100644 --- a/test/concatenate5.cc +++ b/test/concatenate5.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class Concatenate5Test : public ::testing::Test { diff --git a/test/conv-hwc2chw-microkernel-tester.h b/test/conv-hwc2chw-microkernel-tester.h index 3f32e698821..f7f852f658f 100644 --- a/test/conv-hwc2chw-microkernel-tester.h +++ b/test/conv-hwc2chw-microkernel-tester.h @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/pack.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class ConvHWC2CHWMicrokernelTester { diff --git a/test/convert-nc-eager.cc b/test/convert-nc-eager.cc index a09464790ea..af668afec9c 100644 --- a/test/convert-nc-eager.cc +++ b/test/convert-nc-eager.cc @@ -124,8 +124,6 @@ TEST(CONVERT_NC_F32_QS8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQS8(); } @@ -136,8 +134,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQS8(); } @@ -149,8 +145,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQS8(); } @@ -162,8 +156,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - 
.qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQS8(); } @@ -176,8 +168,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQS8(); } @@ -190,9 +180,7 @@ TEST(CONVERT_NC_F32_QS8, output_scale) { .batch_size(3) .channels(channels) .output_scale(output_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestRunF32toQS8(); } } @@ -208,9 +196,7 @@ TEST(CONVERT_NC_F32_QS8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestRunF32toQS8(); } } @@ -304,8 +290,6 @@ TEST(CONVERT_NC_QS16_QS8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -316,8 +300,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -329,8 +311,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -342,8 +322,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -356,8 +334,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -370,9 +346,7 @@ TEST(CONVERT_NC_QS16_QS8, input_scale) { .batch_size(3) .channels(channels) .input_scale(input_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestRunQS16toQS8(); } } @@ -388,8 +362,6 @@ TEST(CONVERT_NC_QS16_QS8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunQS16toQS8(); } @@ -401,8 +373,6 @@ TEST(CONVERT_NC_F32_QU8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -413,8 +383,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -426,8 +394,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -439,8 +405,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -453,8 +417,6 @@ TEST(CONVERT_NC_F32_QU8, 
small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -467,8 +429,6 @@ TEST(CONVERT_NC_F32_QU8, output_scale) { .batch_size(3) .channels(channels) .output_scale(output_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } @@ -485,8 +445,6 @@ TEST(CONVERT_NC_F32_QU8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestRunF32toQU8(); } diff --git a/test/convert-nc.cc b/test/convert-nc.cc index 9d1102a42b0..33eeee1a0ca 100644 --- a/test/convert-nc.cc +++ b/test/convert-nc.cc @@ -124,8 +124,6 @@ TEST(CONVERT_NC_F16_QD8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF16toQD8(); } @@ -136,8 +134,6 @@ TEST(CONVERT_NC_F16_QD8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF16toQD8(); } @@ -149,8 +145,6 @@ TEST(CONVERT_NC_F16_QD8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF16toQD8(); } @@ -162,8 +156,6 @@ TEST(CONVERT_NC_F16_QD8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF16toQD8(); } @@ -176,53 +168,16 @@ TEST(CONVERT_NC_F16_QD8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF16toQD8(); } } -TEST(CONVERT_NC_F16_QD8, output_min) { - for (int16_t qmin = std::numeric_limits::min(); - qmin < std::numeric_limits::max(); - qmin += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(qmin) - .qmax(std::numeric_limits::max()) - .iterations(3) - .TestF16toQD8(); - } - } -} - -TEST(CONVERT_NC_F16_QD8, output_max) { - for (int16_t qmax = std::numeric_limits::min() + 1; - qmax <= std::numeric_limits::max(); - qmax += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(qmax) - .iterations(3) - .TestF16toQD8(); - } - } -} TEST(CONVERT_NC_F32_QD8, unit_batch) { for (size_t channels = 1; channels < 100; channels++) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -233,8 +188,6 @@ TEST(CONVERT_NC_F32_QD8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -246,8 +199,6 @@ TEST(CONVERT_NC_F32_QD8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -259,8 +210,6 @@ TEST(CONVERT_NC_F32_QD8, small_batch_with_output_stride) { 
.batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -273,8 +222,6 @@ TEST(CONVERT_NC_F32_QD8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -285,8 +232,6 @@ TEST(CONVERT_NC_F32_QS8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQS8(); } @@ -297,8 +242,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQS8(); } @@ -310,8 +253,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQS8(); } @@ -323,8 +264,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQS8(); } @@ -337,8 +276,6 @@ TEST(CONVERT_NC_F32_QS8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQS8(); } @@ -351,9 +288,7 @@ TEST(CONVERT_NC_F32_QS8, output_scale) { .batch_size(3) .channels(channels) .output_scale(output_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestF32toQS8(); } } @@ -369,43 +304,7 @@ TEST(CONVERT_NC_F32_QS8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) - .TestF32toQS8(); - } - } -} - -TEST(CONVERT_NC_F32_QS8, output_min) { - for (int16_t qmin = std::numeric_limits::min(); - qmin < std::numeric_limits::max(); - qmin += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(qmin) - .qmax(std::numeric_limits::max()) - .iterations(3) - .TestF32toQS8(); - } - } -} - -TEST(CONVERT_NC_F32_QS8, output_max) { - for (int16_t qmax = std::numeric_limits::min() + 1; - qmax <= std::numeric_limits::max(); - qmax += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(qmax) - .iterations(3) + .iterations(3) .TestF32toQS8(); } } @@ -416,8 +315,6 @@ TEST(CONVERT_NC_F32_QU8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -428,8 +325,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -441,8 +336,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -454,8 +347,6 @@ 
TEST(CONVERT_NC_F32_QU8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -468,8 +359,6 @@ TEST(CONVERT_NC_F32_QU8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -482,8 +371,6 @@ TEST(CONVERT_NC_F32_QU8, output_scale) { .batch_size(3) .channels(channels) .output_scale(output_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQU8(); } @@ -500,42 +387,6 @@ TEST(CONVERT_NC_F32_QU8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) - .TestF32toQU8(); - } - } -} - -TEST(CONVERT_NC_F32_QU8, output_min) { - for (int16_t qmin = std::numeric_limits::min(); - qmin < std::numeric_limits::max(); - qmin += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(qmin) - .qmax(std::numeric_limits::max()) - .iterations(3) - .TestF32toQU8(); - } - } -} - -TEST(CONVERT_NC_F32_QU8, output_max) { - for (int16_t qmax = std::numeric_limits::min() + 1; - qmax <= std::numeric_limits::max(); - qmax += 51) - { - for (size_t channels = 1; channels < 100; channels++) { - ConvertOperatorTester() - .batch_size(3) - .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(qmax) .iterations(3) .TestF32toQU8(); } @@ -713,8 +564,6 @@ TEST(CONVERT_NC_QS16_QS8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestQS16toQS8(); } @@ -725,8 +574,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestQS16toQS8(); } @@ -738,8 +585,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestQS16toQS8(); } @@ -751,8 +596,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_output_stride) { .batch_size(3) .channels(channels) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestQS16toQS8(); } @@ -765,8 +608,6 @@ TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_and_output_stride) { .channels(channels) .input_stride(129) .output_stride(117) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestQS16toQS8(); } @@ -779,9 +620,7 @@ TEST(CONVERT_NC_QS16_QS8, input_scale) { .batch_size(3) .channels(channels) .input_scale(input_scale) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestQS16toQS8(); } } @@ -797,9 +636,7 @@ TEST(CONVERT_NC_QS16_QS8, output_zero_point) { .batch_size(3) .channels(channels) .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(3) + .iterations(3) .TestQS16toQS8(); } } @@ -893,8 +730,6 @@ TEST(CONVERT_NC_F32_QP8, unit_batch) { ConvertOperatorTester() .batch_size(1) .channels(channels) - .qmin(std::numeric_limits::min()) - 
.qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -905,8 +740,6 @@ TEST(CONVERT_NC_F32_QP8, small_batch) { ConvertOperatorTester() .batch_size(3) .channels(channels) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } @@ -918,8 +751,6 @@ TEST(CONVERT_NC_F32_QP8, small_batch_with_input_stride) { .batch_size(3) .channels(channels) .input_stride(129) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .iterations(3) .TestF32toQD8(); } diff --git a/test/convert-operator-tester.h b/test/convert-operator-tester.h index 2858ef3dc3c..e88eea376bf 100644 --- a/test/convert-operator-tester.h +++ b/test/convert-operator-tester.h @@ -109,24 +109,6 @@ class ConvertOperatorTester { return this->zero_point_; } - ConvertOperatorTester& qmin(int16_t qmin) { - this->qmin_ = qmin; - return *this; - } - - int16_t qmin() const { - return this->qmin_; - } - - ConvertOperatorTester& qmax(int16_t qmax) { - this->qmax_ = qmax; - return *this; - } - - int16_t qmax() const { - return this->qmax_; - } - ConvertOperatorTester& iterations(size_t iterations) { this->iterations_ = iterations; return *this; @@ -412,10 +394,6 @@ class ConvertOperatorTester { } void TestF32toQS8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -434,8 +412,8 @@ class ConvertOperatorTester { for (size_t i = 0; i < batch_size(); i++) { for (size_t c = 0; c < channels(); c++) { float scaled_input = input[i * input_stride() + c] * inv_scale; - scaled_input = std::min(scaled_input, float(qmax() - zero_point())); - scaled_input = std::max(scaled_input, float(qmin() - zero_point())); + scaled_input = std::min(scaled_input, float(std::numeric_limits::max() - zero_point())); + scaled_input = std::max(scaled_input, float(std::numeric_limits::min() - zero_point())); output_ref[i * channels() + c] = int8_t(std::lrintf(scaled_input) + long(zero_point())); } } @@ -446,7 +424,7 @@ class ConvertOperatorTester { ASSERT_EQ(xnn_status_success, xnn_create_convert_nc_f32_qs8( - output_scale(), int8_t(zero_point()), int8_t(qmin()), int8_t(qmax()), + output_scale(), int8_t(zero_point()), 0, &convert_op)); ASSERT_NE(nullptr, convert_op); @@ -469,10 +447,6 @@ class ConvertOperatorTester { } void TestF32toQU8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -491,8 +465,8 @@ class ConvertOperatorTester { for (size_t i = 0; i < batch_size(); i++) { for (size_t c = 0; c < channels(); c++) { float scaled_input = input[i * input_stride() + c] * inv_scale; - scaled_input = std::min(scaled_input, float(qmax() - zero_point())); - scaled_input = std::max(scaled_input, float(qmin() - zero_point())); + scaled_input = std::min(scaled_input, float(std::numeric_limits::max() - zero_point())); + scaled_input = std::max(scaled_input, float(std::numeric_limits::min() - zero_point())); output_ref[i * channels() + c] = uint8_t(std::lrintf(scaled_input) + long(zero_point())); } } @@ -503,7 +477,7 @@ class ConvertOperatorTester { ASSERT_EQ(xnn_status_success, xnn_create_convert_nc_f32_qu8( - output_scale(), uint8_t(zero_point()), uint8_t(qmin()), uint8_t(qmax()), + output_scale(), uint8_t(zero_point()), 
0, &convert_op)); ASSERT_NE(nullptr, convert_op); @@ -631,10 +605,6 @@ class ConvertOperatorTester { } void TestQS16toQS8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -819,10 +789,6 @@ class ConvertOperatorTester { } void TestRunF32toQS8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -841,8 +807,8 @@ class ConvertOperatorTester { for (size_t i = 0; i < batch_size(); i++) { for (size_t c = 0; c < channels(); c++) { float scaled_input = input[i * input_stride() + c] * inv_scale; - scaled_input = std::min(scaled_input, float(qmax() - zero_point())); - scaled_input = std::max(scaled_input, float(qmin() - zero_point())); + scaled_input = std::min(scaled_input, float(std::numeric_limits::max() - zero_point())); + scaled_input = std::max(scaled_input, float(std::numeric_limits::min() - zero_point())); output_ref[i * channels() + c] = int8_t(std::lrintf(scaled_input) + long(zero_point())); } } @@ -910,10 +876,6 @@ class ConvertOperatorTester { } void TestRunQS16toQS8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -960,10 +922,6 @@ class ConvertOperatorTester { } void TestRunF32toQU8() const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); @@ -982,8 +940,8 @@ class ConvertOperatorTester { for (size_t i = 0; i < batch_size(); i++) { for (size_t c = 0; c < channels(); c++) { float scaled_input = input[i * input_stride() + c] * inv_scale; - scaled_input = std::min(scaled_input, float(qmax() - zero_point())); - scaled_input = std::max(scaled_input, float(qmin() - zero_point())); + scaled_input = std::min(scaled_input, float(std::numeric_limits::max() - zero_point())); + scaled_input = std::max(scaled_input, float(std::numeric_limits::min() - zero_point())); output_ref[i * channels() + c] = uint8_t(std::lrintf(scaled_input) + long(zero_point())); } } @@ -1057,7 +1015,5 @@ class ConvertOperatorTester { float input_scale_{150.0f}; float output_scale_{3.0f}; int16_t zero_point_{1}; - int16_t qmin_{std::numeric_limits::min()}; - int16_t qmax_{std::numeric_limits::max()}; size_t iterations_{15}; }; diff --git a/test/convert.cc b/test/convert.cc index ca2fe7e0df3..931f18d5f51 100644 --- a/test/convert.cc +++ b/test/convert.cc @@ -13,6 +13,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" @@ -484,7 +485,7 @@ TEST_F(ConvertTestF32ToQS8, matches_operator_api) // Call operator API. xnn_operator_t op = nullptr; const xnn_status status = xnn_create_convert_nc_f32_qs8( - scale, signed_zero_point, INT8_MIN, INT8_MAX, /*flags=*/0, &op); + scale, signed_zero_point, /*flags=*/0, &op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } @@ -593,7 +594,7 @@ TEST_F(ConvertTestF32ToQU8, matches_operator_api) // Call operator API. 
xnn_operator_t op = nullptr; const xnn_status status = xnn_create_convert_nc_f32_qu8( - scale, unsigned_zero_point, 0, UINT8_MAX, /*flags=*/0, &op); + scale, unsigned_zero_point, /*flags=*/0, &op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } diff --git a/test/convolution-2d.cc b/test/convolution-2d.cc index 43b98339f13..c0d52531580 100644 --- a/test/convolution-2d.cc +++ b/test/convolution-2d.cc @@ -15,13 +15,14 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator-utils.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "convolution-test-helpers.h" #include "replicable_random_device.h" diff --git a/test/convolution-operator-tester.h b/test/convolution-operator-tester.h index 298f687e187..3e2b1c3b9e0 100644 --- a/test/convolution-operator-tester.h +++ b/test/convolution-operator-tester.h @@ -22,10 +22,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/cache.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" #include "convolution-test-helpers.h" #include "replicable_random_device.h" #include "pthreadpool.h" diff --git a/test/copy.cc b/test/copy.cc index 693981d4924..ceafa6564a9 100644 --- a/test/copy.cc +++ b/test/copy.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/deconvolution-2d.cc b/test/deconvolution-2d.cc index de1975b5272..01c80f733bd 100644 --- a/test/deconvolution-2d.cc +++ b/test/deconvolution-2d.cc @@ -15,12 +15,13 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator-utils.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class DeconvolutionTestBase : public ::testing::Test { diff --git a/test/deconvolution-operator-tester.h b/test/deconvolution-operator-tester.h index e83d4c0b968..831d57b7494 100644 --- a/test/deconvolution-operator-tester.h +++ b/test/deconvolution-operator-tester.h @@ -24,10 +24,11 @@ #include #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/cache.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class DeconvolutionOperatorTester { diff --git a/test/depth-to-space-2d.cc b/test/depth-to-space-2d.cc index 23c9808e6cc..6a341fbdeb1 100644 --- a/test/depth-to-space-2d.cc +++ b/test/depth-to-space-2d.cc @@ -18,10 +18,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class DepthToSpaceTest : public ::testing::Test { diff --git a/test/depthwise-convolution-2d.cc b/test/depthwise-convolution-2d.cc index 91fbab1d2b5..44d51f6f71a 100644 --- a/test/depthwise-convolution-2d.cc +++ b/test/depthwise-convolution-2d.cc @@ -16,13 +16,14 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include 
"xnnpack/node-type.h" #include "xnnpack/operator-utils.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "convolution-test-helpers.h" #include "replicable_random_device.h" diff --git a/test/dwconv2d-microkernel-tester.h b/test/dwconv2d-microkernel-tester.h index eb99ab1d3f7..1a25dade488 100644 --- a/test/dwconv2d-microkernel-tester.h +++ b/test/dwconv2d-microkernel-tester.h @@ -19,9 +19,10 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class DWConv2DMicrokernelTester { diff --git a/test/elu.cc b/test/elu.cc index 1694a195ae4..39866bfa7da 100644 --- a/test/elu.cc +++ b/test/elu.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/even-split2.cc b/test/even-split2.cc index bcd6b004a30..45bd14d48b5 100644 --- a/test/even-split2.cc +++ b/test/even-split2.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class EvenSplit2Test : public ::testing::Test { diff --git a/test/even-split3.cc b/test/even-split3.cc index c72bacc8d9d..28418947e69 100644 --- a/test/even-split3.cc +++ b/test/even-split3.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class EvenSplit3Test : public ::testing::Test { diff --git a/test/even-split4.cc b/test/even-split4.cc index b327c868f48..16f968feb72 100644 --- a/test/even-split4.cc +++ b/test/even-split4.cc @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class EvenSplit4Test : public ::testing::Test { diff --git a/test/f16-dwconv-minmax-multipass.cc b/test/f16-dwconv-minmax-multipass.cc index 994598c292f..d87f21b8244 100644 --- a/test/f16-dwconv-minmax-multipass.cc +++ b/test/f16-dwconv-minmax-multipass.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f16-dwconv/f16-dwconv-minmax-multipass.h" +#include "f16-dwconv/f16-dwconv-minmax-multipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-dwconv-minmax-unipass.cc b/test/f16-dwconv-minmax-unipass.cc index fbbce5862f8..624e20f0ba2 100644 --- a/test/f16-dwconv-minmax-unipass.cc +++ b/test/f16-dwconv-minmax-unipass.cc @@ -204,5 +204,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f16-dwconv/f16-dwconv-minmax-unipass.h" +#include "f16-dwconv/f16-dwconv-minmax-unipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-f32-vcvt.cc b/test/f16-f32-vcvt.cc index bd583c3e2df..d723077bf52 100644 --- a/test/f16-f32-vcvt.cc +++ b/test/f16-f32-vcvt.cc @@ -18,5 +18,5 @@ XNN_TEST_CVT_BATCH_EQ(ukernel, 
arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\ XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/f16-f32-vcvt/f16-f32-vcvt.h" +#include "f16-f32-vcvt/f16-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/f16-prelu.cc b/test/f16-prelu.cc deleted file mode 100644 index 36c4e7ef28e..00000000000 --- a/test/f16-prelu.cc +++ /dev/null @@ -1,491 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: test/f16-prelu.yaml -// Generator: tools/generate-prelu-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/prelu.h" -#include "prelu-microkernel-tester.h" - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - TEST(F16_PRELU__NEONFP16ARITH_2X8, channels_eq_8) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, channels_div_8) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, channels_lt_8) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, channels_gt_8) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, rows_lt_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, rows_div_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, rows_gt_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, output_stride) { 
- TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X8, inplace) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x8); - } - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - TEST(F16_PRELU__NEONFP16ARITH_2X16, channels_eq_16) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, channels_div_16) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, channels_lt_16) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, channels_gt_16) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, rows_lt_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, rows_div_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, rows_gt_2) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } - - TEST(F16_PRELU__NEONFP16ARITH_2X16, output_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } - - 
TEST(F16_PRELU__NEONFP16ARITH_2X16, inplace) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__neonfp16arith_2x16); - } - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_PRELU__F16C_2X8, channels_eq_8) { - TEST_REQUIRES_X86_F16C; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - - TEST(F16_PRELU__F16C_2X8, channels_div_8) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - - TEST(F16_PRELU__F16C_2X8, channels_lt_8) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - - TEST(F16_PRELU__F16C_2X8, channels_gt_8) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - - TEST(F16_PRELU__F16C_2X8, rows_lt_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } - - TEST(F16_PRELU__F16C_2X8, rows_div_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } - - TEST(F16_PRELU__F16C_2X8, rows_gt_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } - - TEST(F16_PRELU__F16C_2X8, input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } - - TEST(F16_PRELU__F16C_2X8, output_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } - - TEST(F16_PRELU__F16C_2X8, inplace) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x8); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_PRELU__F16C_2X16, channels_eq_16) { - TEST_REQUIRES_X86_F16C; - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - - TEST(F16_PRELU__F16C_2X16, channels_div_16) { - 
TEST_REQUIRES_X86_F16C; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - - TEST(F16_PRELU__F16C_2X16, channels_lt_16) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - - TEST(F16_PRELU__F16C_2X16, channels_gt_16) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - - TEST(F16_PRELU__F16C_2X16, rows_lt_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } - - TEST(F16_PRELU__F16C_2X16, rows_div_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } - - TEST(F16_PRELU__F16C_2X16, rows_gt_2) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } - - TEST(F16_PRELU__F16C_2X16, input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } - - TEST(F16_PRELU__F16C_2X16, output_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } - - TEST(F16_PRELU__F16C_2X16, inplace) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f16_prelu_ukernel__f16c_2x16); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/test/f16-prelu.yaml b/test/f16-prelu.yaml deleted file mode 100644 index 22a92cd0b51..00000000000 --- a/test/f16-prelu.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright 2020 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON+FP16ARITH -- name: xnn_f16_prelu_ukernel__neonfp16arith_2x8 -- name: xnn_f16_prelu_ukernel__neonfp16arith_2x16 - -# x86 F16C -- name: xnn_f16_prelu_ukernel__f16c_2x8 -- name: xnn_f16_prelu_ukernel__f16c_2x16 diff --git a/test/f16-qs8-vcvt.cc b/test/f16-qs8-vcvt.cc index 6d42943dfb4..89e963b8d47 100644 --- a/test/f16-qs8-vcvt.cc +++ b/test/f16-qs8-vcvt.cc @@ -23,5 +23,5 @@ XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, u \ \ XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/f16-qs8-vcvt/f16-qs8-vcvt.h" +#include "f16-qs8-vcvt/f16-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/f16-vabs.cc b/test/f16-vabs.cc index 61f82ac0ed3..bd9953a00dd 100644 --- a/test/f16-vabs.cc +++ b/test/f16-vabs.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); -#include "src/f16-vabs/f16-vabs.h" +#include "f16-vabs/f16-vabs.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vadd.cc b/test/f16-vadd.cc index 6bf82168024..d9568a6e34d 100644 --- a/test/f16-vadd.cc +++ b/test/f16-vadd.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f16-vbinary/f16-vadd.h" +#include "f16-vbinary/f16-vadd.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vaddc.cc b/test/f16-vaddc.cc index f1717a8d5c7..68a7361cbcc 100644 --- a/test/f16-vaddc.cc +++ b/test/f16-vaddc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f16-vbinary/f16-vaddc.h" +#include "f16-vbinary/f16-vaddc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vclamp.cc b/test/f16-vclamp.cc index 3a3b7511687..db5fd14048a 100644 --- a/test/f16-vclamp.cc +++ b/test/f16-vclamp.cc @@ -34,5 +34,5 @@ XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vclamp/f16-vclamp.h" +#include "f16-vclamp/f16-vclamp.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vcmul.cc b/test/f16-vcmul.cc index f0492ca5190..9f551f262fa 100644 --- a/test/f16-vcmul.cc +++ b/test/f16-vcmul.cc @@ -21,5 +21,5 @@ 
XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/f16-vbinary/f16-vcmul.h" +#include "f16-vbinary/f16-vcmul.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vdiv.cc b/test/f16-vdiv.cc index 6f6b91f6665..f1a1e72c1cd 100644 --- a/test/f16-vdiv.cc +++ b/test/f16-vdiv.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f16-vbinary/f16-vdiv.h" +#include "f16-vbinary/f16-vdiv.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vdivc.cc b/test/f16-vdivc.cc index b30b4789328..522d50d613b 100644 --- a/test/f16-vdivc.cc +++ b/test/f16-vdivc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f16-vbinary/f16-vdivc.h" +#include "f16-vbinary/f16-vdivc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-velu.cc b/test/f16-velu.cc index b5ab981aff4..d1ddb4f7ce1 100644 --- a/test/f16-velu.cc +++ b/test/f16-velu.cc @@ -76,5 +76,5 @@ TEST(ukernel, beta) { } \ } \ } -#include "src/f16-velu/f16-velu.h" +#include "f16-velu/f16-velu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vhswish.cc b/test/f16-vhswish.cc index 39bf2bdb63a..f8b4a3b5433 100644 --- a/test/f16-vhswish.cc +++ b/test/f16-vhswish.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vhswish/f16-vhswish.h" +#include "f16-vhswish/f16-vhswish.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vlrelu.cc b/test/f16-vlrelu.cc index 2f931a115b0..eea1518a1e6 100644 --- a/test/f16-vlrelu.cc +++ b/test/f16-vlrelu.cc @@ -46,5 +46,5 @@ TEST(ukernel, slope) { } \ } \ } -#include "src/f16-vlrelu/f16-vlrelu.h" +#include "f16-vlrelu/f16-vlrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmax.cc b/test/f16-vmax.cc index 598de281d27..24509ce02bd 100644 --- a/test/f16-vmax.cc +++ b/test/f16-vmax.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, 
arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); -#include "src/f16-vbinary/f16-vmax.h" +#include "f16-vbinary/f16-vmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmaxc.cc b/test/f16-vmaxc.cc index a27e02cd6b6..daaafe58ba0 100644 --- a/test/f16-vmaxc.cc +++ b/test/f16-vmaxc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); -#include "src/f16-vbinary/f16-vmaxc.h" +#include "f16-vbinary/f16-vmaxc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmin.cc b/test/f16-vmin.cc index aca129bf78b..4547f969cb3 100644 --- a/test/f16-vmin.cc +++ b/test/f16-vmin.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); -#include "src/f16-vbinary/f16-vmin.h" +#include "f16-vbinary/f16-vmin.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vminc.cc b/test/f16-vminc.cc index da6c5930139..c186bb4f87b 100644 --- a/test/f16-vminc.cc +++ b/test/f16-vminc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); -#include "src/f16-vbinary/f16-vminc.h" +#include "f16-vbinary/f16-vminc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmul.cc b/test/f16-vmul.cc index cc169da9dbb..23ea43be68a 100644 --- a/test/f16-vmul.cc +++ b/test/f16-vmul.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f16-vbinary/f16-vmul.h" +#include "f16-vbinary/f16-vmul.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vmulc.cc b/test/f16-vmulc.cc index 9ff93b385b0..b41f8e486ce 100644 --- a/test/f16-vmulc.cc +++ b/test/f16-vmulc.cc @@ -21,5 
+21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f16-vbinary/f16-vmulc.h" +#include "f16-vbinary/f16-vmulc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vneg.cc b/test/f16-vneg.cc index add84b6d095..5cf6e86eda8 100644 --- a/test/f16-vneg.cc +++ b/test/f16-vneg.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); -#include "src/f16-vneg/f16-vneg.h" +#include "f16-vneg/f16-vneg.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vprelu.cc b/test/f16-vprelu.cc index 65ab0cce57b..4aca438164e 100644 --- a/test/f16-vprelu.cc +++ b/test/f16-vprelu.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); -#include "src/f16-vbinary/f16-vprelu.h" +#include "f16-vbinary/f16-vprelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vpreluc.cc b/test/f16-vpreluc.cc index 55ee99e7415..9ac5d00a873 100644 --- a/test/f16-vpreluc.cc +++ b/test/f16-vpreluc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); -#include "src/f16-vbinary/f16-vpreluc.h" +#include "f16-vbinary/f16-vpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrdivc.cc b/test/f16-vrdivc.cc index fc73faa51bf..55ff00068ae 100644 --- a/test/f16-vrdivc.cc +++ b/test/f16-vrdivc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); -#include "src/f16-vbinary/f16-vrdivc.h" +#include "f16-vbinary/f16-vrdivc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrndd.cc b/test/f16-vrndd.cc index 
18e84573163..8690fffb912 100644 --- a/test/f16-vrndd.cc +++ b/test/f16-vrndd.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); -#include "src/f16-vrnd/f16-vrndd.h" +#include "f16-vrnd/f16-vrndd.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrndne.cc b/test/f16-vrndne.cc index 103045e68c5..d11342514cb 100644 --- a/test/f16-vrndne.cc +++ b/test/f16-vrndne.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); -#include "src/f16-vrnd/f16-vrndne.h" +#include "f16-vrnd/f16-vrndne.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrndu.cc b/test/f16-vrndu.cc index f265fcd4173..44f229fff03 100644 --- a/test/f16-vrndu.cc +++ b/test/f16-vrndu.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); -#include "src/f16-vrnd/f16-vrndu.h" +#include "f16-vrnd/f16-vrndu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrndz.cc b/test/f16-vrndz.cc index db57af8077c..5cc01810577 100644 --- a/test/f16-vrndz.cc +++ b/test/f16-vrndz.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); -#include "src/f16-vrnd/f16-vrndz.h" +#include "f16-vrnd/f16-vrndz.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrpreluc.cc b/test/f16-vrpreluc.cc index b769c7763b0..65ca49c6c0b 100644 --- a/test/f16-vrpreluc.cc +++ b/test/f16-vrpreluc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); -#include "src/f16-vbinary/f16-vrpreluc.h" +#include "f16-vbinary/f16-vrpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrsqrt.cc b/test/f16-vrsqrt.cc index 777148fbc27..ffe76b20cff 100644 --- a/test/f16-vrsqrt.cc +++ b/test/f16-vrsqrt.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ 
XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vrsqrt/f16-vrsqrt.h" +#include "f16-vrsqrt/f16-vrsqrt.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vrsubc.cc b/test/f16-vrsubc.cc index 74cb4632e4e..873aba2531f 100644 --- a/test/f16-vrsubc.cc +++ b/test/f16-vrsubc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); -#include "src/f16-vbinary/f16-vrsubc.h" +#include "f16-vbinary/f16-vrsubc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsigmoid.cc b/test/f16-vsigmoid.cc index c9fa13f1f1b..661f486e033 100644 --- a/test/f16-vsigmoid.cc +++ b/test/f16-vsigmoid.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vsigmoid/f16-vsigmoid.h" +#include "f16-vsigmoid/f16-vsigmoid.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsqr.cc b/test/f16-vsqr.cc index af906dca2b9..97dedacf945 100644 --- a/test/f16-vsqr.cc +++ b/test/f16-vsqr.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); -#include "src/f16-vsqr/f16-vsqr.h" +#include "f16-vsqr/f16-vsqr.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsqrdiff.cc b/test/f16-vsqrdiff.cc index dd60bda0a1c..a621c3608f6 100644 --- a/test/f16-vsqrdiff.cc +++ b/test/f16-vsqrdiff.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); -#include "src/f16-vbinary/f16-vsqrdiff.h" +#include "f16-vbinary/f16-vsqrdiff.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsqrdiffc.cc b/test/f16-vsqrdiffc.cc index 3682b324ac5..9a631468af4 100644 --- a/test/f16-vsqrdiffc.cc +++ b/test/f16-vsqrdiffc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); -#include 
"src/f16-vbinary/f16-vsqrdiffc.h" +#include "f16-vbinary/f16-vsqrdiffc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsqrt.cc b/test/f16-vsqrt.cc index 47a110bf40f..66d396ad7fe 100644 --- a/test/f16-vsqrt.cc +++ b/test/f16-vsqrt.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vsqrt/f16-vsqrt.h" +#include "f16-vsqrt/f16-vsqrt.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsub.cc b/test/f16-vsub.cc index aba129113a1..96e41b52dc3 100644 --- a/test/f16-vsub.cc +++ b/test/f16-vsub.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f16-vbinary/f16-vsub.h" +#include "f16-vbinary/f16-vsub.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vsubc.cc b/test/f16-vsubc.cc index ea52fcd5567..6b66e4a5938 100644 --- a/test/f16-vsubc.cc +++ b/test/f16-vsubc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f16-vbinary/f16-vsubc.h" +#include "f16-vbinary/f16-vsubc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f16-vtanh.cc b/test/f16-vtanh.cc index 157e956a26e..924acb8d415 100644 --- a/test/f16-vtanh.cc +++ b/test/f16-vtanh.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f16-vtanh/f16-vtanh.h" +#include "f16-vtanh/f16-vtanh.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-minmax-multipass.cc b/test/f32-dwconv-minmax-multipass.cc index 4b1ae109c14..a032524a298 100644 --- a/test/f32-dwconv-minmax-multipass.cc +++ b/test/f32-dwconv-minmax-multipass.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f32-dwconv/f32-dwconv-minmax-multipass.h" +#include "f32-dwconv/f32-dwconv-minmax-multipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-minmax-unipass.cc b/test/f32-dwconv-minmax-unipass.cc index 00cf20ccf08..3e13c8ad80b 100644 --- a/test/f32-dwconv-minmax-unipass.cc +++ b/test/f32-dwconv-minmax-unipass.cc @@ -204,5 +204,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f32-dwconv/f32-dwconv-minmax-unipass.h" +#include 
"f32-dwconv/f32-dwconv-minmax-unipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-multipass.cc b/test/f32-dwconv-multipass.cc index 279dfc94b4c..e1528fec836 100644 --- a/test/f32-dwconv-multipass.cc +++ b/test/f32-dwconv-multipass.cc @@ -273,5 +273,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f32-dwconv/f32-dwconv-multipass.h" +#include "f32-dwconv/f32-dwconv-multipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-dwconv-unipass.cc b/test/f32-dwconv-unipass.cc index db24377ba3b..f847307c615 100644 --- a/test/f32-dwconv-unipass.cc +++ b/test/f32-dwconv-unipass.cc @@ -151,5 +151,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/f32-dwconv/f32-dwconv-unipass.h" +#include "f32-dwconv/f32-dwconv-unipass.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-f16-vcvt.cc b/test/f32-f16-vcvt.cc index 4688b03bfa0..6ca121fd452 100644 --- a/test/f32-f16-vcvt.cc +++ b/test/f32-f16-vcvt.cc @@ -18,5 +18,5 @@ XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\ XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/f32-f16-vcvt/f32-f16-vcvt.h" +#include "f32-f16-vcvt/f32-f16-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/f32-prelu.cc b/test/f32-prelu.cc deleted file mode 100644 index 58fdf245375..00000000000 --- a/test/f32-prelu.cc +++ /dev/null @@ -1,6259 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/f32-prelu.yaml -// Generator: tools/generate-prelu-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/prelu.h" -#include "prelu-microkernel-tester.h" - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_1X4, channels_eq_4) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(1) - .channels(4) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - - TEST(F32_PRELU__NEON_1X4, channels_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - - TEST(F32_PRELU__NEON_1X4, channels_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - - TEST(F32_PRELU__NEON_1X4, channels_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - - TEST(F32_PRELU__NEON_1X4, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - } - - TEST(F32_PRELU__NEON_1X4, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - } - - TEST(F32_PRELU__NEON_1X4, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - } - - TEST(F32_PRELU__NEON_1X4, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x4); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_1X8, channels_eq_8) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(1) - .channels(8) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - - TEST(F32_PRELU__NEON_1X8, channels_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - - TEST(F32_PRELU__NEON_1X8, channels_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - - TEST(F32_PRELU__NEON_1X8, channels_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - - TEST(F32_PRELU__NEON_1X8, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t 
channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - } - - TEST(F32_PRELU__NEON_1X8, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - } - - TEST(F32_PRELU__NEON_1X8, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - } - - TEST(F32_PRELU__NEON_1X8, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x8); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_1X16, channels_eq_16) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(1) - .channels(16) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - - TEST(F32_PRELU__NEON_1X16, channels_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - - TEST(F32_PRELU__NEON_1X16, channels_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - - TEST(F32_PRELU__NEON_1X16, channels_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - - TEST(F32_PRELU__NEON_1X16, rows_gt_1) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - } - - TEST(F32_PRELU__NEON_1X16, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - } - - TEST(F32_PRELU__NEON_1X16, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - } - - TEST(F32_PRELU__NEON_1X16, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_1x16); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - 
TEST(F32_PRELU__NEON_2X4, channels_eq_4) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - - TEST(F32_PRELU__NEON_2X4, channels_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - - TEST(F32_PRELU__NEON_2X4, channels_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - - TEST(F32_PRELU__NEON_2X4, channels_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - - TEST(F32_PRELU__NEON_2X4, rows_lt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } - - TEST(F32_PRELU__NEON_2X4, rows_div_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } - - TEST(F32_PRELU__NEON_2X4, rows_gt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } - - TEST(F32_PRELU__NEON_2X4, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } - - TEST(F32_PRELU__NEON_2X4, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } - - TEST(F32_PRELU__NEON_2X4, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x4); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_2X8, channels_eq_8) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - - TEST(F32_PRELU__NEON_2X8, channels_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - - TEST(F32_PRELU__NEON_2X8, channels_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - - TEST(F32_PRELU__NEON_2X8, channels_gt_8) { - 
TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - - TEST(F32_PRELU__NEON_2X8, rows_lt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } - - TEST(F32_PRELU__NEON_2X8, rows_div_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } - - TEST(F32_PRELU__NEON_2X8, rows_gt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } - - TEST(F32_PRELU__NEON_2X8, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } - - TEST(F32_PRELU__NEON_2X8, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } - - TEST(F32_PRELU__NEON_2X8, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x8); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_2X16, channels_eq_16) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - - TEST(F32_PRELU__NEON_2X16, channels_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - - TEST(F32_PRELU__NEON_2X16, channels_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - - TEST(F32_PRELU__NEON_2X16, channels_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - - TEST(F32_PRELU__NEON_2X16, rows_lt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } - - TEST(F32_PRELU__NEON_2X16, rows_div_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - 
PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } - - TEST(F32_PRELU__NEON_2X16, rows_gt_2) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } - - TEST(F32_PRELU__NEON_2X16, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } - - TEST(F32_PRELU__NEON_2X16, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } - - TEST(F32_PRELU__NEON_2X16, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_2x16); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_4X4, channels_eq_4) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(4) - .channels(4) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - - TEST(F32_PRELU__NEON_4X4, channels_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - - TEST(F32_PRELU__NEON_4X4, channels_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - - TEST(F32_PRELU__NEON_4X4, channels_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - - TEST(F32_PRELU__NEON_4X4, rows_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } - - TEST(F32_PRELU__NEON_4X4, rows_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } - - TEST(F32_PRELU__NEON_4X4, rows_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } - - TEST(F32_PRELU__NEON_4X4, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - 
.Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } - - TEST(F32_PRELU__NEON_4X4, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } - - TEST(F32_PRELU__NEON_4X4, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x4); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_4X8, channels_eq_8) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(4) - .channels(8) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - - TEST(F32_PRELU__NEON_4X8, channels_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - - TEST(F32_PRELU__NEON_4X8, channels_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - - TEST(F32_PRELU__NEON_4X8, channels_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - - TEST(F32_PRELU__NEON_4X8, rows_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } - - TEST(F32_PRELU__NEON_4X8, rows_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } - - TEST(F32_PRELU__NEON_4X8, rows_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } - - TEST(F32_PRELU__NEON_4X8, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } - - TEST(F32_PRELU__NEON_4X8, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } - - TEST(F32_PRELU__NEON_4X8, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - 
.Test(xnn_f32_prelu_ukernel__neon_4x8); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_PRELU__NEON_4X16, channels_eq_16) { - TEST_REQUIRES_ARM_NEON; - PReLUMicrokernelTester() - .rows(4) - .channels(16) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - - TEST(F32_PRELU__NEON_4X16, channels_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - - TEST(F32_PRELU__NEON_4X16, channels_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - - TEST(F32_PRELU__NEON_4X16, channels_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - - TEST(F32_PRELU__NEON_4X16, rows_lt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } - - TEST(F32_PRELU__NEON_4X16, rows_div_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } - - TEST(F32_PRELU__NEON_4X16, rows_gt_4) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } - - TEST(F32_PRELU__NEON_4X16, input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } - - TEST(F32_PRELU__NEON_4X16, output_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } - - TEST(F32_PRELU__NEON_4X16, inplace) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__neon_4x16); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE_2X4, channels_eq_4) { - TEST_REQUIRES_X86_SSE; - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - - TEST(F32_PRELU__SSE_2X4, channels_div_4) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - - TEST(F32_PRELU__SSE_2X4, channels_lt_4) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; 
channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - - TEST(F32_PRELU__SSE_2X4, channels_gt_4) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - - TEST(F32_PRELU__SSE_2X4, rows_lt_2) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } - - TEST(F32_PRELU__SSE_2X4, rows_div_2) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } - - TEST(F32_PRELU__SSE_2X4, rows_gt_2) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } - - TEST(F32_PRELU__SSE_2X4, input_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } - - TEST(F32_PRELU__SSE_2X4, output_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } - - TEST(F32_PRELU__SSE_2X4, inplace) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x4); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE_2X8, channels_eq_8) { - TEST_REQUIRES_X86_SSE; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - - TEST(F32_PRELU__SSE_2X8, channels_div_8) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - - TEST(F32_PRELU__SSE_2X8, channels_lt_8) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - - TEST(F32_PRELU__SSE_2X8, channels_gt_8) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - - TEST(F32_PRELU__SSE_2X8, rows_lt_2) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } - - TEST(F32_PRELU__SSE_2X8, rows_div_2) { - TEST_REQUIRES_X86_SSE; - for (size_t 
rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } - - TEST(F32_PRELU__SSE_2X8, rows_gt_2) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } - - TEST(F32_PRELU__SSE_2X8, input_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } - - TEST(F32_PRELU__SSE_2X8, output_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } - - TEST(F32_PRELU__SSE_2X8, inplace) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse_2x8); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE2_2X4, channels_eq_4) { - TEST_REQUIRES_X86_SSE2; - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - - TEST(F32_PRELU__SSE2_2X4, channels_div_4) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - - TEST(F32_PRELU__SSE2_2X4, channels_lt_4) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - - TEST(F32_PRELU__SSE2_2X4, channels_gt_4) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - - TEST(F32_PRELU__SSE2_2X4, rows_lt_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } - - TEST(F32_PRELU__SSE2_2X4, rows_div_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } - - TEST(F32_PRELU__SSE2_2X4, rows_gt_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } - - TEST(F32_PRELU__SSE2_2X4, input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - 
.rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } - - TEST(F32_PRELU__SSE2_2X4, output_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } - - TEST(F32_PRELU__SSE2_2X4, inplace) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x4); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE2_2X8, channels_eq_8) { - TEST_REQUIRES_X86_SSE2; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - - TEST(F32_PRELU__SSE2_2X8, channels_div_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - - TEST(F32_PRELU__SSE2_2X8, channels_lt_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - - TEST(F32_PRELU__SSE2_2X8, channels_gt_8) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - - TEST(F32_PRELU__SSE2_2X8, rows_lt_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } - - TEST(F32_PRELU__SSE2_2X8, rows_div_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } - - TEST(F32_PRELU__SSE2_2X8, rows_gt_2) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } - - TEST(F32_PRELU__SSE2_2X8, input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } - - TEST(F32_PRELU__SSE2_2X8, output_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } - - TEST(F32_PRELU__SSE2_2X8, inplace) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - 
.channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse2_2x8); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE41_2X4, channels_eq_4) { - TEST_REQUIRES_X86_SSE41; - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - - TEST(F32_PRELU__SSE41_2X4, channels_div_4) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - - TEST(F32_PRELU__SSE41_2X4, channels_lt_4) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - - TEST(F32_PRELU__SSE41_2X4, channels_gt_4) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - - TEST(F32_PRELU__SSE41_2X4, rows_lt_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } - - TEST(F32_PRELU__SSE41_2X4, rows_div_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } - - TEST(F32_PRELU__SSE41_2X4, rows_gt_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } - - TEST(F32_PRELU__SSE41_2X4, input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } - - TEST(F32_PRELU__SSE41_2X4, output_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } - - TEST(F32_PRELU__SSE41_2X4, inplace) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x4); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__SSE41_2X8, channels_eq_8) { - TEST_REQUIRES_X86_SSE41; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - - TEST(F32_PRELU__SSE41_2X8, channels_div_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - - TEST(F32_PRELU__SSE41_2X8, channels_lt_8) { - 
TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - - TEST(F32_PRELU__SSE41_2X8, channels_gt_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - - TEST(F32_PRELU__SSE41_2X8, rows_lt_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } - - TEST(F32_PRELU__SSE41_2X8, rows_div_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } - - TEST(F32_PRELU__SSE41_2X8, rows_gt_2) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } - - TEST(F32_PRELU__SSE41_2X8, input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } - - TEST(F32_PRELU__SSE41_2X8, output_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } - - TEST(F32_PRELU__SSE41_2X8, inplace) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__sse41_2x8); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__AVX_2X8, channels_eq_8) { - TEST_REQUIRES_X86_AVX; - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - - TEST(F32_PRELU__AVX_2X8, channels_div_8) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - - TEST(F32_PRELU__AVX_2X8, channels_lt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - - TEST(F32_PRELU__AVX_2X8, channels_gt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - - TEST(F32_PRELU__AVX_2X8, rows_lt_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } - - TEST(F32_PRELU__AVX_2X8, rows_div_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } - - TEST(F32_PRELU__AVX_2X8, rows_gt_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } - - TEST(F32_PRELU__AVX_2X8, input_stride) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } - - TEST(F32_PRELU__AVX_2X8, output_stride) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } - - TEST(F32_PRELU__AVX_2X8, inplace) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x8); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_PRELU__AVX_2X16, channels_eq_16) { - TEST_REQUIRES_X86_AVX; - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - - TEST(F32_PRELU__AVX_2X16, channels_div_16) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - - TEST(F32_PRELU__AVX_2X16, channels_lt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - - TEST(F32_PRELU__AVX_2X16, channels_gt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - - TEST(F32_PRELU__AVX_2X16, rows_lt_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } - - TEST(F32_PRELU__AVX_2X16, rows_div_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } - - TEST(F32_PRELU__AVX_2X16, rows_gt_2) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } - - TEST(F32_PRELU__AVX_2X16, input_stride) { - TEST_REQUIRES_X86_AVX; 
- for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } - - TEST(F32_PRELU__AVX_2X16, output_stride) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } - - TEST(F32_PRELU__AVX_2X16, inplace) { - TEST_REQUIRES_X86_AVX; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx_2x16); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - TEST(F32_PRELU__AVX512F_2X16, channels_eq_16) { - TEST_REQUIRES_X86_AVX512F; - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - - TEST(F32_PRELU__AVX512F_2X16, channels_div_16) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - - TEST(F32_PRELU__AVX512F_2X16, channels_lt_16) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - - TEST(F32_PRELU__AVX512F_2X16, channels_gt_16) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - - TEST(F32_PRELU__AVX512F_2X16, rows_lt_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } - - TEST(F32_PRELU__AVX512F_2X16, rows_div_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } - - TEST(F32_PRELU__AVX512F_2X16, rows_gt_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } - - TEST(F32_PRELU__AVX512F_2X16, input_stride) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } - - TEST(F32_PRELU__AVX512F_2X16, output_stride) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - 
.Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } - - TEST(F32_PRELU__AVX512F_2X16, inplace) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx512f_2x16); - } - } - } -#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - - -#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - TEST(F32_PRELU__AVX512F_2X32, channels_eq_32) { - TEST_REQUIRES_X86_AVX512F; - PReLUMicrokernelTester() - .rows(2) - .channels(32) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - - TEST(F32_PRELU__AVX512F_2X32, channels_div_32) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 64; channels < 320; channels += 32) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - - TEST(F32_PRELU__AVX512F_2X32, channels_lt_32) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 1; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - - TEST(F32_PRELU__AVX512F_2X32, channels_gt_32) { - TEST_REQUIRES_X86_AVX512F; - for (size_t channels = 33; channels < 64; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - - TEST(F32_PRELU__AVX512F_2X32, rows_lt_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } - - TEST(F32_PRELU__AVX512F_2X32, rows_div_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } - - TEST(F32_PRELU__AVX512F_2X32, rows_gt_2) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } - - TEST(F32_PRELU__AVX512F_2X32, input_stride) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(163) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } - - TEST(F32_PRELU__AVX512F_2X32, output_stride) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(163) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } - - TEST(F32_PRELU__AVX512F_2X32, inplace) { - TEST_REQUIRES_X86_AVX512F; - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 160; channels += 31) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__avx512f_2x32); - } - } - } -#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - 
TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(1) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X4, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(1) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) 
{ - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X8, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(1) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_1X16, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, channels_div_4) { - for (size_t 
channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, 
rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X8, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; 
channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_2X16, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(4) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - 
PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X4, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(4) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X8, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, channels_eq_16) { - 
PReLUMicrokernelTester() - .rows(4) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_IMINMAX_4X16, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(1) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, 
channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X4, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(1) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X8, inplace) 
{ - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(1) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_1X16, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - 
PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - 
.channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X8, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - 
.channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_2X16, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(4) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X4, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; 
channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(4) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X8, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(4) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() 
- .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMSIMD_LANESELECT_4X16, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(1) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - - 
TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X4, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(1) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X8, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) 
- .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(1) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_1X16, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - - 
TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X8, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - 
PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_2X16, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(4) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X4, inplace) { - for 
(size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(4) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X8, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(4) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, 
channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_IMINMAX_4X16, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(1) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, channels_gt_4) { - for (size_t 
channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X4, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(1) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - 
.Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X8, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(1) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(1) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, rows_gt_1) { - for (size_t rows = 2; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, input_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, output_stride) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_1X16, inplace) { - for (size_t rows = 1; rows <= 3; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - 
.Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(2) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - 
.rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X8, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(2) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, rows_gt_2) { - for (size_t rows = 
3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_2X16, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(4) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - 
.Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X4, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, channels_eq_8) { - PReLUMicrokernelTester() - .rows(4) - .channels(8) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, channels_div_8) { - for (size_t channels = 16; channels < 80; channels += 8) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, channels_lt_8) { - for (size_t channels = 1; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, channels_gt_8) { - for (size_t channels = 9; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(43) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X8, inplace) { - for (size_t rows = 1; rows <= 12; 
rows += 3) { - for (size_t channels = 1; channels <= 40; channels += 7) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, channels_eq_16) { - PReLUMicrokernelTester() - .rows(4) - .channels(16) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, channels_div_16) { - for (size_t channels = 32; channels < 160; channels += 16) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, channels_lt_16) { - for (size_t channels = 1; channels < 16; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, channels_gt_16) { - for (size_t channels = 17; channels < 32; channels++) { - PReLUMicrokernelTester() - .rows(4) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, rows_lt_4) { - for (size_t rows = 1; rows < 4; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, rows_div_4) { - for (size_t rows = 8; rows <= 16; rows += 4) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, rows_gt_4) { - for (size_t rows = 5; rows < 8; rows++) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, input_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, output_stride) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(83) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } - - TEST(F32_PRELU__WASMRELAXEDSIMD_LANESELECT_4X16, inplace) { - for (size_t rows = 1; rows <= 12; rows += 3) { - for (size_t channels = 1; channels <= 80; channels += 15) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16); - } - } - } -#endif // XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASM_2X1, channels_eq_1) { - PReLUMicrokernelTester() - .rows(2) - .channels(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); 
- } - - TEST(F32_PRELU__WASM_2X1, channels_gt_1) { - for (size_t channels = 2; channels < 10; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - - TEST(F32_PRELU__WASM_2X1, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } - - TEST(F32_PRELU__WASM_2X1, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } - - TEST(F32_PRELU__WASM_2X1, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } - - TEST(F32_PRELU__WASM_2X1, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } - - TEST(F32_PRELU__WASM_2X1, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(7) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } - - TEST(F32_PRELU__WASM_2X1, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x1); - } - } - } -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_PRELU__WASM_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - - TEST(F32_PRELU__WASM_2X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - - TEST(F32_PRELU__WASM_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - - TEST(F32_PRELU__WASM_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - - TEST(F32_PRELU__WASM_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } - - TEST(F32_PRELU__WASM_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } - - TEST(F32_PRELU__WASM_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - 
PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } - - TEST(F32_PRELU__WASM_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } - - TEST(F32_PRELU__WASM_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } - - TEST(F32_PRELU__WASM_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__wasm_2x4); - } - } - } -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -TEST(F32_PRELU__SCALAR_2X1, channels_eq_1) { - PReLUMicrokernelTester() - .rows(2) - .channels(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); -} - -TEST(F32_PRELU__SCALAR_2X1, channels_gt_1) { - for (size_t channels = 2; channels < 10; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } -} - -TEST(F32_PRELU__SCALAR_2X1, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X1, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X1, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X1, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X1, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(7) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X1, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 5; channels += 1) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x1); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, channels_eq_4) { - PReLUMicrokernelTester() - .rows(2) - .channels(4) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); -} - -TEST(F32_PRELU__SCALAR_2X4, channels_div_4) { - for (size_t channels = 8; channels < 40; channels += 4) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - 
.Test(xnn_f32_prelu_ukernel__scalar_2x4); - } -} - -TEST(F32_PRELU__SCALAR_2X4, channels_lt_4) { - for (size_t channels = 1; channels < 4; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } -} - -TEST(F32_PRELU__SCALAR_2X4, channels_gt_4) { - for (size_t channels = 5; channels < 8; channels++) { - PReLUMicrokernelTester() - .rows(2) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } -} - -TEST(F32_PRELU__SCALAR_2X4, rows_lt_2) { - for (size_t rows = 1; rows < 2; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, rows_div_2) { - for (size_t rows = 4; rows <= 8; rows += 2) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, rows_gt_2) { - for (size_t rows = 3; rows < 4; rows++) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, input_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, output_stride) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(23) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} - -TEST(F32_PRELU__SCALAR_2X4, inplace) { - for (size_t rows = 1; rows <= 6; rows += 1) { - for (size_t channels = 1; channels <= 20; channels += 3) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(xnn_f32_prelu_ukernel__scalar_2x4); - } - } -} \ No newline at end of file diff --git a/test/f32-prelu.yaml b/test/f32-prelu.yaml deleted file mode 100644 index fcb18835002..00000000000 --- a/test/f32-prelu.yaml +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2019 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# ARM NEON -- name: xnn_f32_prelu_ukernel__neon_1x4 -- name: xnn_f32_prelu_ukernel__neon_1x8 -- name: xnn_f32_prelu_ukernel__neon_1x16 -- name: xnn_f32_prelu_ukernel__neon_2x4 -- name: xnn_f32_prelu_ukernel__neon_2x8 -- name: xnn_f32_prelu_ukernel__neon_2x16 -- name: xnn_f32_prelu_ukernel__neon_4x4 -- name: xnn_f32_prelu_ukernel__neon_4x8 -- name: xnn_f32_prelu_ukernel__neon_4x16 -# x86 SSE -- name: xnn_f32_prelu_ukernel__sse_2x4 -- name: xnn_f32_prelu_ukernel__sse_2x8 -- name: xnn_f32_prelu_ukernel__sse2_2x4 -- name: xnn_f32_prelu_ukernel__sse2_2x8 -- name: xnn_f32_prelu_ukernel__sse41_2x4 -- name: xnn_f32_prelu_ukernel__sse41_2x8 -# x86 AVX -- name: xnn_f32_prelu_ukernel__avx_2x8 -- name: xnn_f32_prelu_ukernel__avx_2x16 -# x86 AVX512 -- name: xnn_f32_prelu_ukernel__avx512f_2x16 -- name: xnn_f32_prelu_ukernel__avx512f_2x32 -# WAsm SIMD -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_1x16 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x16 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_iminmax_4x16 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_1x16 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x16 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x4 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x8 -- name: xnn_f32_prelu_ukernel__wasmsimd_laneselect_4x16 -# WAsm Relaxed SIMD -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_1x16 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_2x16 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_iminmax_4x16 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_1x16 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_2x16 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x4 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x8 -- name: xnn_f32_prelu_ukernel__wasmrelaxedsimd_laneselect_4x16 -# WAsm -- name: xnn_f32_prelu_ukernel__wasm_2x1 -- name: xnn_f32_prelu_ukernel__wasm_2x4 -# Scalar -- name: xnn_f32_prelu_ukernel__scalar_2x1 -- name: xnn_f32_prelu_ukernel__scalar_2x4 diff --git a/test/f32-qs8-vcvt.cc b/test/f32-qs8-vcvt.cc index c2e15f43a2d..ca31e0fa0f4 100644 --- a/test/f32-qs8-vcvt.cc +++ b/test/f32-qs8-vcvt.cc @@ -2,10 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: f32-qs8-vcvt -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" @@ -25,9 +21,6 @@ XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, u XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\ \ XNN_TEST_CVT_SATURATION(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_OVERFLOW(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_QMIN(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_QMAX(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/f32-qs8-vcvt/f32-qs8-vcvt.h" +XNN_TEST_CVT_OVERFLOW(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#include "f32-qs8-vcvt/f32-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/f32-qu8-vcvt.cc b/test/f32-qu8-vcvt.cc index 7c1cc17c11f..0189a95982b 100644 --- a/test/f32-qu8-vcvt.cc +++ b/test/f32-qu8-vcvt.cc @@ -2,10 +2,6 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Microkernel: f32-qu8-vcvt -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" @@ -25,10 +21,6 @@ XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, u XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\ \ XNN_TEST_CVT_SATURATION(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_OVERFLOW(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ - \ -XNN_TEST_CVT_QMIN(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_QMAX(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/f32-qu8-vcvt/f32-qu8-vcvt.h" +XNN_TEST_CVT_OVERFLOW(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#include "f32-qu8-vcvt/f32-qu8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/f32-vabs.cc b/test/f32-vabs.cc index b5988fd3a29..11b95f45cc7 100644 --- a/test/f32-vabs.cc +++ b/test/f32-vabs.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); -#include "src/f32-vabs/f32-vabs.h" +#include "f32-vabs/f32-vabs.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vadd.cc b/test/f32-vadd.cc index caeabdbe654..84cbb1bab29 100644 --- a/test/f32-vadd.cc +++ b/test/f32-vadd.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f32-vbinary/f32-vadd.h" +#include 
"f32-vbinary/f32-vadd.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vaddc.cc b/test/f32-vaddc.cc index b8c8721c6ba..071d06de40e 100644 --- a/test/f32-vaddc.cc +++ b/test/f32-vaddc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Add, init_params); -#include "src/f32-vbinary/f32-vaddc.h" +#include "f32-vbinary/f32-vaddc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vclamp.cc b/test/f32-vclamp.cc index 0ad6af1863f..ed0f97a218d 100644 --- a/test/f32-vclamp.cc +++ b/test/f32-vclamp.cc @@ -34,5 +34,5 @@ XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f32-vclamp/f32-vclamp.h" +#include "f32-vclamp/f32-vclamp.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vcmul.cc b/test/f32-vcmul.cc index f2c41e71f50..dedd5e10d7e 100644 --- a/test/f32-vcmul.cc +++ b/test/f32-vcmul.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/f32-vbinary/f32-vcmul.h" +#include "f32-vbinary/f32-vcmul.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vcopysign.cc b/test/f32-vcopysign.cc index 13163219c10..99d5b2d9d63 100644 --- a/test/f32-vcopysign.cc +++ b/test/f32-vcopysign.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::CopySign, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::CopySign, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::CopySign, init_params); -#include "src/f32-vbinary/f32-vcopysign.h" +#include "f32-vbinary/f32-vcopysign.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vcopysignc.cc b/test/f32-vcopysignc.cc index 0b37182adb8..63ac43a13a8 100644 --- a/test/f32-vcopysignc.cc +++ b/test/f32-vcopysignc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::CopySign, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::CopySign, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, 
VBinaryMicrokernelTester::OpType::CopySign, init_params); -#include "src/f32-vbinary/f32-vcopysignc.h" +#include "f32-vbinary/f32-vcopysignc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vdiv.cc b/test/f32-vdiv.cc index 79cc695e474..1a3e349df1e 100644 --- a/test/f32-vdiv.cc +++ b/test/f32-vdiv.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f32-vbinary/f32-vdiv.h" +#include "f32-vbinary/f32-vdiv.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vdivc.cc b/test/f32-vdivc.cc index 88d55ff6930..42b20dd5227 100644 --- a/test/f32-vdivc.cc +++ b/test/f32-vdivc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Div, init_params); -#include "src/f32-vbinary/f32-vdivc.h" +#include "f32-vbinary/f32-vdivc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-velu.cc b/test/f32-velu.cc index 6ecab89b8ea..4db508d03de 100644 --- a/test/f32-velu.cc +++ b/test/f32-velu.cc @@ -76,5 +76,5 @@ TEST(ukernel, beta) { } \ } \ } -#include "src/f32-velu/f32-velu.h" +#include "f32-velu/f32-velu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vexp.cc b/test/f32-vexp.cc index ed50b5efa6e..577a50951a1 100644 --- a/test/f32-vexp.cc +++ b/test/f32-vexp.cc @@ -63,5 +63,5 @@ TEST(ukernel, special_values) { } \ } \ } -#include "src/f32-vexp/f32-vexp.h" +#include "f32-vexp/f32-vexp.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vgelu.cc b/test/f32-vgelu.cc index 30beaa307de..720a986c13f 100644 --- a/test/f32-vgelu.cc +++ b/test/f32-vgelu.cc @@ -63,5 +63,5 @@ TEST(ukernel, special_values) { } \ } \ } -#include "src/f32-vgelu/f32-vgelu.h" +#include "f32-vgelu/f32-vgelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vhswish.cc b/test/f32-vhswish.cc index fb2892df96b..813441dc197 100644 --- a/test/f32-vhswish.cc +++ b/test/f32-vhswish.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f32-vhswish/f32-vhswish.h" +#include "f32-vhswish/f32-vhswish.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vlog.cc b/test/f32-vlog.cc index 005b6142ff6..917c8d36b2c 100644 --- a/test/f32-vlog.cc +++ b/test/f32-vlog.cc @@ -63,5 +63,5 @@ TEST(ukernel, special_values) { } \ } \ } -#include "src/f32-vlog/f32-vlog.h" +#include "f32-vlog/f32-vlog.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vlrelu.cc b/test/f32-vlrelu.cc index 16d55758aa2..d7478e85dab 100644 --- a/test/f32-vlrelu.cc +++ b/test/f32-vlrelu.cc @@ -46,5 
+46,5 @@ TEST(ukernel, slope) { } \ } \ } -#include "src/f32-vlrelu/f32-vlrelu.h" +#include "f32-vlrelu/f32-vlrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmax.cc b/test/f32-vmax.cc index c6ab216e1e4..af07b27fcae 100644 --- a/test/f32-vmax.cc +++ b/test/f32-vmax.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); -#include "src/f32-vbinary/f32-vmax.h" +#include "f32-vbinary/f32-vmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmaxc.cc b/test/f32-vmaxc.cc index 96a870ff865..6746142ae80 100644 --- a/test/f32-vmaxc.cc +++ b/test/f32-vmaxc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Max, init_params); -#include "src/f32-vbinary/f32-vmaxc.h" +#include "f32-vbinary/f32-vmaxc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmin.cc b/test/f32-vmin.cc index 01c5b1c2f94..10e8b311c36 100644 --- a/test/f32-vmin.cc +++ b/test/f32-vmin.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); -#include "src/f32-vbinary/f32-vmin.h" +#include "f32-vbinary/f32-vmin.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vminc.cc b/test/f32-vminc.cc index 6e5e1ad3d2a..913c8092976 100644 --- a/test/f32-vminc.cc +++ b/test/f32-vminc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Min, init_params); -#include "src/f32-vbinary/f32-vminc.h" +#include "f32-vbinary/f32-vminc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmul.cc b/test/f32-vmul.cc index 93bf736e41b..097ab533d14 100644 --- a/test/f32-vmul.cc +++ b/test/f32-vmul.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, 
init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f32-vbinary/f32-vmul.h" +#include "f32-vbinary/f32-vmul.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vmulc.cc b/test/f32-vmulc.cc index eb58031cb52..ecd2f6c285c 100644 --- a/test/f32-vmulc.cc +++ b/test/f32-vmulc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/f32-vbinary/f32-vmulc.h" +#include "f32-vbinary/f32-vmulc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vneg.cc b/test/f32-vneg.cc index c6dbc078fb1..bbbe8e0660a 100644 --- a/test/f32-vneg.cc +++ b/test/f32-vneg.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); -#include "src/f32-vneg/f32-vneg.h" +#include "f32-vneg/f32-vneg.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vprelu.cc b/test/f32-vprelu.cc index bbd587e18c0..291bc181977 100644 --- a/test/f32-vprelu.cc +++ b/test/f32-vprelu.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); -#include "src/f32-vbinary/f32-vprelu.h" +#include "f32-vbinary/f32-vprelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vpreluc.cc b/test/f32-vpreluc.cc index 31725565dd5..b0290e8deae 100644 --- a/test/f32-vpreluc.cc +++ b/test/f32-vpreluc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Prelu, init_params); -#include "src/f32-vbinary/f32-vpreluc.h" +#include "f32-vbinary/f32-vpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrcopysignc.cc b/test/f32-vrcopysignc.cc index bde00d406af..fd8ed7f769f 100644 --- a/test/f32-vrcopysignc.cc +++ b/test/f32-vrcopysignc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, 
arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RCopySign, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RCopySign, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RCopySign, init_params); -#include "src/f32-vbinary/f32-vrcopysignc.h" +#include "f32-vbinary/f32-vrcopysignc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrdivc.cc b/test/f32-vrdivc.cc index 510b7f52901..31185edf74c 100644 --- a/test/f32-vrdivc.cc +++ b/test/f32-vrdivc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RDiv, init_params); -#include "src/f32-vbinary/f32-vrdivc.h" +#include "f32-vbinary/f32-vrdivc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrelu.cc b/test/f32-vrelu.cc index 8732db4b431..0d6d7c4e338 100644 --- a/test/f32-vrelu.cc +++ b/test/f32-vrelu.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f32-vrelu/f32-vrelu.h" +#include "f32-vrelu/f32-vrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndd.cc b/test/f32-vrndd.cc index 64f9c1a910a..cc824ea91d9 100644 --- a/test/f32-vrndd.cc +++ b/test/f32-vrndd.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); -#include "src/f32-vrnd/f32-vrndd.h" +#include "f32-vrnd/f32-vrndd.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndne.cc b/test/f32-vrndne.cc index e4739467ad2..c1106131b6b 100644 --- a/test/f32-vrndne.cc +++ b/test/f32-vrndne.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); -#include "src/f32-vrnd/f32-vrndne.h" +#include "f32-vrnd/f32-vrndne.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndu.cc b/test/f32-vrndu.cc index b0d313b4dcf..247b2cb1ced 100644 --- a/test/f32-vrndu.cc +++ b/test/f32-vrndu.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, 
init_params); -#include "src/f32-vrnd/f32-vrndu.h" +#include "f32-vrnd/f32-vrndu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndz.cc b/test/f32-vrndz.cc index 7efd335417e..0808e048173 100644 --- a/test/f32-vrndz.cc +++ b/test/f32-vrndz.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUna XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); -#include "src/f32-vrnd/f32-vrndz.h" +#include "f32-vrnd/f32-vrndz.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrpreluc.cc b/test/f32-vrpreluc.cc index 8ea52b7789c..df02958d935 100644 --- a/test/f32-vrpreluc.cc +++ b/test/f32-vrpreluc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RPrelu, init_params); -#include "src/f32-vbinary/f32-vrpreluc.h" +#include "f32-vbinary/f32-vrpreluc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrsqrt.cc b/test/f32-vrsqrt.cc index 8050d56e362..06b1eaccd64 100644 --- a/test/f32-vrsqrt.cc +++ b/test/f32-vrsqrt.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f32-vrsqrt/f32-vrsqrt.h" +#include "f32-vrsqrt/f32-vrsqrt.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrsubc.cc b/test/f32-vrsubc.cc index 14c71fac4fb..de2f4321b9c 100644 --- a/test/f32-vrsubc.cc +++ b/test/f32-vrsubc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::RSub, init_params); -#include "src/f32-vbinary/f32-vrsubc.h" +#include "f32-vbinary/f32-vrsubc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsigmoid.cc b/test/f32-vsigmoid.cc index 0988fc19978..e60273dc896 100644 --- a/test/f32-vsigmoid.cc +++ b/test/f32-vsigmoid.cc @@ -32,5 +32,5 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/f32-vsigmoid/f32-vsigmoid.h" +#include "f32-vsigmoid/f32-vsigmoid.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsqr.cc b/test/f32-vsqr.cc index e5965be3420..a2f4987c3bc 100644 --- a/test/f32-vsqr.cc +++ b/test/f32-vsqr.cc @@ -32,5 +32,5 @@ 
XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \ \ XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); -#include "src/f32-vsqr/f32-vsqr.h" +#include "f32-vsqr/f32-vsqr.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsqrdiff.cc b/test/f32-vsqrdiff.cc index 8dfdac139ff..365838c4438 100644 --- a/test/f32-vsqrdiff.cc +++ b/test/f32-vsqrdiff.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); -#include "src/f32-vbinary/f32-vsqrdiff.h" +#include "f32-vbinary/f32-vsqrdiff.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsqrdiffc.cc b/test/f32-vsqrdiffc.cc index 8434a403dc8..3d0276a901a 100644 --- a/test/f32-vsqrdiffc.cc +++ b/test/f32-vsqrdiffc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::SqrDiff, init_params); -#include "src/f32-vbinary/f32-vsqrdiffc.h" +#include "f32-vbinary/f32-vsqrdiffc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsqrt.cc b/test/f32-vsqrt.cc index 829dc4b466c..ef629a2987f 100644 --- a/test/f32-vsqrt.cc +++ b/test/f32-vsqrt.cc @@ -63,5 +63,5 @@ TEST(ukernel, special_values) { } \ } \ } -#include "src/f32-vsqrt/f32-vsqrt.h" +#include "f32-vsqrt/f32-vsqrt.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsub.cc b/test/f32-vsub.cc index 03c9f5cc02c..15c86a300c7 100644 --- a/test/f32-vsub.cc +++ b/test/f32-vsub.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f32-vbinary/f32-vsub.h" +#include "f32-vbinary/f32-vsub.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsubc.cc b/test/f32-vsubc.cc index 69cfe91caa0..eac7baa565f 100644 --- a/test/f32-vsubc.cc +++ b/test/f32-vsubc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); \ 
XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Sub, init_params); -#include "src/f32-vbinary/f32-vsubc.h" +#include "f32-vbinary/f32-vsubc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vtanh.cc b/test/f32-vtanh.cc index 591612e13d8..5d2c6723393 100644 --- a/test/f32-vtanh.cc +++ b/test/f32-vtanh.cc @@ -63,5 +63,5 @@ TEST(ukernel, special_values) { } \ } \ } -#include "src/f32-vtanh/f32-vtanh.h" +#include "f32-vtanh/f32-vtanh.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/floor.cc b/test/floor.cc index d3412cc3148..bea533cabaf 100644 --- a/test/floor.cc +++ b/test/floor.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/fully-connected.cc b/test/fully-connected.cc index 3bc434e827e..f5948f2c89f 100644 --- a/test/fully-connected.cc +++ b/test/fully-connected.cc @@ -519,6 +519,7 @@ TEST_F(FullyConnectedTestQP8F32QC4W, matches_operator_api_with_reshape) { // unwritten portions of these buffers are matching. std::fill(convert_input.begin(), convert_input.end(), 0.0f); std::fill(subgraph_output.begin(), subgraph_output.end(), 0.0f); + std::fill(operator_output.begin(), operator_output.end(), 0.0f); // Adjust number of kernel elements for QC4W. input_channels should be padded // to byte boundary, hence even. diff --git a/test/gavgpool-cw-microkernel-tester.h b/test/gavgpool-cw-microkernel-tester.h index fae2cb8e9d2..d9450a0f671 100644 --- a/test/gavgpool-cw-microkernel-tester.h +++ b/test/gavgpool-cw-microkernel-tester.h @@ -17,6 +17,7 @@ #include #include "xnnpack.h" #include "xnnpack/fp16.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "replicable_random_device.h" diff --git a/test/gavgpool-microkernel-tester.h b/test/gavgpool-microkernel-tester.h index b0b1d5f0d94..92a3ba72aef 100644 --- a/test/gavgpool-microkernel-tester.h +++ b/test/gavgpool-microkernel-tester.h @@ -21,6 +21,7 @@ #include #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/requantization.h" diff --git a/test/gemm-microkernel-tester.cc b/test/gemm-microkernel-tester.cc index cb92bf8bc3b..2f2aebe6409 100644 --- a/test/gemm-microkernel-tester.cc +++ b/test/gemm-microkernel-tester.cc @@ -1734,7 +1734,7 @@ void GemmMicrokernelTester::Test( input_qp8.data()); std::generate(b.begin(), b.end(), std::ref(w8rng)); - // std::generate(bias.begin(), bias.end(), std::ref(f32rng)); + std::generate(bias.begin(), bias.end(), std::ref(f32rng)); std::generate(kernel_scale.begin(), kernel_scale.end(), std::ref(scalerng)); std::fill(packed_w.begin(), packed_w.end(), 0); @@ -1747,8 +1747,8 @@ void GemmMicrokernelTester::Test( /*accumulator_init=*/nullptr, /*weights=*/b.data(), /*int_extra_data0_fn=*/nullptr, - /*extra_data0=*/nullptr, - /*extra_data0_size=*/0, + /*extra_data0=*/bias.data(), + /*extra_data0_size=*/sizeof(float), /*init_extra_data1_fn=*/ nullptr, /*extra_data1=*/kernel_scale.data(), diff --git a/test/global-average-pooling-1d.cc b/test/global-average-pooling-1d.cc index 4a45f86851d..ec5ab308cb7 100644 --- a/test/global-average-pooling-1d.cc +++ b/test/global-average-pooling-1d.cc @@ -19,6 +19,7 @@ #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include 
"xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" diff --git a/test/global-average-pooling-2d.cc b/test/global-average-pooling-2d.cc index 786fcf47a33..c489b6f9a4f 100644 --- a/test/global-average-pooling-2d.cc +++ b/test/global-average-pooling-2d.cc @@ -19,6 +19,7 @@ #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" diff --git a/test/global-sum-pooling-1d.cc b/test/global-sum-pooling-1d.cc index d3bd490ac53..bd7c813c762 100644 --- a/test/global-sum-pooling-1d.cc +++ b/test/global-sum-pooling-1d.cc @@ -19,6 +19,7 @@ #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/global-sum-pooling-2d.cc b/test/global-sum-pooling-2d.cc index e3c6fec6943..d23a0eff24d 100644 --- a/test/global-sum-pooling-2d.cc +++ b/test/global-sum-pooling-2d.cc @@ -19,6 +19,7 @@ #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/hardswish.cc b/test/hardswish.cc index df394a3d0d1..d026b2d7fa8 100644 --- a/test/hardswish.cc +++ b/test/hardswish.cc @@ -14,6 +14,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/leaky-relu.cc b/test/leaky-relu.cc index 5907f5f2a91..9ebdc0de3e9 100644 --- a/test/leaky-relu.cc +++ b/test/leaky-relu.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/max-pooling-2d.cc b/test/max-pooling-2d.cc index b6e4b8ecaa8..c7ed6cd91c3 100644 --- a/test/max-pooling-2d.cc +++ b/test/max-pooling-2d.cc @@ -15,12 +15,13 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator-utils.h" #include "xnnpack/operator.h" #include "xnnpack/requantization.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class MaxPooling2DTestBase : public ::testing::Test { diff --git a/test/maxpool-microkernel-tester.h b/test/maxpool-microkernel-tester.h index 755ace470a0..f8447292bd0 100644 --- a/test/maxpool-microkernel-tester.h +++ b/test/maxpool-microkernel-tester.h @@ -21,10 +21,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "next_prime.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class MaxPoolMicrokernelTester { diff --git a/test/maxpool-minmax.cc b/test/maxpool-minmax.cc index e7c92e96e0c..d5cc5669871 100644 --- a/test/maxpool-minmax.cc +++ b/test/maxpool-minmax.cc @@ -39,10 +39,10 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, MaxPoolMicrokernelTester::Kernel{ukernel, init_params}, arch_flags, channel_tile, channel_scaled_tile, primary_tile, incremental_tile, qmin, qmax }, const XnnTestParam xnn_test_params[] = { -#include "src/f16-maxpool/f16-maxpool-minmax.h" -#include "src/f32-maxpool/f32-maxpool-minmax.h" -#include 
"src/s8-maxpool/s8-maxpool-minmax.h" -#include "src/u8-maxpool/u8-maxpool-minmax.h" +#include "f16-maxpool/f16-maxpool-minmax.h" +#include "f32-maxpool/f32-maxpool-minmax.h" +#include "s8-maxpool/s8-maxpool-minmax.h" +#include "u8-maxpool/u8-maxpool-minmax.h" }; #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/negate.cc b/test/negate.cc index f0aa5af3c1a..f31ff9ac25e 100644 --- a/test/negate.cc +++ b/test/negate.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/operator-size.c b/test/operator-size.c deleted file mode 100644 index 7c4c490489b..00000000000 --- a/test/operator-size.c +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack.h" - - -// A dummy program that calls every Operators API function in XNNPACK, for size estimation. -int main(int argc, char** argv) { - int function_idx = 0; - if (argc >= 2) { - function_idx = atoi(argv[1]); - } - - xnn_initialize(NULL /* allocator */); - - xnn_operator_t op = NULL; - switch (function_idx) { - case -1: - xnn_delete_operator(op); - break; - case 0: - xnn_run_operator(op, NULL); - break; - case 1: - xnn_create_binary_elementwise_nd( - xnn_binary_add, xnn_datatype_fp32, NULL, NULL, NULL, 0, &op); - break; - case 2: - xnn_setup_binary_elementwise_nd( - op, NULL, NULL, NULL); - break; - case 3: - xnn_create_argmax_pooling2d_nhwc_f32( - 0, 0, 0, 0, - 0, 0, - 0, &op); - break; - case 4: - xnn_setup_argmax_pooling2d_nhwc_f32( - op, NULL, NULL, NULL, NULL); - break; - case 5: - xnn_create_average_pooling2d_nhwc_f32( - 0, 0, 0, 0, - 0, 0, - 0, 0, - 0.0f, 0.0f, - 0, &op); - break; - case 6: - xnn_setup_average_pooling2d_nhwc_f32( - op, NULL, NULL, NULL); - break; - case 7: - xnn_create_clamp_nc_f32( - 0.0f, 0.0f, - 0, &op); - xnn_reshape_clamp_nc_f32( - op, 0, 0, 0, 0, NULL); - break; - case 8: - xnn_setup_clamp_nc_f32( - op, NULL, NULL); - break; - case 9: - xnn_create_convolution2d_nhwc_f32( - 0, 0, 0, 0, - 0, 0, - 0, 0, - 0, 0, - 0, 0, 0, 0, 0, - NULL, NULL, - 0.0f, 0.0f, - 0, NULL, NULL, &op); - break; - case 10: - xnn_setup_convolution2d_nhwc_f32( - op, NULL, NULL, NULL); - break; - case 11: - xnn_create_deconvolution2d_nhwc_f32( - 0, 0, 0, 0, - 0, 0, - 0, 0, - 0, 0, - 0, 0, 0, 0, 0, - NULL, NULL, - 0.0f, 0.0f, - 0, NULL, NULL, &op); - break; - case 12: - xnn_setup_deconvolution2d_nhwc_f32( - op, NULL, NULL); - break; - case 15: - xnn_create_fully_connected_nc_f32( - 0, 0, 0, 0, - NULL, NULL, - 0.0f, 0.0f, - 0, NULL, NULL, &op); - break; - case 16: - xnn_setup_fully_connected_nc_f32( - op, NULL, NULL); - break; - case 17: - xnn_create_global_average_pooling_nwc_f32( - 0.0f, 0.0f, - 0, &op); - break; - case 18: - xnn_setup_global_average_pooling_nwc_f32( - op, NULL, NULL, NULL); - break; - case 19: - xnn_create_hardswish_nc_f32( - 0, &op); - xnn_reshape_hardswish_nc_f32( - op, 0, 0, 0, 0, NULL); - break; - case 20: - xnn_setup_hardswish_nc_f32( - op, NULL, NULL); - break; - case 21: - xnn_create_max_pooling2d_nhwc_f32( - 0, 0, 0, 0, - 0, 0, 0, - 0, 0, 0, - 0.0f, 0.0f, - 0, &op); - break; - case 22: - xnn_setup_max_pooling2d_nhwc_f32( - op, NULL, NULL); - break; - case 29: - xnn_create_prelu_nc_f32( - 0, 0, 0, 0, - NULL, 0, NULL, NULL, &op); - break; - case 30: - xnn_setup_prelu_nc_f32( - op, - NULL, NULL); - break; - case 31: - 
xnn_create_resize_bilinear2d_nhwc_f32( - 0, 0, 0, &op); - break; - case 32: - xnn_setup_resize_bilinear2d_nhwc_f32( - op, NULL, NULL, NULL); - break; - case 33: - xnn_create_sigmoid_nc_f32( - 0, &op); - xnn_reshape_sigmoid_nc_f32( - op, 0, 0, 0, 0, NULL); - break; - case 34: - xnn_setup_sigmoid_nc_f32( - op, NULL, NULL); - break; - case 35: - xnn_create_softmax_nc_f32( - 0, &op); - break; - case 36: - xnn_setup_softmax_nc_f32( - op, NULL, NULL); - break; - case 39: - xnn_create_channel_shuffle_nc_x32( - 0, 0, 0, 0, - 0, &op); - break; - case 40: - xnn_setup_channel_shuffle_nc_x32( - op, NULL, NULL); - break; - case 41: - xnn_create_unpooling2d_nhwc_x32( - 0, 0, 0, 0, - 0, 0, - 0, 0, 0, - 0, &op); - break; - case 42: - xnn_setup_unpooling2d_nhwc_x32( - op, NULL, NULL, NULL); - break; - } - - xnn_deinitialize(); -} diff --git a/test/prelu-microkernel-tester.h b/test/prelu-microkernel-tester.h deleted file mode 100644 index 30edcda22ac..00000000000 --- a/test/prelu-microkernel-tester.h +++ /dev/null @@ -1,187 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -class PReLUMicrokernelTester { - public: - PReLUMicrokernelTester& rows(size_t rows) { - assert(rows != 0); - this->rows_ = rows; - return *this; - } - - size_t rows() const { - return this->rows_; - } - - PReLUMicrokernelTester& channels(size_t channels) { - assert(channels != 0); - this->channels_ = channels; - return *this; - } - - size_t channels() const { - return this->channels_; - } - - PReLUMicrokernelTester& input_stride(size_t input_stride) { - assert(input_stride != 0); - this->input_stride_ = input_stride; - return *this; - } - - size_t input_stride() const { - if (this->input_stride_ == 0) { - return channels(); - } else { - assert(this->input_stride_ >= channels()); - return this->input_stride_; - } - } - - PReLUMicrokernelTester& output_stride(size_t output_stride) { - assert(output_stride != 0); - this->output_stride_ = output_stride; - return *this; - } - - size_t output_stride() const { - if (this->output_stride_ == 0) { - return channels(); - } else { - assert(this->output_stride_ >= channels()); - return this->output_stride_; - } - } - - PReLUMicrokernelTester& inplace(bool inplace) { - this->inplace_ = inplace; - return *this; - } - - bool inplace() const { - return this->inplace_; - } - - PReLUMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_f16_prelu_ukernel_fn prelu) const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - std::uniform_real_distribution w32dist(0.25f, 0.75f); - - xnnpack::Buffer x(channels() + (rows() - 1) * input_stride() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer w( - channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer y(channels() + (rows() - 1) * output_stride() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer y_ref(channels() * rows()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); }); - std::generate(w.begin(), w.end(), 
[&]() { return w32dist(rng); }); - if (inplace()) { - std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); }); - } - const xnn_float16* x_data = inplace() ? y.data() : x.data(); - - // Compute reference results, without clamping. - for (size_t n = 0; n < rows(); n++) { - for (size_t c = 0; c < channels(); c++) { - const float x_value = x_data[n * input_stride() + c]; - y_ref[n * channels() + c] = std::signbit(x_value) ? - float(xnn_float16(x_value * w[c])) : x_value; // What is going on here? - } - } - - // Call optimized micro-kernel. - prelu(rows(), channels() * sizeof(xnn_float16), - x_data, input_stride() * sizeof(xnn_float16), - w.data(), - y.data(), output_stride() * sizeof(xnn_float16)); - - // Verify results. - for (size_t n = 0; n < rows(); n++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(y[n * output_stride() + c], y_ref[n * channels() + c]) - << "at row " << n << " / " << rows() - << ", channel " << c << " / " << channels(); - } - } - } - } - - void Test(xnn_f32_prelu_ukernel_fn prelu) const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - std::uniform_real_distribution w32dist(0.25f, 0.75f); - - xnnpack::Buffer x(channels() + (rows() - 1) * input_stride() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer w(channels() + - XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer y(channels() + (rows() - 1) * output_stride() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer y_ref(channels() * rows()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); }); - std::generate(w.begin(), w.end(), [&]() { return w32dist(rng); }); - if (inplace()) { - std::generate(y.begin(), y.end(), [&]() { return f32dist(rng); }); - } - const float* x_data = inplace() ? y.data() : x.data(); - - // Compute reference results, without clamping. - for (size_t n = 0; n < rows(); n++) { - for (size_t c = 0; c < channels(); c++) { - const float x_value = x_data[n * input_stride() + c]; - y_ref[n * channels() + c] = std::signbit(x_value) ? x_value * w[c] : x_value; - } - } - - // Call optimized micro-kernel. - prelu(rows(), channels() * sizeof(float), - x_data, input_stride() * sizeof(float), - w.data(), - y.data(), output_stride() * sizeof(float)); - - // Verify results. - for (size_t n = 0; n < rows(); n++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(y[n * output_stride() + c], y_ref[n * channels() + c]) - << "at row " << n << " / " << rows() - << ", channel " << c << " / " << channels(); - } - } - } - } - - private: - size_t rows_{1}; - size_t channels_{1}; - size_t input_stride_{0}; - size_t output_stride_{0}; - bool inplace_{false}; - size_t iterations_{15}; -}; diff --git a/test/prelu-nc.cc b/test/prelu-nc.cc deleted file mode 100644 index 32068f52d27..00000000000 --- a/test/prelu-nc.cc +++ /dev/null @@ -1,356 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include -#include "xnnpack/config.h" -#include "prelu-operator-tester.h" - -#ifndef XNN_EXCLUDE_F16_TESTS -TEST(PRELU_NC_F16, unit_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. 
- } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(1) - .input_channels(input_channels) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, small_batch_with_broadcasted_slope) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .slope_channels(1) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, small_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, small_batch_with_x_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .x_stride(337) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, small_batch_with_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .y_stride(347) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, small_batch_with_x_stride_and_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .x_stride(337) - .y_stride(347) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, large_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .iterations(1) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, large_batch_with_x_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. 
- } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(337) - .iterations(1) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, large_batch_with_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .y_stride(347) - .iterations(1) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, large_batch_with_x_stride_and_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(337) - .y_stride(347) - .iterations(1) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, fp32_weights) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(337) - .y_stride(347) - .weights_type(PReLUOperatorTester::WeightsType::FP32) - .iterations(1) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, weights_cache_unit_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. - } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(1) - .input_channels(input_channels) - .use_weights_cache(true) - .iterations(3) - .TestF16(); - } -} - -TEST(PRELU_NC_F16, weights_cache_fp32_weights) { - const struct xnn_prelu_config* prelu_config = xnn_init_f16_prelu_config(); - if (prelu_config == nullptr) { - GTEST_SKIP(); // F16 unsupported. 
- } - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(345) - .y_stride(347) - .weights_type(PReLUOperatorTester::WeightsType::FP32) - .use_weights_cache(true) - .iterations(1) - .TestF16(); - } -} -#endif // XNN_EXCLUDE_F16_TESTS - - -TEST(PRELU_NC_F32, unit_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(1) - .input_channels(input_channels) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, small_batch_with_broadcasted_slope) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .slope_channels(1) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, small_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, small_batch_with_x_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .x_stride(337) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, small_batch_with_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .y_stride(347) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, small_batch_with_x_stride_and_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(prelu_config->row_tile) - .input_channels(input_channels) - .x_stride(337) - .y_stride(347) - .iterations(3) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, large_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - 
.input_channels(input_channels) - .iterations(1) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, large_batch_with_x_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(337) - .iterations(1) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, large_batch_with_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .y_stride(347) - .iterations(1) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, large_batch_with_x_stride_and_y_stride) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(3 * prelu_config->row_tile + 1) - .input_channels(input_channels) - .x_stride(337) - .y_stride(347) - .iterations(1) - .TestF32(); - } -} - -TEST(PRELU_NC_F32, weights_cache_unit_batch) { - const struct xnn_prelu_config* prelu_config = xnn_init_f32_prelu_config(); - assert(prelu_config != nullptr); - for (size_t input_channels = 1; input_channels < prelu_config->channel_tile * 10; input_channels += std::max(1, prelu_config->channel_tile - 1)) { - PReLUOperatorTester() - .batch_size(1) - .input_channels(input_channels) - .use_weights_cache(true) - .iterations(3) - .TestF32(); - } -} diff --git a/test/prelu-operator-tester.h b/test/prelu-operator-tester.h deleted file mode 100644 index cc0dd7311d0..00000000000 --- a/test/prelu-operator-tester.h +++ /dev/null @@ -1,394 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/cache.h" -#include "xnnpack/math.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -class PReLUOperatorTester { - public: - enum class WeightsType { - Default, - FP32, - }; - - PReLUOperatorTester& batch_size(size_t batch_size) { - assert(batch_size != 0); - this->batch_size_ = batch_size; - return *this; - } - - size_t batch_size() const { - return this->batch_size_; - } - - PReLUOperatorTester& input_channels(size_t input_channels) { - assert(input_channels != 0); - this->input_channels_ = input_channels; - return *this; - } - - size_t input_channels() const { - return this->input_channels_; - } - - PReLUOperatorTester& slope_channels(size_t slope_channels) { - assert(slope_channels != 0); - this->slope_channels_ = slope_channels; - return *this; - } - - size_t slope_channels() const { - if (this->slope_channels_ == 0) { - return this->input_channels_; - } else { - return this->slope_channels_; - } - } - - PReLUOperatorTester& x_stride(size_t x_stride) { - assert(x_stride != 0); - this->x_stride_ = x_stride; - return *this; - } - - size_t x_stride() const { - if (this->x_stride_ == 0) { - return this->input_channels_; - } else { - assert(this->x_stride_ >= this->input_channels_); - return this->x_stride_; - } - } - - PReLUOperatorTester& y_stride(size_t y_stride) { - assert(y_stride != 0); - this->y_stride_ = y_stride; - return *this; - } - - size_t y_stride() const { - if (this->y_stride_ == 0) { - return this->input_channels_; - } else { - assert(this->y_stride_ >= this->input_channels_); - return this->y_stride_; - } - } - - PReLUOperatorTester& weights_type(WeightsType weights_type) { - this->weights_type_ = weights_type; - return *this; - } - - WeightsType weights_type() const { - return this->weights_type_; - } - - PReLUOperatorTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - PReLUOperatorTester& use_weights_cache(bool use_weights_cache) { - this->use_weights_cache_ = use_weights_cache; - return *this; - } - - bool use_weights_cache() const { - return this->use_weights_cache_; - } - - void TestF16() const { - switch (weights_type()) { - case WeightsType::Default: - break; - case WeightsType::FP32: - break; - default: - GTEST_FAIL() << "unexpected weights type"; - } - - xnnpack::ReplicableRandomDevice rng; - auto f32irng = std::uniform_real_distribution(-1.0f, 1.0f); - auto f32wrng = std::uniform_real_distribution(0.25f, 0.75f); - - xnnpack::Buffer x((batch_size() - 1) * x_stride() + input_channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer w(input_channels()); - xnnpack::Buffer w_as_float(input_channels()); - xnnpack::Buffer y((batch_size() - 1) * y_stride() + input_channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer y_ref(batch_size() * input_channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), [&] { return f32irng(rng); }); - if (slope_channels() == 1) { - std::fill(w.begin(), w.end(), f32wrng(rng)); - } else { - std::generate(w.begin(), w.end(), [&] { return f32wrng(rng); }); - } - std::copy(w.cbegin(), w.cend(), w_as_float.begin()); - - // Compute reference results, without clamping. 
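// (Editorial note, not part of the original file: the loop that follows encodes the
//  standard PReLU definition, y[i][c] = x >= 0 ? x : x * slope[c], evaluated per
//  channel without any clamping. Using std::signbit rather than `x < 0` routes
//  -0.0f through the slope branch as well; the product is still a zero, so the
//  verified results are unaffected.)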
- for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < input_channels(); c++) { - const float x_value = x[i * x_stride() + c]; - const float w_value = w_as_float[c]; - y_ref[i * input_channels() + c] = std::signbit(x_value) ? x_value * w_value : x_value; - } - } - - // Create, setup, run, and destroy PReLU operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t prelu_op = nullptr; - - struct xnn_internal_weights_cache* internal_weights_cache = nullptr; - std::unique_ptr auto_weights_cache( - nullptr, xnn_delete_weights_cache); - if (use_weights_cache()) { - xnn_weights_cache_t weights_cache = nullptr; - xnn_create_weights_cache(&weights_cache); - auto_weights_cache.reset(weights_cache); - if (weights_cache) { - internal_weights_cache = (struct xnn_internal_weights_cache*) weights_cache->context; - } - } - - const void* negative_slope_data = w.data(); - if (weights_type() == WeightsType::FP32) { - negative_slope_data = w_as_float.data(); - } - uint32_t flags = 0; - if (weights_type() == WeightsType::FP32) { - flags |= XNN_FLAG_FP32_STATIC_WEIGHTS; - } - ASSERT_EQ(xnn_status_success, - xnn_create_prelu_nc_f16( - input_channels(), slope_channels(), x_stride(), y_stride(), - negative_slope_data, - flags, /*code_cache=*/nullptr, auto_weights_cache.get(), &prelu_op)); - ASSERT_NE(nullptr, prelu_op); - if (use_weights_cache()) { - ASSERT_EQ(xnn_status_success, - xnn_finalize_weights_cache(auto_weights_cache.get(), xnn_weights_cache_finalization_kind_soft)); - } - - // Smart pointer to automatically delete prelu_op. - std::unique_ptr auto_prelu_op(prelu_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, - xnn_reshape_prelu_nc_f16( - prelu_op, - batch_size(), - /*threadpool=*/nullptr)); - - ASSERT_EQ(xnn_status_success, - xnn_setup_prelu_nc_f16( - prelu_op, - x.data(), y.data())); - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(prelu_op, /*threadpool=*/nullptr)); - - VerifyF16(y, y_ref); - - if (use_weights_cache()) { - xnn_operator_t prelu_op2 = nullptr; - const size_t old_weights_cache_size = internal_weights_cache->cache.weights.size; - - ASSERT_EQ(xnn_status_success, - xnn_create_prelu_nc_f16( - input_channels(), slope_channels(), x_stride(), y_stride(), - negative_slope_data, - flags, /*code_cache=*/nullptr, auto_weights_cache.get(), &prelu_op2)); - ASSERT_NE(nullptr, prelu_op2); - - // Smart pointer to automatically delete prelu_op2. 
- std::unique_ptr auto_prelu_op(prelu_op2, xnn_delete_operator); - - xnnpack::Buffer y2(y.size(), std::nanf("")); - ASSERT_EQ(xnn_status_success, - xnn_reshape_prelu_nc_f16( - prelu_op2, - batch_size(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_prelu_nc_f16( - prelu_op2, - x.data(), y2.data())); - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(prelu_op2, /*threadpool=*/nullptr)); - - VerifyF16(y2, y_ref); - VerifyWeightsCache(*internal_weights_cache, old_weights_cache_size); - } - } - } - - void VerifyF16(const xnnpack::Buffer& y, const xnnpack::Buffer& y_ref) const { - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < input_channels(); c++) { - ASSERT_NEAR( - y[i * y_stride() + c], - y_ref[i * input_channels() + c], - std::max(1.0e-4f, std::abs(y_ref[i * input_channels() + c]) * 1.0e-3f)) - << "at position " << i << " / " << batch_size() << ", channel " << c << " / " << input_channels(); - } - } - } - - void TestF32() const { - ASSERT_EQ(weights_type(), WeightsType::Default); - - xnnpack::ReplicableRandomDevice rng; - auto f32irng = std::uniform_real_distribution(-1.0f, 1.0f); - auto f32wrng = std::uniform_real_distribution(0.25f, 0.75f); - - xnnpack::Buffer x((batch_size() - 1) * x_stride() + input_channels() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer w(input_channels()); - xnnpack::Buffer y((batch_size() - 1) * y_stride() + input_channels() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer y_ref(batch_size() * input_channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), [&] { return f32irng(rng);} ); - if (slope_channels() == 1) { - std::fill(w.begin(), w.end(), f32wrng(rng)); - } else { - std::generate(w.begin(), w.end(), [&] { return f32wrng(rng);} ); - } - - // Compute reference results, without clamping. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < input_channels(); c++) { - y_ref[i * input_channels() + c] = std::signbit(x[i * x_stride() + c]) ? x[i * x_stride() + c] * w[c] : x[i * x_stride() + c]; - } - } - - // Create, setup, run, and destroy PReLU operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t prelu_op = nullptr; - - struct xnn_internal_weights_cache* internal_weights_cache = nullptr; - std::unique_ptr auto_weights_cache( - nullptr, xnn_delete_weights_cache); - if (use_weights_cache()) { - xnn_weights_cache_t weights_cache = nullptr; - xnn_create_weights_cache(&weights_cache); - auto_weights_cache.reset(weights_cache); - if (weights_cache) { - internal_weights_cache = (struct xnn_internal_weights_cache*) weights_cache->context; - } - } - - ASSERT_EQ(xnn_status_success, - xnn_create_prelu_nc_f32( - input_channels(), slope_channels(), x_stride(), y_stride(), - w.data(), - 0, /*code_cache=*/nullptr, auto_weights_cache.get(), &prelu_op)); - ASSERT_NE(nullptr, prelu_op); - if (use_weights_cache()) { - ASSERT_EQ(xnn_status_success, - xnn_finalize_weights_cache(auto_weights_cache.get(), xnn_weights_cache_finalization_kind_soft)); - } - - // Smart pointer to automatically delete prelu_op. 
- std::unique_ptr auto_prelu_op(prelu_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, - xnn_reshape_prelu_nc_f32( - prelu_op, - batch_size(), - /*threadpool=*/nullptr)); - - ASSERT_EQ(xnn_status_success, - xnn_setup_prelu_nc_f32( - prelu_op, - x.data(), y.data())); - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(prelu_op, /*threadpool=*/nullptr)); - - VerifyF32(y, y_ref); - - if (use_weights_cache()) { - xnn_operator_t prelu_op2 = nullptr; - const size_t old_weights_cache_size = internal_weights_cache->cache.weights.size; - - ASSERT_EQ(xnn_status_success, - xnn_create_prelu_nc_f32( - input_channels(), slope_channels(), x_stride(), y_stride(), - w.data(), - 0, /*code_cache=*/nullptr, auto_weights_cache.get(), &prelu_op2)); - ASSERT_NE(nullptr, prelu_op2); - - // Smart pointer to automatically delete prelu_op2. - std::unique_ptr auto_prelu_op(prelu_op2, xnn_delete_operator); - xnnpack::Buffer y2(y.size(), nanf("")); - - ASSERT_EQ(xnn_status_success, - xnn_reshape_prelu_nc_f32( - prelu_op2, - batch_size(), - /*threadpool=*/nullptr)); - - ASSERT_EQ(xnn_status_success, - xnn_setup_prelu_nc_f32( - prelu_op2, - x.data(), y2.data())); - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(prelu_op2, /*threadpool=*/nullptr)); - - VerifyF32(y, y_ref); - VerifyWeightsCache(*internal_weights_cache, old_weights_cache_size); - } - } - } - - void VerifyF32(const xnnpack::Buffer& y, const xnnpack::Buffer& y_ref) const { - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < input_channels(); c++) { - ASSERT_NEAR( - y[i * y_stride() + c], - y_ref[i * input_channels() + c], - std::max(1.0e-6f, std::abs(y_ref[i * input_channels() + c]) * 1.0e-6f)) - << "at position " << i << " / " << batch_size() << ", channel " << c << " / " << input_channels(); - } - } - } - - void VerifyWeightsCache(const xnn_internal_weights_cache& weights_cache, size_t old_size) const { - ASSERT_EQ(weights_cache.cache.hits, 1); - // Ensure that we did not write more weights to the cache because it was a cache hit. - ASSERT_EQ(old_size, weights_cache.cache.weights.size); - }; - - private: - size_t batch_size_{1}; - size_t input_channels_{1}; - size_t slope_channels_{0}; - size_t x_stride_{0}; - size_t y_stride_{0}; - WeightsType weights_type_{WeightsType::Default}; - bool use_weights_cache_{false}; - size_t iterations_{15}; -}; diff --git a/test/prelu.cc b/test/prelu.cc deleted file mode 100644 index 8571c89f7f6..00000000000 --- a/test/prelu.cc +++ /dev/null @@ -1,366 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -template < - typename InputType, - typename WeightType = InputType, - typename OutputType = InputType> -class PreluTest : public ::testing::Test { - protected: - void SetUp() override { - dim_dist = std::uniform_int_distribution(1, 9); - input_dims = RandomShape(4); - output_dims = input_dims; - batch_size = input_dims[0] * input_dims[1] * input_dims[2]; - input_channels = input_dims[3]; - slope_channels = input_dims[3]; - // Randomly broadcast slope. 
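// (Editorial note, not part of the original file: with roughly a 2-in-9 chance the
//  check below collapses the slope tensor to a single channel, exercising the
//  broadcast case in which one slope value is applied across every input channel.)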
- if (dim_dist(rng) < 3) { - slope_channels = 1; - } - slope_dims = {slope_channels}; - input = xnnpack::Buffer(XNN_EXTRA_BYTES / sizeof(InputType) + NumElements(input_dims)); - slope = xnnpack::Buffer(slope_channels); - operator_output = xnnpack::Buffer(NumElements(output_dims)); - subgraph_output = xnnpack::Buffer(operator_output.size()); - } - - std::vector RandomShape(size_t num_dims) - { - std::vector dims(num_dims); - std::generate(dims.begin(), dims.end(), [&] { return dim_dist(rng); }); - return dims; - } - - size_t NumElements(std::vector& dims) - { - return std::accumulate(dims.begin(), dims.end(), size_t(1), std::multiplies()); - } - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution dim_dist; - - std::vector output_dims; - std::vector input_dims; - std::vector slope_dims; - xnnpack::Buffer input; - xnnpack::Buffer slope; - xnnpack::Buffer operator_output; - xnnpack::Buffer subgraph_output; - size_t input_channels; - size_t slope_channels; - size_t batch_size; -}; - -using PreluTestF16 = PreluTest; -using PreluTestF32 = PreluTest; - -TEST_F(PreluTestF16, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input_dims.size(), input_dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t slope_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, slope_dims.size(), slope_dims.data(), slope.data(), 1, - /*flags=*/0, &slope_id)); - ASSERT_NE(slope_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input_dims.size(), input_dims.data(), nullptr, 2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_prelu); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->inputs[1], slope_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(PreluTestF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input_dims.size(), input_dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t slope_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, slope_dims.size(), slope_dims.data(), slope.data(), 1, - 
/*flags=*/0, &slope_id)); - ASSERT_NE(slope_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input_dims.size(), input_dims.data(), nullptr, 2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_prelu); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 2); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->inputs[1], slope_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(PreluTestF16, matches_operator_api) -{ - std::uniform_real_distribution f32idist(-1.0f, 1.0f); - std::uniform_real_distribution f32wdist(0.25f, 0.75f); - std::generate(input.begin(), input.end(), [&]() { return f32idist(rng); }); - std::generate(slope.begin(), slope.end(), [&]() { return f32wdist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_prelu_nc_f16(input_channels, slope_channels, input_channels, input_channels, slope.data(), XNN_FLAG_FP32_STATIC_WEIGHTS, nullptr, nullptr, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, - xnn_reshape_prelu_nc_f16(op, batch_size, /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, - xnn_setup_prelu_nc_f16(op, input.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
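// (Editorial note, not part of the original file: the remainder of this test builds
//  the same PReLU computation as a one-node subgraph, runs it through the runtime
//  API, and requires the subgraph output buffer to match the operator-API output
//  element for element.)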
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, input_dims.size(), input_dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t slope_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, slope_dims.size(), slope_dims.data(), slope.data(), /*external_id=*/1, - /*flags=*/0, &slope_id)); - ASSERT_NE(slope_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(PreluTestF32, matches_operator_api) -{ - std::uniform_real_distribution f32idist(-1.0f, 1.0f); - std::uniform_real_distribution f32wdist(0.25f, 0.75f); - std::generate(input.begin(), input.end(), [&]() { return f32idist(rng); }); - std::generate(slope.begin(), slope.end(), [&]() { return f32wdist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_prelu_nc_f32(input_channels, slope_channels, input_channels, input_channels, slope.data(), /*flags=*/0, nullptr, nullptr, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, - xnn_reshape_prelu_nc_f32(op, batch_size, /*threadpool=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, - xnn_setup_prelu_nc_f32(op, input.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input_dims.size(), input_dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t slope_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, slope_dims.size(), slope_dims.data(), slope.data(), /*external_id=*/1, - /*flags=*/0, &slope_id)); - ASSERT_NE(slope_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(PreluTestF32, reshape_output) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call subgraph API. 
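// (Editorial note, not part of the original file: this test reshapes the external
//  input after the runtime has been set up. Growing the input dimensions is
//  expected to make the node's reshape hook report xnn_status_reallocation_required,
//  while shrinking one dimension again relative to the last reshape succeeds; in
//  both cases the output shape must track the new input dimensions.)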
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, input_dims.size(), input_dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t slope_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, slope_dims.size(), slope_dims.data(), slope.data(), /*external_id=*/1, - /*flags=*/0, &slope_id)); - ASSERT_NE(slope_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/2, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_prelu(subgraph, input_id, slope_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - input_dims[0] += 2; - input_dims[1] += 2; - input_dims[2] += 2; - - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_dims.size(), input_dims.data())); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_reallocation_required); - const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; - for (size_t i = 0; i < input_dims.size(); ++i) { - ASSERT_EQ(input_dims[i], output_shape->dim[i]); - } - - input_dims[1] -= 1; - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, input_id, input_dims.size(), input_dims.data())); - ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_success); - for (size_t i = 0; i < input_dims.size(); ++i) { - ASSERT_EQ(input_dims[i], output_shape->dim[i]); - } -} diff --git a/test/qs16-qs8-vcvt.cc b/test/qs16-qs8-vcvt.cc index 0d9531a3de9..f947e0c9628 100644 --- a/test/qs16-qs8-vcvt.cc +++ b/test/qs16-qs8-vcvt.cc @@ -23,5 +23,5 @@ XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, u \ \ XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/qs16-qs8-vcvt/qs16-qs8-vcvt.h" +#include "qs16-qs8-vcvt/qs16-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-multipass-fp32.cc b/test/qs8-dwconv-minmax-multipass-fp32.cc index 51e7ee9801a..c2c1e70d2ab 100644 --- a/test/qs8-dwconv-minmax-multipass-fp32.cc +++ b/test/qs8-dwconv-minmax-multipass-fp32.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include 
"src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h" +#include "qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-multipass-rndnu.cc b/test/qs8-dwconv-minmax-multipass-rndnu.cc index 08efff8519b..8afaff02c64 100644 --- a/test/qs8-dwconv-minmax-multipass-rndnu.cc +++ b/test/qs8-dwconv-minmax-multipass-rndnu.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h" +#include "qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-unipass-fp32.cc b/test/qs8-dwconv-minmax-unipass-fp32.cc index 4da93830b13..8e688cfee62 100644 --- a/test/qs8-dwconv-minmax-unipass-fp32.cc +++ b/test/qs8-dwconv-minmax-unipass-fp32.cc @@ -204,5 +204,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h" +#include "qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-dwconv-minmax-unipass-rndnu.cc b/test/qs8-dwconv-minmax-unipass-rndnu.cc index 13ee096f8b5..d0ec65e4121 100644 --- a/test/qs8-dwconv-minmax-unipass-rndnu.cc +++ b/test/qs8-dwconv-minmax-unipass-rndnu.cc @@ -204,5 +204,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h" +#include "qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-f16-vcvt.cc b/test/qs8-f16-vcvt.cc index 132e9d9ef08..f410d3027bb 100644 --- a/test/qs8-f16-vcvt.cc +++ b/test/qs8-f16-vcvt.cc @@ -22,5 +22,5 @@ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/qs8-f16-vcvt/qs8-f16-vcvt.h" +#include "qs8-f16-vcvt/qs8-f16-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-f32-vcvt.cc b/test/qs8-f32-vcvt.cc index 5e2ee068a88..9415e7df8a1 100644 --- a/test/qs8-f32-vcvt.cc +++ b/test/qs8-f32-vcvt.cc @@ -22,5 +22,5 @@ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/qs8-f32-vcvt/qs8-f32-vcvt.h" +#include "qs8-f32-vcvt/qs8-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-packw.cc b/test/qs8-packw.cc index 0f69c429a15..23a635403c7 100644 --- a/test/qs8-packw.cc +++ b/test/qs8-packw.cc @@ -33,7 +33,7 @@ std::string GetTestQS8Name(const testing::TestParamInfo& { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale, izp }, const XnnTestQS8Param xnn_test_qs8_params[] = { -#include "src/qs8-packw/qs8-packw.h" +#include "qs8-packw/qs8-packw.h" }; #undef XNN_QS8_UKERNEL diff --git a/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc b/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc index 5ae7ca8f286..4dfea67e351 100644 --- a/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc +++ b/test/qs8-qc8w-dwconv-minmax-multipass-fp32.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const 
testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h" +#include "qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc b/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc index 95f334d0e22..7b761dc8e85 100644 --- a/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc +++ b/test/qs8-qc8w-dwconv-minmax-unipass-fp32.cc @@ -204,5 +204,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h" +#include "qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-requantization.cc b/test/qs8-requantization.cc index 53bf1cc92cb..e9d541ed3a0 100644 --- a/test/qs8-requantization.cc +++ b/test/qs8-requantization.cc @@ -15,288 +15,6 @@ #include "xnnpack/requantization-stubs.h" #include "requantization-tester.h" -/* - * Round-to-nearest, ties away from zero, scalar implementation using unsigned 32-bit arithmetics. - */ - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned32); - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__scalar_unsigned32); -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED32, random_cases) { - RequantizationTester() - 
.qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned32); -} - - -/* - * Round-to-nearest, ties away from zero, scalar implementation using unsigned 64-bit arithmetics. - */ - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned64); - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__scalar_unsigned64); -} - -TEST(QS8_RNDNA__SCALAR_UNSIGNED64, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned64); -} - - -/* - * Round-to-nearest, ties away from zero, scalar implementation using signed 64-bit arithmetics. 
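 *
 * (Editorial sketch, not part of the original comment: conceptually every rndna
 *  requantizer under test computes
 *      q = clamp(zero_point + round_half_away_from_zero(value * scale), qmin, qmax)
 *  with the real-valued scale < 1 held in fixed point. The "signed 64-bit" flavor
 *  keeps the value-by-multiplier product in int64_t and applies a sign-dependent
 *  rounding offset before the arithmetic shift so that exact ties move away from
 *  zero for both positive and negative values.)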
- */ - -TEST(QS8_RNDNA__SCALAR_SIGNED64, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_signed64); - } -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__scalar_signed64); -} - -TEST(QS8_RNDNA__SCALAR_SIGNED64, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__scalar_signed64); -} - - /* * Round-to-nearest, ties up, scalar implementation using signed 64-bit arithmetics. */ @@ -361,417 +79,121 @@ TEST(QS8_RNDNU__SCALAR, divide_by_po2_with_rounding_down) { TEST(QS8_RNDNU__SCALAR, divide_by_po2_with_rounding_away) { for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesUp(xnn_qs8_requantize_rndnu__scalar); - } - } -} - -TEST(QS8_RNDNU__SCALAR, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantize_rndnu__scalar); -} - - -/* - * FP32-based scalar implementation using lrintf function. 
- */ - -TEST(QS8_FP32__SCALAR_LRINTF, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(1000) - .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_lrintf); -} - - -/* - * FP32-based scalar implementation using magic trick for FP32->INT32 conversion. - */ - -TEST(QS8_FP32__SCALAR_FMAGIC, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(1000) - .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_fmagic); -} - - -/* - * GEMMLOWP-equivalent scalar implementation. - */ - -TEST(QS8_GEMMLOWP__SCALAR, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__scalar); - } -} - -TEST(QS8_GEMMLOWP__SCALAR, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__scalar); - } - } -} - -TEST(QS8_GEMMLOWP__SCALAR, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_gemmlowp__scalar); - } - } -} - -/* No rounding down test - it fails because of upward bias in multiplication */ -/* No rounding away test - it fails because of upward bias in multiplication */ - -TEST(QS8_GEMMLOWP__SCALAR, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_gemmlowp__scalar); -} - -TEST(QS8_GEMMLOWP__SCALAR, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesApproximate(xnn_qs8_requantize_gemmlowp__scalar); -} - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - /* - * Round-to-nearest, ties away from zero, SSE2 implementation using floating-point shuffle. 
- */ - - TEST(QS8_RNDNA__SSE2, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse2); - } - } - - TEST(QS8_RNDNA__SSE2, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse2); - } - } - } - - TEST(QS8_RNDNA__SSE2, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__sse2); - } - } - } - - TEST(QS8_RNDNA__SSE2, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__sse2); - } - } - } - - TEST(QS8_RNDNA__SSE2, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__sse2); - } - } - } - - TEST(QS8_RNDNA__SSE2, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__sse2); - } - - TEST(QS8_RNDNA__SSE2, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__sse2); - } - - - /* - * Round-to-nearest, ties away from zero, SSSE3 implementation using floating-point shuffle. 
- */ - - TEST(QS8_RNDNA__SSSE3, exact_divide_by_po2) { - TEST_REQUIRES_X86_SSSE3; - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__ssse3); - } - } - - TEST(QS8_RNDNA__SSSE3, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__ssse3); - } - } - } - - TEST(QS8_RNDNA__SSSE3, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__ssse3); - } - } - } - - TEST(QS8_RNDNA__SSSE3, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__ssse3); - } - } - } - - TEST(QS8_RNDNA__SSSE3, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__ssse3); - } + zero_point <= std::numeric_limits::max(); + zero_point++) + { + for (uint32_t s = 1; s < 32; s++) { + RequantizationTester() + .zero_point(zero_point) + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .s(s) + .TestDivideByPO2WithRoundingTiesUp(xnn_qs8_requantize_rndnu__scalar); } } +} - TEST(QS8_RNDNA__SSSE3, special_cases) { - TEST_REQUIRES_X86_SSSE3; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__ssse3); - } +TEST(QS8_RNDNU__SCALAR, random_cases) { + RequantizationTester() + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .iterations(100) + .TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantize_rndnu__scalar); +} - TEST(QS8_RNDNA__SSSE3, random_cases) { - TEST_REQUIRES_X86_SSSE3; + +/* + * FP32-based scalar implementation using lrintf function. + */ + +TEST(QS8_FP32__SCALAR_LRINTF, random_cases) { + RequantizationTester() + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .iterations(1000) + .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_lrintf); +} + + +/* + * FP32-based scalar implementation using magic trick for FP32->INT32 conversion. 
+ */ + +TEST(QS8_FP32__SCALAR_FMAGIC, random_cases) { + RequantizationTester() + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .iterations(1000) + .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_fmagic); +} + + +/* + * GEMMLOWP-equivalent scalar implementation. + */ + +TEST(QS8_GEMMLOWP__SCALAR, exact_divide_by_po2) { + for (uint32_t s = 1; s < 32; s++) { RequantizationTester() .qmin(std::numeric_limits::min()) .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__ssse3); + .s(s) + .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__scalar); } +} - - /* - * Round-to-nearest, ties away from zero, SSE4.1 implementation using static blend instruction. - */ - - TEST(QS8_RNDNA__SSE41, exact_divide_by_po2) { - TEST_REQUIRES_X86_SSE41; +TEST(QS8_GEMMLOWP__SCALAR, exact_divide_by_po2_with_zero_point) { + for (int32_t zero_point = std::numeric_limits::min(); + zero_point <= std::numeric_limits::max(); + zero_point++) + { for (uint32_t s = 1; s < 32; s++) { RequantizationTester() + .zero_point(zero_point) .qmin(std::numeric_limits::min()) .qmax(std::numeric_limits::max()) .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse41); - } - } - - TEST(QS8_RNDNA__SSE41, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse41); - } - } - } - - TEST(QS8_RNDNA__SSE41, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__sse41); - } + .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__scalar); } } +} - TEST(QS8_RNDNA__SSE41, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__sse41); - } +TEST(QS8_GEMMLOWP__SCALAR, divide_by_po2_with_rounding_up) { + for (int32_t zero_point = std::numeric_limits::min(); + zero_point <= std::numeric_limits::max(); + zero_point++) + { + for (uint32_t s = 1; s < 32; s++) { + RequantizationTester() + .zero_point(zero_point) + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .s(s) + .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_gemmlowp__scalar); } } +} - TEST(QS8_RNDNA__SSE41, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__sse41); - 
} - } - } +/* No rounding down test - it fails because of upward bias in multiplication */ +/* No rounding away test - it fails because of upward bias in multiplication */ - TEST(QS8_RNDNA__SSE41, special_cases) { - TEST_REQUIRES_X86_SSE41; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__sse41); - } +TEST(QS8_GEMMLOWP__SCALAR, special_cases) { + RequantizationTester() + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .TestSpecialCases(xnn_qs8_requantize_gemmlowp__scalar); +} - TEST(QS8_RNDNA__SSE41, random_cases) { - TEST_REQUIRES_X86_SSE41; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__sse41); - } +TEST(QS8_GEMMLOWP__SCALAR, random_cases) { + RequantizationTester() + .qmin(std::numeric_limits::min()) + .qmax(std::numeric_limits::max()) + .iterations(100) + .TestRandomCasesApproximate(xnn_qs8_requantize_gemmlowp__scalar); +} +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 /* * Round-to-nearest, ties up, SSE4.1 implementation using arithmetic shift right. */ @@ -1191,107 +613,6 @@ TEST(QS8_GEMMLOWP__SCALAR, random_cases) { #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - /* - * Round-to-nearest, ties away from zero, ARM NEON implementation. - */ - - TEST(QS8_RNDNA__NEON, exact_divide_by_po2) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .s(s) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__neon); - } - } - - TEST(QS8_RNDNA__NEON, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qs8_requantize_rndna__neon); - } - } - } - - TEST(QS8_RNDNA__NEON, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__neon); - } - } - } - - TEST(QS8_RNDNA__NEON, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__neon); - } - } - } - - TEST(QS8_RNDNA__NEON, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = std::numeric_limits::min(); - zero_point <= std::numeric_limits::max(); - zero_point++) - { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__neon); - } - } - } - - 
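
The "upward bias" that the two comments above refer to comes from the GEMMLOWP-style requantization rounding twice: once inside the saturating doubling high multiply and once more in the rounding right shift. Below is a minimal, self-contained sketch of how an early rounding step can push the final result one step above a single round-to-nearest of the exact product; the helper name and constants are hypothetical, and this is not the xnn_qs8_requantize_gemmlowp__scalar implementation.

#include <cstdint>
#include <cstdio>

// Round-to-nearest (ties up) division by 2^n, valid for non-negative x.
static int64_t round_shift(int64_t x, int n) {
  return (x + (int64_t{1} << (n - 1))) >> n;
}

int main() {
  // Hypothetical product of an input value and a fixed-point multiplier.
  const int64_t product = int64_t{1} << 30;
  // Two-step rounding: round at bit 31 (the doubling high multiply),
  // then round again for the remaining shift by 1.
  const int64_t two_step = round_shift(round_shift(product, 31), 1);
  // Single rounding of the exact product by the combined shift of 32.
  const int64_t one_step = round_shift(product, 32);
  // Prints 1 and 0: the intermediate rounding biased the result upward,
  // which is why only the rounding-up behavior is asserted for this path.
  std::printf("two-step: %lld, one-step: %lld\n",
              (long long) two_step, (long long) one_step);
  return 0;
}
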
TEST(QS8_RNDNA__NEON, special_cases) { - TEST_REQUIRES_ARM_NEON; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qs8_requantize_rndna__neon); - } - - TEST(QS8_RNDNA__NEON, random_cases) { - TEST_REQUIRES_ARM_NEON; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__neon); - } - - /* * Round-to-nearest, ties up, ARM NEON implementation using extended multiplication. */ diff --git a/test/qs8-vadd-minmax.cc b/test/qs8-vadd-minmax.cc index aa3b0d66c63..45860f4013d 100644 --- a/test/qs8-vadd-minmax.cc +++ b/test/qs8-vadd-minmax.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qs8-vadd/qs8-vadd-minmax.h" +#include "qs8-vadd/qs8-vadd-minmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vaddc-minmax.cc b/test/qs8-vaddc-minmax.cc index 41fdd0148d9..cdef11d3293 100644 --- a/test/qs8-vaddc-minmax.cc +++ b/test/qs8-vaddc-minmax.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qs8-vaddc/qs8-vaddc-minmax.h" +#include "qs8-vaddc/qs8-vaddc-minmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vcvt.cc b/test/qs8-vcvt.cc index 4b3237b3962..4716a82d19c 100644 --- a/test/qs8-vcvt.cc +++ b/test/qs8-vcvt.cc @@ -22,5 +22,5 @@ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/qs8-vcvt/qs8-vcvt.h" +#include "qs8-vcvt/qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vhswish.cc b/test/qs8-vhswish.cc index c958a985bde..093c95ab5d6 100644 --- a/test/qs8-vhswish.cc +++ b/test/qs8-vhswish.cc @@ -1,1561 +1,83 @@ -// Copyright 2023 Google LLC +// Copyright 2019 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
-// Specification: test/qs8-vhswish.yaml -// Generator: tools/generate-vhswish-test.py +// Microkernel: qs8-vhswish +// Generator: tools/generate-vunary-test.py -#include +#include +#include +#include +#include +#include #include +#include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/isa-checks.h" #include "xnnpack/microparams-init.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" +#include "next_prime.h" #include "vhswish-microkernel-tester.h" - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_VHSWISH__NEON_U8, batch_eq_8) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(8) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__NEON_U8, batch_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U8, batch_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U8, batch_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U8, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U8, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U8, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U8, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_VHSWISH__NEON_U16, batch_eq_16) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__NEON_U16, batch_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 32; batch_size < 160; batch_size += 
16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U16, batch_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U16, batch_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U16, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U16, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U16, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U16, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_VHSWISH__NEON_U32, batch_eq_32) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__NEON_U32, batch_div_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U32, batch_lt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U32, batch_gt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__NEON_U32, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for 
(float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U32, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U32, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__NEON_U32, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__neon_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSE2_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSE2; - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSE2_U16, batch_div_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U16, batch_gt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U16, input_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U16, output_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U16, input_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - 
.batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U16, output_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse2_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSE2_U32, batch_eq_32) { - TEST_REQUIRES_X86_SSE2; - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSE2_U32, batch_div_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE2_U32, input_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U32, output_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U32, input_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE2_U32, output_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse2_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSSE3_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSSE3; - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - - 
TEST(QS8_VHSWISH__SSSE3_U16, batch_div_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, batch_gt_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, input_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, output_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, input_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U16, output_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSSE3_U32, batch_eq_32) { - TEST_REQUIRES_X86_SSSE3; - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSSE3_U32, batch_div_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSSE3_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSSE3_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - 
TEST(QS8_VHSWISH__SSSE3_U32, input_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U32, output_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U32, input_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSSE3_U32, output_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__ssse3_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSE41_U8, batch_eq_8) { - TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(8) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSE41_U8, batch_div_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U8, batch_lt_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U8, batch_gt_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U8, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U8, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U8, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point 
< 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U8, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSE41_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSE41_U16, batch_div_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U16, batch_gt_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U16, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U16, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U16, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U16, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__SSE41_U32, batch_eq_32) { - 
TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__SSE41_U32, batch_div_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__SSE41_U32, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U32, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U32, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__SSE41_U32, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__sse41_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__AVX_U8, batch_eq_8) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(8) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__AVX_U8, batch_div_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U8, batch_lt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U8, batch_gt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - 
VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U8, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U8, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U8, input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U8, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u8, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__AVX_U16, batch_eq_16) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__AVX_U16, batch_div_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U16, batch_lt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U16, batch_gt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U16, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U16, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U16, 
input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U16, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u16, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_VHSWISH__AVX_U32, batch_eq_32) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - - TEST(QS8_VHSWISH__AVX_U32, batch_div_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U32, batch_lt_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U32, batch_gt_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - - TEST(QS8_VHSWISH__AVX_U32, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U32, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U32, input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } - - TEST(QS8_VHSWISH__AVX_U32, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__avx_u32, xnn_init_qs8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || 
XNN_ARCH_WASMRELAXEDSIMD - TEST(QS8_VHSWISH__WASMSIMD_U8, batch_eq_8) { - VHSwishMicrokernelTester() - .batch_size(8) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, batch_div_8) { - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, batch_lt_8) { - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, batch_gt_8) { - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, input_scale) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, output_scale) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U8, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u8, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QS8_VHSWISH__WASMSIMD_U16, batch_eq_16) { - VHSwishMicrokernelTester() - .batch_size(16) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, batch_div_16) { - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, batch_lt_16) { - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, batch_gt_16) { - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - - 
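
The input_scale, output_scale, input_zero_point and output_zero_point knobs exercised by these tests parameterize the quantized hard-swish mapping itself. As a point of reference only, a float sketch of that mapping (dequantize, apply x * relu6(x + 3) / 6, requantize with saturation) might look as follows; the function name is made up, and this is not the tester's or the kernels' fixed-point arithmetic.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Rough float reference for QS8 hard-swish; assumes the usual
// scale/zero-point affine quantization on both input and output.
int8_t qs8_hardswish_ref(int8_t q_in,
                         float input_scale, int32_t input_zero_point,
                         float output_scale, int32_t output_zero_point) {
  const float x = input_scale * static_cast<float>(q_in - input_zero_point);
  const float y = x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
  const long q = std::lrintf(y / output_scale) + output_zero_point;
  return static_cast<int8_t>(std::min<long>(std::max<long>(q, -128), 127));
}
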
TEST(QS8_VHSWISH__WASMSIMD_U16, input_scale) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, output_scale) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U16, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u16, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QS8_VHSWISH__WASMSIMD_U32, batch_eq_32) { - VHSwishMicrokernelTester() - .batch_size(32) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, batch_div_32) { - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, batch_lt_32) { - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, batch_gt_32) { - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, input_scale) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, output_scale) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - 
.input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } - - TEST(QS8_VHSWISH__WASMSIMD_U32, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__wasmsimd_u32, xnn_init_qs8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -TEST(QS8_VHSWISH__SCALAR_U1, batch_eq_1) { - VHSwishMicrokernelTester() - .batch_size(1) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); -} - -TEST(QS8_VHSWISH__SCALAR_U1, batch_gt_1) { - for (size_t batch_size = 2; batch_size < 10; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U1, input_scale) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U1, output_scale) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U1, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U1, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u1, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, batch_eq_2) { - VHSwishMicrokernelTester() - .batch_size(2) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); -} - -TEST(QS8_VHSWISH__SCALAR_U2, batch_div_2) { - for (size_t batch_size = 4; batch_size < 20; batch_size += 2) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, batch_lt_2) { - for (size_t batch_size = 1; batch_size < 2; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, batch_gt_2) { - for (size_t batch_size = 3; batch_size < 4; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, input_scale) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - for (float input_scale : 
{4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, output_scale) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U2, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u2, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, batch_eq_4) { - VHSwishMicrokernelTester() - .batch_size(4) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); -} - -TEST(QS8_VHSWISH__SCALAR_U4, batch_div_4) { - for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, batch_lt_4) { - for (size_t batch_size = 1; batch_size < 4; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, batch_gt_4) { - for (size_t batch_size = 5; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, input_scale) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, output_scale) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } - } -} - -TEST(QS8_VHSWISH__SCALAR_U4, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - 
VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_zero_point(output_zero_point) - .Test(xnn_qs8_vhswish_ukernel__scalar_u4, xnn_init_qs8_hswish_scalar_params); - } - } -} \ No newline at end of file +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ + \ +XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ + \ +XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +TEST(ukernel, input_scale) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + for (float input_scale : {4.0f, 16.0f, 64.0f}) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .input_scale(input_scale) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, output_scale) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + for (float output_scale : {4.0f, 16.0f, 64.0f}) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .output_scale(output_scale) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, input_zero_point) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .input_zero_point(input_zero_point) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, output_zero_point) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .output_zero_point(output_zero_point) \ + .Test(ukernel, init_params); \ + } \ + } \ +} +#include "qs8-vhswish/qs8-vhswish.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vhswish.yaml b/test/qs8-vhswish.yaml deleted file mode 100644 index 0c0154c381a..00000000000 --- a/test/qs8-vhswish.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
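
The regenerated test above replaces the per-kernel TEST bodies with one XNN_UKERNEL_WITH_PARAMS macro that is expanded for every kernel listed in the included qs8-vhswish/qs8-vhswish.h (define the macro, include the list, undef it), so the test file stays in sync with the kernel declarations automatically. A generic, self-contained sketch of that X-macro pattern, using made-up names in place of the real header contents:

#include <cstdio>

// Stand-in for the included kernel list; in the real setup these entries
// live in a shared .h file and invoke whatever macro the includer defined.
#define MY_UKERNEL_LIST(X)   \
  X(my_ukernel_scalar_u1, 1) \
  X(my_ukernel_neon_u8, 8)

struct Entry { const char* name; int batch_tile; };

// Expand the list into a table; a test file would instead expand it into
// one or more TEST bodies per entry, as done above for qs8-vhswish.
#define MAKE_ENTRY(name, batch_tile) {#name, batch_tile},
static const Entry kEntries[] = { MY_UKERNEL_LIST(MAKE_ENTRY) };
#undef MAKE_ENTRY

int main() {
  for (const Entry& e : kEntries) {
    std::printf("%s: batch_tile=%d\n", e.name, e.batch_tile);
  }
  return 0;
}

The same define/include/undef idiom is what the other test files touched in this diff (qs8-vadd, qs8-vlrelu, qs8-vmul, qu8-dwconv) already use; for those files only the include paths change.
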
- -# ARM NEON -- name: xnn_qs8_vhswish_ukernel__neon_u8 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__neon_u16 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__neon_u32 - init: xnn_init_qs8_hswish_scalar_params - -# x86 SSE2 -- name: xnn_qs8_vhswish_ukernel__sse2_u16 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__sse2_u32 - init: xnn_init_qs8_hswish_sse2_params - -# x86 SSSE3 -- name: xnn_qs8_vhswish_ukernel__ssse3_u16 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__ssse3_u32 - init: xnn_init_qs8_hswish_sse2_params - -# x86 SSE4.1 -- name: xnn_qs8_vhswish_ukernel__sse41_u8 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__sse41_u16 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__sse41_u32 - init: xnn_init_qs8_hswish_sse2_params - -# x86 AVX -- name: xnn_qs8_vhswish_ukernel__avx_u8 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__avx_u16 - init: xnn_init_qs8_hswish_sse2_params -- name: xnn_qs8_vhswish_ukernel__avx_u32 - init: xnn_init_qs8_hswish_sse2_params - -# WAsm Relaxed SIMD -- name: xnn_qs8_vhswish_ukernel__wasmsimd_u8 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__wasmsimd_u16 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__wasmsimd_u32 - init: xnn_init_qs8_hswish_scalar_params - -# Scalar -- name: xnn_qs8_vhswish_ukernel__scalar_u1 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__scalar_u2 - init: xnn_init_qs8_hswish_scalar_params -- name: xnn_qs8_vhswish_ukernel__scalar_u4 - init: xnn_init_qs8_hswish_scalar_params diff --git a/test/qs8-vlrelu.cc b/test/qs8-vlrelu.cc index adec5d8a2fb..650b91404c0 100644 --- a/test/qs8-vlrelu.cc +++ b/test/qs8-vlrelu.cc @@ -20,7 +20,7 @@ #include "xnnpack/isa-checks.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "next_prime.h" #include "vlrelu-microkernel-tester.h" @@ -55,5 +55,5 @@ TEST(ukernel, negative_scale) { } \ } \ } -#include "src/qs8-vlrelu/qs8-vlrelu.h" +#include "qs8-vlrelu/qs8-vlrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vmul-minmax-fp32.cc b/test/qs8-vmul-minmax-fp32.cc index 84192475916..71b20362efb 100644 --- a/test/qs8-vmul-minmax-fp32.cc +++ b/test/qs8-vmul-minmax-fp32.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qs8-vmul/qs8-vmul-minmax-fp32.h" +#include "qs8-vmul/qs8-vmul-minmax-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vmul-minmax-rndnu.cc b/test/qs8-vmul-minmax-rndnu.cc index 415282a69ec..a8e646f9be8 100644 --- a/test/qs8-vmul-minmax-rndnu.cc +++ b/test/qs8-vmul-minmax-rndnu.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qs8-vmul/qs8-vmul-minmax-rndnu.h" +#include "qs8-vmul/qs8-vmul-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vmulc-minmax-fp32.cc b/test/qs8-vmulc-minmax-fp32.cc index 
17c4c06dce7..88b6b2668b9 100644 --- a/test/qs8-vmulc-minmax-fp32.cc +++ b/test/qs8-vmulc-minmax-fp32.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h" +#include "qs8-vmulc/qs8-vmulc-minmax-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vmulc-minmax-rndnu.cc b/test/qs8-vmulc-minmax-rndnu.cc index a2deac30969..c51e1dfdffb 100644 --- a/test/qs8-vmulc-minmax-rndnu.cc +++ b/test/qs8-vmulc-minmax-rndnu.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h" +#include "qs8-vmulc/qs8-vmulc-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-multipass-fp32.cc b/test/qu8-dwconv-minmax-multipass-fp32.cc index 29dd4cdf532..d73327d2e9a 100644 --- a/test/qu8-dwconv-minmax-multipass-fp32.cc +++ b/test/qu8-dwconv-minmax-multipass-fp32.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-multipass-rndnu.cc b/test/qu8-dwconv-minmax-multipass-rndnu.cc index b300ab8218a..bd8cc9d10d3 100644 --- a/test/qu8-dwconv-minmax-multipass-rndnu.cc +++ b/test/qu8-dwconv-minmax-multipass-rndnu.cc @@ -300,5 +300,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h" +#include "qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-unipass-fp32.cc b/test/qu8-dwconv-minmax-unipass-fp32.cc index 586851cea87..4dd4997d469 100644 --- a/test/qu8-dwconv-minmax-unipass-fp32.cc +++ b/test/qu8-dwconv-minmax-unipass-fp32.cc @@ -225,5 +225,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h" +#include "qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-dwconv-minmax-unipass-rndnu.cc b/test/qu8-dwconv-minmax-unipass-rndnu.cc index 361838e6368..3891a756d71 100644 --- a/test/qu8-dwconv-minmax-unipass-rndnu.cc +++ b/test/qu8-dwconv-minmax-unipass-rndnu.cc @@ -225,5 +225,5 @@ INSTANTIATE_TEST_SUITE_P( [](const testing::TestParamInfo& info) { \ return info.param.test_name; \ }); -#include "src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h" +#include "qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-f32-vcvt.cc b/test/qu8-f32-vcvt.cc index e37575b1b1c..909563c8e75 100644 --- a/test/qu8-f32-vcvt.cc +++ b/test/qu8-f32-vcvt.cc @@ -22,5 +22,5 @@ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, 
datatype_in, datatype_out, ukernel, init_params); -#include "src/qu8-f32-vcvt/qu8-f32-vcvt.h" +#include "qu8-f32-vcvt/qu8-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qu8-requantization.cc b/test/qu8-requantization.cc index c5cd2a23929..5501de70e86 100644 --- a/test/qu8-requantization.cc +++ b/test/qu8-requantization.cc @@ -15,254 +15,6 @@ #include "xnnpack/requantization-stubs.h" #include "requantization-tester.h" -/* - * Round-to-nearest, ties away from zero, scalar implementation using unsigned 32-bit arithmetics. - */ - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_unsigned32); - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__scalar_unsigned32); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__scalar_unsigned32); -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED32, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__scalar_unsigned32); -} - - -/* - * Round-to-nearest, ties away from zero, scalar implementation using unsigned 64-bit arithmetics. 
- */ - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_unsigned64); - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__scalar_unsigned64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__scalar_unsigned64); -} - -TEST(QU8_RNDNA__SCALAR_UNSIGNED64, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__scalar_unsigned64); -} - - -/* - * Round-to-nearest, ties away from zero, scalar implementation using signed 64-bit arithmetics. 
- */ - -TEST(QU8_RNDNA__SCALAR_SIGNED64, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_signed64); - } -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__scalar_signed64); - } - } -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__scalar_signed64); -} - -TEST(QU8_RNDNA__SCALAR_SIGNED64, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__scalar_signed64); -} - /* * FP32-based scalar implementation using lrintf function. @@ -350,269 +102,6 @@ TEST(QU8_GEMMLOWP__SCALAR, random_cases) { #if XNN_ARCH_X86 || XNN_ARCH_X86_64 - /* - * Round-to-nearest, ties away from zero, SSE2 implementation using floating-point shuffle. 
- */ - - TEST(QU8_RNDNA__SSE2, exact_divide_by_po2) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__sse2); - } - } - - TEST(QU8_RNDNA__SSE2, exact_divide_by_po2_with_zero_point) { - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__sse2); - } - } - } - - TEST(QU8_RNDNA__SSE2, divide_by_po2_with_rounding_up) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__sse2); - } - } - } - - TEST(QU8_RNDNA__SSE2, divide_by_po2_with_rounding_down) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__sse2); - } - } - } - - TEST(QU8_RNDNA__SSE2, divide_by_po2_with_rounding_away) { - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__sse2); - } - } - } - - TEST(QU8_RNDNA__SSE2, special_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__sse2); - } - - TEST(QU8_RNDNA__SSE2, random_cases) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__sse2); - } - - - /* - * Round-to-nearest, ties away from zero, SSSE3 implementation using floating-point shuffle. 
- */ - - TEST(QU8_RNDNA__SSSE3, exact_divide_by_po2) { - TEST_REQUIRES_X86_SSSE3; - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__ssse3); - } - } - - TEST(QU8_RNDNA__SSSE3, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__ssse3); - } - } - } - - TEST(QU8_RNDNA__SSSE3, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__ssse3); - } - } - } - - TEST(QU8_RNDNA__SSSE3, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__ssse3); - } - } - } - - TEST(QU8_RNDNA__SSSE3, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_X86_SSSE3; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__ssse3); - } - } - } - - TEST(QU8_RNDNA__SSSE3, special_cases) { - TEST_REQUIRES_X86_SSSE3; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__ssse3); - } - - TEST(QU8_RNDNA__SSSE3, random_cases) { - TEST_REQUIRES_X86_SSSE3; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__ssse3); - } - - - /* - * Round-to-nearest, ties away from zero, SSE4.1 implementation using static blend instruction. 
- */ - - TEST(QU8_RNDNA__SSE41, exact_divide_by_po2) { - TEST_REQUIRES_X86_SSE41; - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__sse41); - } - } - - TEST(QU8_RNDNA__SSE41, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__sse41); - } - } - } - - TEST(QU8_RNDNA__SSE41, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__sse41); - } - } - } - - TEST(QU8_RNDNA__SSE41, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__sse41); - } - } - } - - TEST(QU8_RNDNA__SSE41, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_X86_SSE41; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__sse41); - } - } - } - - TEST(QU8_RNDNA__SSE41, special_cases) { - TEST_REQUIRES_X86_SSE41; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__sse41); - } - - TEST(QU8_RNDNA__SSE41, random_cases) { - TEST_REQUIRES_X86_SSE41; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__sse41); - } - - /* * FP32-based x86 SSE2 implementation. */ @@ -814,96 +303,6 @@ TEST(QU8_GEMMLOWP__SCALAR, random_cases) { #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - /* - * Round-to-nearest, ties away from zero, ARM NEON implementation. 
- */ - - TEST(QU8_RNDNA__NEON, exact_divide_by_po2) { - TEST_REQUIRES_ARM_NEON; - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__neon); - } - } - - TEST(QU8_RNDNA__NEON, exact_divide_by_po2_with_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = 1; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestExactDivideByPO2(xnn_qu8_requantize_rndna__neon); - } - } - } - - TEST(QU8_RNDNA__NEON, divide_by_po2_with_rounding_up) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_rndna__neon); - } - } - } - - TEST(QU8_RNDNA__NEON, divide_by_po2_with_rounding_down) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_rndna__neon); - } - } - } - - TEST(QU8_RNDNA__NEON, divide_by_po2_with_rounding_away) { - TEST_REQUIRES_ARM_NEON; - for (int32_t zero_point = 0; zero_point < 256; zero_point++) { - for (uint32_t s = 1; s < 32; s++) { - RequantizationTester() - .zero_point(zero_point) - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .s(s) - .TestDivideByPO2WithRoundingTiesAway(xnn_qu8_requantize_rndna__neon); - } - } - } - - TEST(QU8_RNDNA__NEON, special_cases) { - TEST_REQUIRES_ARM_NEON; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .TestSpecialCases(xnn_qu8_requantize_rndna__neon); - } - - TEST(QU8_RNDNA__NEON, random_cases) { - TEST_REQUIRES_ARM_NEON; - RequantizationTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) - .zero_point(128) - .iterations(100) - .TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantize_rndna__neon); - } - - /* * FP32-based ARM NEON implementation. 
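The blocks deleted above are the QU8_RNDNA__* requantization tests (scalar unsigned 32-bit, unsigned 64-bit, signed 64-bit, SSE2, SSSE3, SSE4.1, and NEON variants). The rounding behaviour they exercised, for example via TestDivideByPO2WithRoundingTiesAway, is division by a power of two rounded to nearest with ties away from zero. A minimal sketch of that arithmetic, for reference only and not XNNPACK's implementation:

  #include <cstdint>

  // Divide x by 2^s (1 <= s <= 31), rounding to nearest with ties away from zero --
  // the "rndna" behaviour the removed QU8_RNDNA__* tests checked.
  int32_t divide_by_po2_rndna(int32_t x, uint32_t s) {
    const uint64_t abs_x = (uint64_t) (x >= 0 ? (int64_t) x : -(int64_t) x);
    const uint64_t rounded = (abs_x + (UINT64_C(1) << (s - 1))) >> s;
    return x >= 0 ? (int32_t) rounded : (int32_t) -(int64_t) rounded;
  }
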
*/ diff --git a/test/qu8-vadd-minmax.cc b/test/qu8-vadd-minmax.cc index a720b913857..038adf26f39 100644 --- a/test/qu8-vadd-minmax.cc +++ b/test/qu8-vadd-minmax.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qu8-vadd/qu8-vadd-minmax.h" +#include "qu8-vadd/qu8-vadd-minmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vaddc-minmax.cc b/test/qu8-vaddc-minmax.cc index 64da7bb810b..9c856481986 100644 --- a/test/qu8-vaddc-minmax.cc +++ b/test/qu8-vaddc-minmax.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qu8-vaddc/qu8-vaddc-minmax.h" +#include "qu8-vaddc/qu8-vaddc-minmax.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vcvt.cc b/test/qu8-vcvt.cc index d8e60d7252c..b0e061a70b4 100644 --- a/test/qu8-vcvt.cc +++ b/test/qu8-vcvt.cc @@ -22,5 +22,5 @@ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/qu8-vcvt/qu8-vcvt.h" +#include "qu8-vcvt/qu8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vhswish.cc b/test/qu8-vhswish.cc index 43607f2b51f..e6e76ed7dcc 100644 --- a/test/qu8-vhswish.cc +++ b/test/qu8-vhswish.cc @@ -1,1823 +1,89 @@ -// Copyright 2023 Google LLC +// Copyright 2019 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. // // Auto-generated file. Do not edit! 
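The test/qu8-vhswish.cc change, whose header appears just above, evidently converts the remaining hand-written per-ISA TEST blocks (NEON, SSE2, SSSE3, SSE4.1, AVX, and so on) to the same macro-generated scheme used for qs8-vhswish earlier in this change: the new header comments name tools/generate-vunary-test.py instead of the deleted test/qu8-vhswish.yaml specification, presumably expanding entries from a qu8-vhswish header as in the qs8 case. The quantity under test is unchanged: hswish(x) = x * min(max(x + 3, 0), 6) / 6, evaluated after dequantizing the input and then requantized with the output scale and zero point. A sketch of that reference math only, with illustrative rounding and clamping conventions, not the tester's actual code:

  #include <algorithm>
  #include <cmath>
  #include <cstdint>

  // Reference quantized hswish that a QU8 kernel is expected to approximate.
  // Rounding and saturation details here are assumptions for illustration.
  uint8_t quantized_hswish_reference(uint8_t q_in,
                                     float input_scale, int16_t input_zero_point,
                                     float output_scale, int16_t output_zero_point) {
    const float x = input_scale * (float) ((int32_t) q_in - input_zero_point);
    const float y = x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
    const long q = std::lrintf(y / output_scale) + output_zero_point;
    return (uint8_t) std::min<long>(std::max<long>(q, 0), 255);
  }
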
-// Specification: test/qu8-vhswish.yaml -// Generator: tools/generate-vhswish-test.py +// Microkernel: qu8-vhswish +// Generator: tools/generate-vunary-test.py -#include +#include +#include +#include +#include +#include #include +#include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/isa-checks.h" #include "xnnpack/microparams-init.h" -#include "xnnpack/vhswish.h" +#include "xnnpack/microparams.h" +#include "xnnpack/vunary.h" +#include "next_prime.h" #include "vhswish-microkernel-tester.h" - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_VHSWISH__NEON_U8, batch_eq_8) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(8) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__NEON_U8, batch_div_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U8, batch_lt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U8, batch_gt_8) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U8, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U8, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U8, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U8, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__neon_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM 
|| XNN_ARCH_ARM64 - TEST(QU8_VHSWISH__NEON_U16, batch_eq_16) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__NEON_U16, batch_div_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U16, batch_lt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U16, batch_gt_16) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U16, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U16, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U16, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U16, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__neon_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_VHSWISH__NEON_U32, batch_eq_32) { - TEST_REQUIRES_ARM_NEON; - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__NEON_U32, batch_div_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - 
.output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U32, batch_lt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U32, batch_gt_32) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__NEON_U32, input_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U32, output_scale) { - TEST_REQUIRES_ARM_NEON; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U32, input_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__NEON_U32, output_zero_point) { - TEST_REQUIRES_ARM_NEON; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__neon_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSE2_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSE2; - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSE2_U16, batch_div_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U16, batch_gt_16) { - 
TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U16, input_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U16, output_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U16, input_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U16, output_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__sse2_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSE2_U32, batch_eq_32) { - TEST_REQUIRES_X86_SSE2; - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSE2_U32, batch_div_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE2_U32, input_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - 
.batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U32, output_scale) { - TEST_REQUIRES_X86_SSE2; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U32, input_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE2_U32, output_zero_point) { - TEST_REQUIRES_X86_SSE2; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__sse2_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSSE3_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSSE3; - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSSE3_U16, batch_div_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, batch_gt_16) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, input_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, output_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - 
.input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, input_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U16, output_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSSE3_U32, batch_eq_32) { - TEST_REQUIRES_X86_SSSE3; - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSSE3_U32, batch_div_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, input_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, output_scale) { - TEST_REQUIRES_X86_SSSE3; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, input_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - 
.output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSSE3_U32, output_zero_point) { - TEST_REQUIRES_X86_SSSE3; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__ssse3_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSE41_U8, batch_eq_8) { - TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(8) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSE41_U8, batch_div_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U8, batch_lt_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U8, batch_gt_8) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U8, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U8, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U8, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U8, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__sse41_u8, 
xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSE41_U16, batch_eq_16) { - TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSE41_U16, batch_div_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U16, batch_lt_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U16, batch_gt_16) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U16, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U16, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U16, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U16, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__sse41_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__SSE41_U32, batch_eq_32) { - TEST_REQUIRES_X86_SSE41; - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__SSE41_U32, batch_div_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 64; 
batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U32, batch_lt_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U32, batch_gt_32) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__SSE41_U32, input_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U32, output_scale) { - TEST_REQUIRES_X86_SSE41; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U32, input_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__SSE41_U32, output_zero_point) { - TEST_REQUIRES_X86_SSE41; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__sse41_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__AVX_U8, batch_eq_8) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(8) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__AVX_U8, batch_div_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U8, batch_lt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, 
xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U8, batch_gt_8) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U8, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U8, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U8, input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U8, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__avx_u8, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__AVX_U16, batch_eq_16) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__AVX_U16, batch_div_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U16, batch_lt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U16, batch_gt_16) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U16, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) 
{ - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U16, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U16, input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U16, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__avx_u16, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_VHSWISH__AVX_U32, batch_eq_32) { - TEST_REQUIRES_X86_AVX; - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - - TEST(QU8_VHSWISH__AVX_U32, batch_div_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U32, batch_lt_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U32, batch_gt_32) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - - TEST(QU8_VHSWISH__AVX_U32, input_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U32, output_scale) { - TEST_REQUIRES_X86_AVX; - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - 
.output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U32, input_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } - - TEST(QU8_VHSWISH__AVX_U32, output_zero_point) { - TEST_REQUIRES_X86_AVX; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__avx_u32, xnn_init_qu8_hswish_sse2_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_VHSWISH__WASMSIMD_U8, batch_eq_8) { - VHSwishMicrokernelTester() - .batch_size(8) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, batch_div_8) { - for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, batch_lt_8) { - for (size_t batch_size = 1; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, batch_gt_8) { - for (size_t batch_size = 9; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, input_scale) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, output_scale) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U8, output_zero_point) { - for (int16_t output_zero_point = 
2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u8, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_VHSWISH__WASMSIMD_U16, batch_eq_16) { - VHSwishMicrokernelTester() - .batch_size(16) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, batch_div_16) { - for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, batch_lt_16) { - for (size_t batch_size = 1; batch_size < 16; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, batch_gt_16) { - for (size_t batch_size = 17; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, input_scale) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, output_scale) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U16, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u16, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_VHSWISH__WASMSIMD_U32, batch_eq_32) { - VHSwishMicrokernelTester() - .batch_size(32) - .input_zero_point(150) - .output_zero_point(100) - 
.Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, batch_div_32) { - for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, batch_lt_32) { - for (size_t batch_size = 1; batch_size < 32; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, batch_gt_32) { - for (size_t batch_size = 33; batch_size < 64; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, input_scale) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, output_scale) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } - - TEST(QU8_VHSWISH__WASMSIMD_U32, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__wasmsimd_u32, xnn_init_qu8_hswish_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -TEST(QU8_VHSWISH__SCALAR_U1, batch_eq_1) { - VHSwishMicrokernelTester() - .batch_size(1) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); -} - -TEST(QU8_VHSWISH__SCALAR_U1, batch_gt_1) { - for (size_t batch_size = 2; batch_size < 10; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U1, input_scale) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - 
.output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U1, output_scale) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U1, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U1, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 5; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__scalar_u1, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, batch_eq_2) { - VHSwishMicrokernelTester() - .batch_size(2) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); -} - -TEST(QU8_VHSWISH__SCALAR_U2, batch_div_2) { - for (size_t batch_size = 4; batch_size < 20; batch_size += 2) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, batch_lt_2) { - for (size_t batch_size = 1; batch_size < 2; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, batch_gt_2) { - for (size_t batch_size = 3; batch_size < 4; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, input_scale) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, output_scale) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) 
- .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U2, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 10; batch_size += 1) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__scalar_u2, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, batch_eq_4) { - VHSwishMicrokernelTester() - .batch_size(4) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); -} - -TEST(QU8_VHSWISH__SCALAR_U4, batch_div_4) { - for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, batch_lt_4) { - for (size_t batch_size = 1; batch_size < 4; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, batch_gt_4) { - for (size_t batch_size = 5; batch_size < 8; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, input_scale) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, output_scale) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - .input_zero_point(150) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, input_zero_point) { - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - .output_zero_point(100) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } - } -} - -TEST(QU8_VHSWISH__SCALAR_U4, output_zero_point) { - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(xnn_qu8_vhswish_ukernel__scalar_u4, xnn_init_qu8_hswish_scalar_params); - } - } -} \ No newline at end of file +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ + \ +XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ 
+XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ + \ +XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ +TEST(ukernel, input_scale) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + for (float input_scale : {4.0f, 16.0f, 64.0f}) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .input_scale(input_scale) \ + .input_zero_point(150) \ + .output_zero_point(100) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, output_scale) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + for (float output_scale : {4.0f, 16.0f, 64.0f}) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .output_scale(output_scale) \ + .input_zero_point(150) \ + .output_zero_point(100) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, input_zero_point) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .input_zero_point(input_zero_point) \ + .output_zero_point(100) \ + .Test(ukernel, init_params); \ + } \ + } \ +} \ + \ +TEST(ukernel, output_zero_point) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { \ + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .input_zero_point(150) \ + .output_zero_point(output_zero_point) \ + .Test(ukernel, init_params); \ + } \ + } \ +} +#include "qu8-vhswish/qu8-vhswish.h" +#undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vhswish.yaml b/test/qu8-vhswish.yaml deleted file mode 100644 index e348b3e5488..00000000000 --- a/test/qu8-vhswish.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
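Note on the registration change above: instead of the YAML list being deleted here (the remainder of that list continues below), each micro-kernel is now declared once in a shared header, and the XNN_UKERNEL_WITH_PARAMS macro defined in test/qu8-vhswish.cc expands every declaration into a full set of TEST cases. A minimal sketch of what one such declaration might look like; the argument order follows the macro's parameter list above, but the exact entries and arch-flag values in the real src/qu8-vhswish/qu8-vhswish.h may differ:

// Hypothetical entry in src/qu8-vhswish/qu8-vhswish.h (illustrative only).
// Parameters: arch_flags, ukernel, batch_tile, vector_tile, datatype,
//             params_type, init_params.
XNN_UKERNEL_WITH_PARAMS(/*arch_flags=*/0, xnn_qu8_vhswish_ukernel__scalar_u1,
                        /*batch_tile=*/1, /*vector_tile=*/1, uint8_t,
                        union xnn_qu8_hswish_params,
                        xnn_init_qu8_hswish_scalar_params)

Including the header after defining the macro stamps out the batch_eq/batch_div/batch_lt/batch_gt, inplace, input_scale, output_scale, input_zero_point and output_zero_point tests for each kernel, replacing the hand-expanded tests removed above.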
- -# ARM NEON -- name: xnn_qu8_vhswish_ukernel__neon_u8 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__neon_u16 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__neon_u32 - init: xnn_init_qu8_hswish_scalar_params - -# x86 SSE2 -- name: xnn_qu8_vhswish_ukernel__sse2_u16 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__sse2_u32 - init: xnn_init_qu8_hswish_sse2_params - -# x86 SSSE3 -- name: xnn_qu8_vhswish_ukernel__ssse3_u16 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__ssse3_u32 - init: xnn_init_qu8_hswish_sse2_params - -# x86 SSE4.1 -- name: xnn_qu8_vhswish_ukernel__sse41_u8 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__sse41_u16 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__sse41_u32 - init: xnn_init_qu8_hswish_sse2_params - -# x86 AVX -- name: xnn_qu8_vhswish_ukernel__avx_u8 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__avx_u16 - init: xnn_init_qu8_hswish_sse2_params -- name: xnn_qu8_vhswish_ukernel__avx_u32 - init: xnn_init_qu8_hswish_sse2_params - -# WAsm Relaxed SIMD -- name: xnn_qu8_vhswish_ukernel__wasmsimd_u8 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__wasmsimd_u16 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__wasmsimd_u32 - init: xnn_init_qu8_hswish_scalar_params - -# Scalar -- name: xnn_qu8_vhswish_ukernel__scalar_u1 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__scalar_u2 - init: xnn_init_qu8_hswish_scalar_params -- name: xnn_qu8_vhswish_ukernel__scalar_u4 - init: xnn_init_qu8_hswish_scalar_params diff --git a/test/qu8-vlrelu.cc b/test/qu8-vlrelu.cc index 7effffac006..39b9947ff4c 100644 --- a/test/qu8-vlrelu.cc +++ b/test/qu8-vlrelu.cc @@ -20,7 +20,7 @@ #include "xnnpack/isa-checks.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" -#include "xnnpack/vlrelu.h" +#include "xnnpack/vunary.h" #include "next_prime.h" #include "vlrelu-microkernel-tester.h" @@ -55,5 +55,5 @@ TEST(ukernel, negative_scale) { } \ } \ } -#include "src/qu8-vlrelu/qu8-vlrelu.h" +#include "qu8-vlrelu/qu8-vlrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vmul-minmax-fp32.cc b/test/qu8-vmul-minmax-fp32.cc index f6bdba82f4a..64f6b498c0f 100644 --- a/test/qu8-vmul-minmax-fp32.cc +++ b/test/qu8-vmul-minmax-fp32.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qu8-vmul/qu8-vmul-minmax-fp32.h" +#include "qu8-vmul/qu8-vmul-minmax-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vmul-minmax-rndnu.cc b/test/qu8-vmul-minmax-rndnu.cc index 823741779f1..2447f07af5c 100644 --- a/test/qu8-vmul-minmax-rndnu.cc +++ b/test/qu8-vmul-minmax-rndnu.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, false, datatype, ukerne \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, false, datatype, ukernel, init_params); -#include "src/qu8-vmul/qu8-vmul-minmax-rndnu.h" +#include "qu8-vmul/qu8-vmul-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vmulc-minmax-fp32.cc b/test/qu8-vmulc-minmax-fp32.cc index 
daa8984b69d..efa1d85ef47 100644 --- a/test/qu8-vmulc-minmax-fp32.cc +++ b/test/qu8-vmulc-minmax-fp32.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qu8-vmulc/qu8-vmulc-minmax-fp32.h" +#include "qu8-vmulc/qu8-vmulc-minmax-fp32.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vmulc-minmax-rndnu.cc b/test/qu8-vmulc-minmax-rndnu.cc index b9fab2e8e4f..1e203ea74b7 100644 --- a/test/qu8-vmulc-minmax-rndnu.cc +++ b/test/qu8-vmulc-minmax-rndnu.cc @@ -31,5 +31,5 @@ XNN_TEST_BINARY_Y_SCALE(ukernel, arch_flags, batch_tile, true, datatype, ukernel \ XNN_TEST_BINARY_QMIN(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); \ XNN_TEST_BINARY_QMAX(ukernel, arch_flags, batch_tile, true, datatype, ukernel, init_params); -#include "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h" +#include "qu8-vmulc/qu8-vmulc-minmax-rndnu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/raddstoreexpminusmax-microkernel-tester.h b/test/raddstoreexpminusmax-microkernel-tester.h index 1d020a2cf17..0dd99e56a5c 100644 --- a/test/raddstoreexpminusmax-microkernel-tester.h +++ b/test/raddstoreexpminusmax-microkernel-tester.h @@ -17,9 +17,10 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class RAddStoreExpMinusMaxMicrokernelTester { diff --git a/test/rdsum-microkernel-tester.h b/test/rdsum-microkernel-tester.h index f8315b7fde5..e3742ead92f 100644 --- a/test/rdsum-microkernel-tester.h +++ b/test/rdsum-microkernel-tester.h @@ -17,10 +17,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/requantization.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class RDSumMicrokernelTester { diff --git a/test/reciprocal-square-root.cc b/test/reciprocal-square-root.cc index 8d8ef3cbf15..bebfdd1c85a 100644 --- a/test/reciprocal-square-root.cc +++ b/test/reciprocal-square-root.cc @@ -11,6 +11,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/requantization-tester.h b/test/requantization-tester.h index 80f24ddcbc3..a24857ab21b 100644 --- a/test/requantization-tester.h +++ b/test/requantization-tester.h @@ -520,49 +520,7 @@ class RequantizationTester { } } - void TestRandomCasesRoundToNearestTiesAway(xnn_qu8_requantization_fn requantize) { - ASSERT_GE(zero_point(), std::numeric_limits::min()); - ASSERT_LE(zero_point(), std::numeric_limits::max()); - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmin(), std::numeric_limits::max()); - ASSERT_GE(qmax(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - - xnnpack::ReplicableRandomDevice rng; - for (size_t iteration = 0; iteration < iterations(); iteration++) { - auto u8rng = - std::bind(std::uniform_int_distribution(0, std::numeric_limits::max()), std::ref(rng)); - - xnnpack::Buffer inputs(4096); - xnnpack::Buffer outputs(inputs.size()); - - std::uniform_real_distribution scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f); - 
const float scale = scale_distribution(rng); - for (size_t i = 0; i < inputs.size(); i++) { - const uint8_t approximate_output = std::min(std::max(uint8_t(u8rng()), uint8_t(qmin())), uint8_t(qmax())); - const int32_t input = int32_t(double(approximate_output) / double(scale)); - inputs[i] = input; - } - - requantize( - inputs.size(), inputs.data(), scale, zero_point(), qmin(), qmax(), - outputs.data()); - - /* Ensure that outputs are not all identical, as in this case the test doesn't validate much */ - ASSERT_NE( - *std::max_element(outputs.cbegin(), outputs.cend()), - *std::min_element(outputs.cbegin(), outputs.cend())); - - for (size_t i = 0; i < inputs.size(); i++) { - const uint8_t reference_output = xnn_qu8_requantize_rndna( - inputs[i], scale, zero_point(), qmin(), qmax()); - ASSERT_EQ(uint32_t(reference_output), uint32_t(outputs[i])); - } - } - } - - void TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantization_fn requantize) { + void TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantization_fn requantize) { ASSERT_GE(zero_point(), std::numeric_limits::min()); ASSERT_LE(zero_point(), std::numeric_limits::max()); ASSERT_GE(qmin(), std::numeric_limits::min()); @@ -597,34 +555,34 @@ class RequantizationTester { *std::min_element(outputs.cbegin(), outputs.cend())); for (size_t i = 0; i < inputs.size(); i++) { - const int8_t reference_output = xnn_qs8_requantize_rndna( + const int8_t reference_output = xnn_qs8_requantize_rndnu( inputs[i], scale, zero_point(), qmin(), qmax()); ASSERT_EQ(int32_t(reference_output), int32_t(outputs[i])); } } } - void TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantization_fn requantize) { - ASSERT_GE(zero_point(), std::numeric_limits::min()); - ASSERT_LE(zero_point(), std::numeric_limits::max()); - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmin(), std::numeric_limits::max()); - ASSERT_GE(qmax(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); + void TestRandomCasesRoundToNearestTiesUp(xnn_qu8_requantization_fn requantize) { + ASSERT_GE(zero_point(), std::numeric_limits::min()); + ASSERT_LE(zero_point(), std::numeric_limits::max()); + ASSERT_GE(qmin(), std::numeric_limits::min()); + ASSERT_LE(qmin(), std::numeric_limits::max()); + ASSERT_GE(qmax(), std::numeric_limits::min()); + ASSERT_LE(qmax(), std::numeric_limits::max()); ASSERT_LT(qmin(), qmax()); xnnpack::ReplicableRandomDevice rng; for (size_t iteration = 0; iteration < iterations(); iteration++) { - auto i8rng = std::bind( - std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), std::ref(rng)); + auto u8rng = std::bind( + std::uniform_int_distribution(std::numeric_limits::min(), std::numeric_limits::max()), std::ref(rng)); xnnpack::Buffer inputs(4096); - xnnpack::Buffer outputs(inputs.size()); + xnnpack::Buffer outputs(inputs.size()); std::uniform_real_distribution scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f); const float scale = scale_distribution(rng); for (size_t i = 0; i < inputs.size(); i++) { - const int8_t approximate_output = std::min(std::max(int8_t(i8rng()), int8_t(qmin())), int8_t(qmax())); + const uint8_t approximate_output = std::min(std::max(uint8_t(u8rng()), uint8_t(qmin())), uint8_t(qmax())); const int32_t input = int32_t(double(approximate_output) / double(scale)); inputs[i] = input; } @@ -639,7 +597,7 @@ class RequantizationTester { *std::min_element(outputs.cbegin(), outputs.cend())); for (size_t i = 0; i < inputs.size(); i++) { - const int8_t reference_output = xnn_qs8_requantize_rndnu( + 
const uint8_t reference_output = xnn_qu8_requantize_rndnu( inputs[i], scale, zero_point(), qmin(), qmax()); ASSERT_EQ(int32_t(reference_output), int32_t(outputs[i])); } diff --git a/test/rope-operator-tester.h b/test/rope-operator-tester.h index e887fec4892..33194b17d63 100644 --- a/test/rope-operator-tester.h +++ b/test/rope-operator-tester.h @@ -133,7 +133,7 @@ class RoPEOperatorTester { xnn_operator_t rope_op = nullptr; const xnn_status status = xnn_create_rope_nthc_f16( - /*max_tokens=*/tokens(), /*flags=*/0, &rope_op); + /*flags=*/0, &rope_op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } @@ -237,7 +237,7 @@ class RoPEOperatorTester { xnn_operator_t rope_op = nullptr; const xnn_status status = xnn_create_rope_nthc_f32( - /*max_tokens=*/tokens(), /*flags=*/0, &rope_op); + /*flags=*/0, &rope_op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } diff --git a/test/rope.cc b/test/rope.cc index e377d59562e..54ce6a1bcdc 100644 --- a/test/rope.cc +++ b/test/rope.cc @@ -98,7 +98,6 @@ TEST_F(RoPETestF16, define) const struct xnn_node* node = &subgraph->nodes[0]; ASSERT_EQ(node->type, xnn_node_type_rope); ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->params.rope.max_tokens, max_tokens); ASSERT_EQ(node->num_inputs, 2); ASSERT_EQ(node->inputs[0], input_id); ASSERT_EQ(node->inputs[1], weights_id); @@ -143,7 +142,6 @@ TEST_F(RoPETestF32, define) const struct xnn_node* node = &subgraph->nodes[0]; ASSERT_EQ(node->type, xnn_node_type_rope); ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->params.rope.max_tokens, max_tokens); ASSERT_EQ(node->num_inputs, 2); ASSERT_EQ(node->inputs[0], input_id); ASSERT_EQ(node->inputs[1], weights_id); @@ -161,7 +159,7 @@ TEST_F(RoPETestF16, matches_operator_api) std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); std::generate(weights.begin(), weights.end(), [&]() { return f32dist(rng); }); - const xnn_status status = xnn_create_rope_nthc_f16(max_tokens, /*flags=*/0, &op); + const xnn_status status = xnn_create_rope_nthc_f16(/*flags=*/0, &op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } @@ -239,7 +237,7 @@ TEST_F(RoPETestF32, matches_operator_api) std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); std::generate(weights.begin(), weights.end(), [&]() { return f32dist(rng); }); - const xnn_status status = xnn_create_rope_nthc_f32(max_tokens, /*flags=*/0, &op); + const xnn_status status = xnn_create_rope_nthc_f32(/*flags=*/0, &op); if (status == xnn_status_unsupported_hardware) { GTEST_SKIP(); } diff --git a/test/rsum-microkernel-tester.h b/test/rsum-microkernel-tester.h index 3bf2df6a0e8..e7884719b54 100644 --- a/test/rsum-microkernel-tester.h +++ b/test/rsum-microkernel-tester.h @@ -18,10 +18,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/requantization.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class RSumMicrokernelTester { diff --git a/test/s32-f32-vcvt.cc b/test/s32-f32-vcvt.cc index 1bf048248a2..26f3916b37e 100644 --- a/test/s32-f32-vcvt.cc +++ b/test/s32-f32-vcvt.cc @@ -19,5 +19,5 @@ XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_ou XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, 
init_params); \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/s32-f32-vcvt/s32-f32-vcvt.h" +#include "s32-f32-vcvt/s32-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/s32-vmul.cc b/test/s32-vmul.cc index 0d4eeb7ab50..366aa3ebca4 100644 --- a/test/s32-vmul.cc +++ b/test/s32-vmul.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, false, datatype, ukern XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, false, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/s32-vmul/s32-vmul.h" +#include "s32-vmul/s32-vmul.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/s32-vmulc.cc b/test/s32-vmulc.cc index 35dd5ca65d7..6d5756f0f43 100644 --- a/test/s32-vmulc.cc +++ b/test/s32-vmulc.cc @@ -21,5 +21,5 @@ XNN_TEST_BINARY_BATCH_GT(ukernel, arch_flags, batch_tile, true, datatype, ukerne XNN_TEST_BINARY_INPLACE_A(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); \ XNN_TEST_BINARY_INPLACE_A_AND_B(ukernel, arch_flags, batch_tile, true, datatype, ukernel, VBinaryMicrokernelTester::OpType::Mul, init_params); -#include "src/s32-vmul/s32-vmulc.h" +#include "s32-vmul/s32-vmulc.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/s8-vclamp.cc b/test/s8-vclamp.cc index 153893d3a2c..ee0c42e78e3 100644 --- a/test/s8-vclamp.cc +++ b/test/s8-vclamp.cc @@ -34,5 +34,5 @@ XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/s8-vclamp/s8-vclamp.h" +#include "s8-vclamp/s8-vclamp.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/scaled-dot-product-attention.cc b/test/scaled-dot-product-attention.cc index e335bbadfe9..d5270f9c28d 100644 --- a/test/scaled-dot-product-attention.cc +++ b/test/scaled-dot-product-attention.cc @@ -20,6 +20,7 @@ #include "xnnpack.h" #include "xnnpack/aligned-allocator.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/subgraph.h" #include "replicable_random_device.h" diff --git a/test/sigmoid.cc b/test/sigmoid.cc index 377f73812bb..b8ffe05bc24 100644 --- a/test/sigmoid.cc +++ b/test/sigmoid.cc @@ -11,6 +11,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/softmax.cc b/test/softmax.cc index b8e80605a8f..d059d9c9787 100644 --- a/test/softmax.cc +++ b/test/softmax.cc @@ -13,6 +13,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/space-to-depth-2d.cc b/test/space-to-depth-2d.cc index f3b7b47af00..7e5d77f2efb 100644 --- a/test/space-to-depth-2d.cc +++ 
b/test/space-to-depth-2d.cc @@ -14,6 +14,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/square-root.cc b/test/square-root.cc index e8acac879fe..62ac4780330 100644 --- a/test/square-root.cc +++ b/test/square-root.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/square.cc b/test/square.cc index 431ae4adb09..da2908cb444 100644 --- a/test/square.cc +++ b/test/square.cc @@ -12,6 +12,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/static-expand-dims.cc b/test/static-expand-dims.cc index 3bc8ec7b56c..3d26649cf6c 100644 --- a/test/static-expand-dims.cc +++ b/test/static-expand-dims.cc @@ -16,6 +16,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" @@ -77,10 +78,11 @@ TEST_F(StaticExpandDimsTestInt8, define) nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + CalculateExpectedShape(); output_id = XNN_INVALID_NODE_ID; ASSERT_EQ( xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, zero_point, scale, dims.size(), dims.data(), + subgraph, xnn_datatype_qint8, zero_point, scale, expected_shape.size(), expected_shape.data(), nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); ASSERT_NE(output_id, XNN_INVALID_NODE_ID); @@ -135,10 +137,11 @@ TEST_F(StaticExpandDimsTestInt8, matches_operator_api) nullptr, /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + CalculateExpectedShape(); output_id = XNN_INVALID_NODE_ID; ASSERT_EQ( xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, zero_point, scale, dims.size(), dims.data(), + subgraph, xnn_datatype_qint8, zero_point, scale, expected_shape.size(), expected_shape.data(), nullptr, /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); ASSERT_NE(output_id, XNN_INVALID_NODE_ID); @@ -160,7 +163,6 @@ TEST_F(StaticExpandDimsTestInt8, matches_operator_api) std::vector out_dims(XNN_MAX_TENSOR_DIMS); ASSERT_EQ(xnn_status_success, xnn_get_external_value_shape(runtime, output_id, &num_out_dims, &out_dims[0])); out_dims.resize(num_out_dims); - CalculateExpectedShape(); EXPECT_EQ(expected_shape, out_dims); } @@ -179,10 +181,11 @@ TEST_F(StaticExpandDimsTestF16, define) nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + CalculateExpectedShape(); output_id = XNN_INVALID_NODE_ID; ASSERT_EQ( xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), + subgraph, xnn_datatype_fp16, expected_shape.size(), expected_shape.data(), nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); ASSERT_NE(output_id, XNN_INVALID_NODE_ID); @@ -235,10 +238,11 @@ TEST_F(StaticExpandDimsTestF16, matches_operator_api) nullptr, /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); ASSERT_NE(input_id, XNN_INVALID_NODE_ID); + CalculateExpectedShape(); output_id = XNN_INVALID_NODE_ID; ASSERT_EQ( xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, 
dims.size(), dims.data(), + subgraph, xnn_datatype_fp16, expected_shape.size(), expected_shape.data(), nullptr, /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); ASSERT_NE(output_id, XNN_INVALID_NODE_ID); @@ -260,6 +264,5 @@ TEST_F(StaticExpandDimsTestF16, matches_operator_api) std::vector out_dims(XNN_MAX_TENSOR_DIMS); ASSERT_EQ(xnn_status_success, xnn_get_external_value_shape(runtime, output_id, &num_out_dims, &out_dims[0])); out_dims.resize(num_out_dims); - CalculateExpectedShape(); EXPECT_EQ(expected_shape, out_dims); } diff --git a/test/static-reduce.cc b/test/static-reduce.cc index 181f18c44fe..9dff65376bb 100644 --- a/test/static-reduce.cc +++ b/test/static-reduce.cc @@ -311,7 +311,7 @@ INSTANTIATE_TEST_SUITE_P(ReduceTest, ReduceTest, Values(xnn_reduce_sum, xnn_reduce_mean), Bool())), [](auto p) { return p.param.Name(); }); -TEST_P(ReduceTest, SubgraphDefineWorks) { +TEST_P(ReduceTest, define) { const Param p = GetParam(); ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); @@ -347,7 +347,7 @@ TEST_P(ReduceTest, SubgraphDefineWorks) { ASSERT_EQ(node->flags, p.keep_dims ? XNN_FLAG_KEEP_DIMS : 0); } -TEST_P(ReduceTest, SubgraphAPIResultsMatchesOperatorAPI) { +TEST_P(ReduceTest, matches_operator_api) { const Param p = GetParam(); ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); @@ -463,10 +463,12 @@ TEST_P(ReduceTest, SubgraphAPIResultsMatchesOperatorAPI) { CompareOutputs(p.datatype); } -TEST_P(ReduceTest, ReshapingWorks) { +TEST_P(ReduceTest, reshape) { const Param p = GetParam(); ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); + GenerateRandomInput(p.datatype); + // Call subgraph API. xnn_subgraph_t subgraph = nullptr; ASSERT_EQ(xnn_status_success, xnn_create_subgraph(2, /*flags=*/0, &subgraph)); diff --git a/test/static-reshape.cc b/test/static-reshape.cc index fccc6b63608..9d2df5ed362 100644 --- a/test/static-reshape.cc +++ b/test/static-reshape.cc @@ -16,6 +16,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/static-resize-bilinear-2d.cc b/test/static-resize-bilinear-2d.cc index ef624886892..25ebe70d40b 100644 --- a/test/static-resize-bilinear-2d.cc +++ b/test/static-resize-bilinear-2d.cc @@ -15,10 +15,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" template class StaticResizeBilinear2DTestBase : public ::testing::Test { diff --git a/test/static-slice.cc b/test/static-slice.cc index 2673f3ba433..ff107cecbff 100644 --- a/test/static-slice.cc +++ b/test/static-slice.cc @@ -16,6 +16,7 @@ #include #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/static-transpose.cc b/test/static-transpose.cc index eeaa0a2e797..084e1a9cf3b 100644 --- a/test/static-transpose.cc +++ b/test/static-transpose.cc @@ -15,6 +15,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/subgraph-fp16.cc b/test/subgraph-fp16.cc index b6a2fa4fc1d..6c3e2f575c7 100644 --- a/test/subgraph-fp16.cc +++ b/test/subgraph-fp16.cc @@ -17,6 +17,7 @@ #include #include "xnnpack.h" #include 
"xnnpack/allocation-type.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/subgraph.h" #include "mock-allocator.h" diff --git a/test/subgraph-size.c b/test/subgraph-size.c index 1dcbdb50176..0cffe7d11ed 100644 --- a/test/subgraph-size.c +++ b/test/subgraph-size.c @@ -71,9 +71,6 @@ int main(int argc, char** argv) { case 7: xnn_define_binary(NULL, xnn_binary_add, NULL, 0, 0, 0, 0); break; - case 9: - xnn_define_prelu(NULL, 0, 0, 0, 0); - break; case 10: xnn_define_clamp(NULL, 0.0f, 0.0f, 0, 0, 0); break; diff --git a/test/tanh-operator-tester.h b/test/tanh-operator-tester.h index 017e4602cd0..ec5dc2eaf24 100644 --- a/test/tanh-operator-tester.h +++ b/test/tanh-operator-tester.h @@ -18,6 +18,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "replicable_random_device.h" class TanhOperatorTester { diff --git a/test/tanh.cc b/test/tanh.cc index a8bd67271c0..241f6bdd1b5 100644 --- a/test/tanh.cc +++ b/test/tanh.cc @@ -11,6 +11,7 @@ #include #include "xnnpack.h" +#include "xnnpack/math.h" #include "xnnpack/node-type.h" #include "xnnpack/operator.h" #include "xnnpack/subgraph.h" diff --git a/test/u32-f32-vcvt.cc b/test/u32-f32-vcvt.cc index aa750e94bef..48ffb59b3ca 100644 --- a/test/u32-f32-vcvt.cc +++ b/test/u32-f32-vcvt.cc @@ -19,5 +19,5 @@ XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_ou XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); -#include "src/u32-f32-vcvt/u32-f32-vcvt.h" +#include "u32-f32-vcvt/u32-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/u8-vclamp.cc b/test/u8-vclamp.cc index 4faa4670e1f..ebcec214a40 100644 --- a/test/u8-vclamp.cc +++ b/test/u8-vclamp.cc @@ -34,5 +34,5 @@ XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); -#include "src/u8-vclamp/u8-vclamp.h" +#include "u8-vclamp/u8-vclamp.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/unary-operator-tester.cc b/test/unary-operator-tester.cc index 56c12497b27..83044400c0a 100644 --- a/test/unary-operator-tester.cc +++ b/test/unary-operator-tester.cc @@ -21,6 +21,7 @@ #include #include "xnnpack.h" #include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "replicable_random_device.h" namespace xnnpack { diff --git a/test/vbinary-microkernel-tester.cc b/test/vbinary-microkernel-tester.cc index 0f64451d260..ee3fd1cc7c7 100644 --- a/test/vbinary-microkernel-tester.cc +++ b/test/vbinary-microkernel-tester.cc @@ -21,11 +21,12 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" #include "xnnpack/requantization.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" void VBinaryMicrokernelTester::Test(xnn_f16_vbinary_ukernel_fn vbinary, diff --git a/test/vcmul-microkernel-tester.h b/test/vcmul-microkernel-tester.h index 56824def81f..0dc567139df 100644 --- a/test/vcmul-microkernel-tester.h +++ 
b/test/vcmul-microkernel-tester.h @@ -16,9 +16,10 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/isa-checks.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class VCMulMicrokernelTester { diff --git a/test/vcvt-microkernel-tester.cc b/test/vcvt-microkernel-tester.cc index 50a356313fc..d294192cf78 100644 --- a/test/vcvt-microkernel-tester.cc +++ b/test/vcvt-microkernel-tester.cc @@ -77,10 +77,6 @@ void VCvtMicrokernelTester::Test( void VCvtMicrokernelTester::Test(xnn_f16_qs8_vcvt_ukernel_fn vcvt, xnn_init_f16_qs8_cvt_params_fn init_params) { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); ASSERT_LE(output_zero_point(), std::numeric_limits::max()); @@ -101,8 +97,7 @@ void VCvtMicrokernelTester::Test(xnn_f16_qs8_vcvt_ukernel_fn vcvt, struct xnn_f16_qs8_cvt_params params; - init_params(&params, scale(), - output_zero_point(), qmin(), qmax()); + init_params(&params, scale(), output_zero_point()); // Call optimized micro-kernel. vcvt(batch_size() * sizeof(xnn_float16), input.data(), output.data(), &params); @@ -111,9 +106,11 @@ void VCvtMicrokernelTester::Test(xnn_f16_qs8_vcvt_ukernel_fn vcvt, for (size_t i = 0; i < batch_size(); i++) { float scaled_input = input[i] * scale_fp16; scaled_input = std::min( - scaled_input, static_cast(qmax() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::max() - + output_zero_point())); scaled_input = std::max( - scaled_input, static_cast(qmin() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::min() - + output_zero_point())); output_ref[i] = static_cast( std::lrintf(scaled_input) + static_cast(output_zero_point())); } @@ -134,10 +131,6 @@ void VCvtMicrokernelTester::Test(xnn_f16_qs8_vcvt_ukernel_fn vcvt, void VCvtMicrokernelTester::Test( xnn_f32_qs8_vcvt_ukernel_fn vcvt, xnn_init_f32_qs8_cvt_params_fn init_params) const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); ASSERT_LE(output_zero_point(), std::numeric_limits::max()); @@ -151,7 +144,7 @@ void VCvtMicrokernelTester::Test( std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); struct xnn_f32_qs8_cvt_params params; - init_params(&params, scale(), output_zero_point(), qmin(), qmax()); + init_params(&params, scale(), output_zero_point()); // Call optimized micro-kernel. 
vcvt(batch_size() * sizeof(float), input.data(), output.data(), &params); @@ -160,9 +153,11 @@ void VCvtMicrokernelTester::Test( for (size_t i = 0; i < batch_size(); i++) { float scaled_input = input[i] * scale(); scaled_input = std::min( - scaled_input, static_cast(qmax() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::max() - + output_zero_point())); scaled_input = std::max( - scaled_input, static_cast(qmin() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::min() - + output_zero_point())); output_ref[i] = static_cast( std::lrintf(scaled_input) + static_cast(output_zero_point())); } @@ -181,10 +176,6 @@ void VCvtMicrokernelTester::Test( void VCvtMicrokernelTester::Test( xnn_f32_qu8_vcvt_ukernel_fn vcvt, xnn_init_f32_qu8_cvt_params_fn init_params) const { - ASSERT_GE(qmin(), std::numeric_limits::min()); - ASSERT_LE(qmax(), std::numeric_limits::max()); - ASSERT_LT(qmin(), qmax()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); ASSERT_LE(output_zero_point(), std::numeric_limits::max()); @@ -198,7 +189,7 @@ void VCvtMicrokernelTester::Test( std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); struct xnn_f32_qu8_cvt_params params; - init_params(&params, scale(), output_zero_point(), qmin(), qmax()); + init_params(&params, scale(), output_zero_point()); // Call optimized micro-kernel. vcvt(batch_size() * sizeof(float), input.data(), output.data(), &params); @@ -207,9 +198,11 @@ void VCvtMicrokernelTester::Test( for (size_t i = 0; i < batch_size(); i++) { float scaled_input = input[i] * scale(); scaled_input = std::min( - scaled_input, static_cast(qmax() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::max() - + output_zero_point())); scaled_input = std::max( - scaled_input, static_cast(qmin() - output_zero_point())); + scaled_input, static_cast(std::numeric_limits::min() - + output_zero_point())); output_ref[i] = static_cast( std::lrintf(scaled_input) + static_cast(output_zero_point())); } diff --git a/test/vcvt-microkernel-tester.h b/test/vcvt-microkernel-tester.h index f1f7689b27c..927f4429957 100644 --- a/test/vcvt-microkernel-tester.h +++ b/test/vcvt-microkernel-tester.h @@ -52,20 +52,6 @@ class VCvtMicrokernelTester { int16_t output_zero_point() const { return this->output_zero_point_; } - VCvtMicrokernelTester& qmin(int16_t qmin) { - this->qmin_ = qmin; - return *this; - } - - int16_t qmin() const { return this->qmin_; } - - VCvtMicrokernelTester& qmax(int16_t qmax) { - this->qmax_ = qmax; - return *this; - } - - int16_t qmax() const { return this->qmax_; } - VCvtMicrokernelTester& iterations(size_t iterations) { this->iterations_ = iterations; return *this; @@ -114,8 +100,6 @@ class VCvtMicrokernelTester { float scale_ = 1.75f; int16_t input_zero_point_ = 0; int16_t output_zero_point_ = 5; - int16_t qmin_ = std::numeric_limits::min(); - int16_t qmax_ = std::numeric_limits::max(); size_t batch_size_ = 1; size_t iterations_ = 15; }; @@ -124,8 +108,6 @@ template VCvtMicrokernelTester make_vcvt_tester() { if (std::is_integral::value) { return VCvtMicrokernelTester() - .qmin(std::numeric_limits::min()) - .qmax(std::numeric_limits::max()) .output_zero_point(std::numeric_limits::min() / 2 + std::numeric_limits::max() / 2 + 1); } else { @@ -272,43 +254,3 @@ VCvtMicrokernelTester make_vcvt_tester() { .Test(__VA_ARGS__); \ } \ } - -#define XNN_TEST_CVT_QMIN(ukernel, arch_flags, batch_tile, datatype_in, \ - datatype_out, ...)
\ - TEST(ukernel, qmin) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale * 5; \ - const size_t batch_step = std::max(2, batch_end / 8) - 1; \ - for (int32_t qmin = std::numeric_limits::min(); \ - qmin < std::numeric_limits::max(); qmin += 51) { \ - for (size_t batch_size = 1; batch_size <= batch_end; \ - batch_size += batch_step) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .scale(500) \ - .qmin(qmin) \ - .Test(__VA_ARGS__); \ - } \ - } \ - } - -#define XNN_TEST_CVT_QMAX(ukernel, arch_flags, batch_tile, datatype_in, \ - datatype_out, ...) \ - TEST(ukernel, qmax) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale * 5; \ - const size_t batch_step = std::max(2, batch_end / 8) - 1; \ - for (int32_t qmax = std::numeric_limits::min() + 1; \ - qmax <= std::numeric_limits::max(); qmax += 51) { \ - for (size_t batch_size = 1; batch_size <= batch_end; \ - batch_size += batch_step) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .scale(500) \ - .qmax(qmax) \ - .Test(__VA_ARGS__); \ - } \ - } \ - } diff --git a/test/vhswish-microkernel-tester.h b/test/vhswish-microkernel-tester.h index 9ef61f6a069..d12b3476c08 100644 --- a/test/vhswish-microkernel-tester.h +++ b/test/vhswish-microkernel-tester.h @@ -85,6 +85,13 @@ class VHSwishMicrokernelTester { return this->iterations_; } + VHSwishMicrokernelTester& inplace(bool inplace) { + this->inplace_ = inplace; + return *this; + } + + bool inplace() const { return this->inplace_; } + void Test(xnn_qs8_vhswish_ukernel_fn vhswish, xnn_init_qs8_hswish_params_fn init_params) const { ASSERT_GE(input_zero_point(), std::numeric_limits::min()); ASSERT_LE(input_zero_point(), std::numeric_limits::max()); @@ -100,15 +107,9 @@ class VHSwishMicrokernelTester { xnnpack::Buffer output_ref(batch_size()); for (size_t iteration = 0; iteration < iterations(); iteration++) { std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - for (int i = 0; i < batch_size(); i++) { - input[i] = i; - } union xnn_qs8_hswish_params params; init_params(&params, input_zero_point(), output_zero_point(), input_scale(), output_scale()); - // Call optimized micro-kernel. - vhswish(batch_size() * sizeof(int8_t), input.data(), output.data(), &params); - // Compute reference results const int32_t input_scale_div = (int32_t) lrintf(256.0f * input_scale() / 6.0f); const int32_t scale_ratio = (int32_t) lrintf(256.0f * input_scale() / output_scale()); @@ -125,6 +126,13 @@ class VHSwishMicrokernelTester { output_ref[i] = static_cast(output_value); } + // Call optimized micro-kernel. + vhswish(batch_size() * sizeof(int8_t), input.data(), inplace() ? input.data() : output.data(), &params); + + if (inplace()) { + std::copy_n(input.data(), batch_size(), output.data()); + } + + // Verify results. for (size_t i = 0; i < batch_size(); i++) { EXPECT_EQ(int32_t(output[i]), int32_t(output_ref[i])) @@ -149,15 +157,9 @@ class VHSwishMicrokernelTester { xnnpack::Buffer output_ref(batch_size()); for (size_t iteration = 0; iteration < iterations(); iteration++) { std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - for (int i = 0; i < batch_size(); i++) { - input[i] = i; - } union xnn_qu8_hswish_params params; init_params(&params, input_zero_point(), output_zero_point(), input_scale(), output_scale()); - // Call optimized micro-kernel.
- vhswish(batch_size() * sizeof(uint8_t), input.data(), output.data(), &params); - // Compute reference results const int32_t input_scale_div = (int32_t) lrintf(256.0f * input_scale() / 6.0f); const int32_t scale_ratio = (int32_t) lrintf(256.0f * input_scale() / output_scale()); @@ -174,6 +176,13 @@ class VHSwishMicrokernelTester { output_ref[i] = static_cast(output_value); } + // Call optimized micro-kernel. + vhswish(batch_size() * sizeof(uint8_t), input.data(), inplace() ? input.data() : output.data(), &params); + + if (inplace()) { + std::copy_n(input.data(), batch_size(), output.data()); + } + + // Verify results. for (size_t i = 0; i < batch_size(); i++) { EXPECT_EQ(int32_t(output[i]), int32_t(output_ref[i])) @@ -190,4 +199,68 @@ class VHSwishMicrokernelTester { int16_t output_zero_point_ = 5; size_t batch_size_ = 1; size_t iterations_ = 15; + bool inplace_ = false; }; + +#define XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, \ + ...) \ + TEST(ukernel, batch_eq) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + VHSwishMicrokernelTester() \ + .batch_size(batch_tile * batch_scale) \ + .Test(__VA_ARGS__); \ + } + +#define XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, \ + ...) \ + TEST(ukernel, batch_div) { \ + if (batch_tile == 1) return; \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_step = batch_tile * batch_scale; \ + for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; \ + batch_size += batch_step) { \ + VHSwishMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ + } \ + } + +#define XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, \ + ...) \ + TEST(ukernel, batch_lt) { \ + if (batch_tile == 1) return; \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_end = batch_tile * batch_scale; \ + for (size_t batch_size = 1; batch_size < batch_end; batch_size++) { \ + VHSwishMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ + } \ + } + +#define XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, \ + ...) \ + TEST(ukernel, batch_gt) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_step = batch_tile * batch_scale; \ + const size_t batch_end = batch_tile == 1 ? 10 : 2 * batch_step; \ + for (size_t batch_size = batch_step + 1; batch_size < batch_end; \ + batch_size++) { \ + VHSwishMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ + } \ + } + +#define XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ...) \ + TEST(ukernel, inplace) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_end = batch_tile * batch_scale; \ + const size_t batch_step = std::max(1, batch_tile - 1); \ + for (size_t batch_size = 1; batch_size <= batch_end; \ + batch_size += batch_step) { \ + VHSwishMicrokernelTester() \ + .batch_size(batch_size) \ + .inplace(true) \ + .Test(__VA_ARGS__); \ + } \ + } diff --git a/test/vlog-microkernel-tester.h b/test/vlog-microkernel-tester.h deleted file mode 100644 index 19a10e4393b..00000000000 --- a/test/vlog-microkernel-tester.h +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree.
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -extern "C" XNN_INTERNAL const uint16_t xnn_table_vlog[129]; - -class VLogMicrokernelTester { - public: - VLogMicrokernelTester& batch(size_t batch) { - assert(batch != 0); - this->batch_ = batch; - return *this; - } - - size_t batch() const { - return this->batch_; - } - - VLogMicrokernelTester& input_lshift(uint32_t input_lshift) { - assert(input_lshift < 32); - this->input_lshift_ = input_lshift; - return *this; - } - - uint32_t input_lshift() const { - return this->input_lshift_; - } - - VLogMicrokernelTester& output_scale(uint32_t output_scale) { - this->output_scale_ = output_scale; - return *this; - } - - uint32_t output_scale() const { - return this->output_scale_; - } - - VLogMicrokernelTester& inplace(bool inplace) { - this->inplace_ = inplace; - return *this; - } - - bool inplace() const { - return this->inplace_; - } - - VLogMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_u32_vlog_ukernel_fn vlog) const { - xnnpack::ReplicableRandomDevice rng; - auto i16rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - auto i32rng = std::bind(std::uniform_int_distribution(), std::ref(rng)); - - xnnpack::Buffer x(batch() + XNN_EXTRA_BYTES / sizeof(uint32_t)); - xnnpack::Buffer y(batch() * (inplace() ? sizeof(uint32_t) / sizeof(uint16_t) : 1) + XNN_EXTRA_BYTES / sizeof(uint32_t)); - xnnpack::Buffer y_ref(batch()); - const uint32_t* x_data = inplace() ? reinterpret_cast(y.data()) : x.data(); - - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), std::ref(i32rng)); - std::generate(y.begin(), y.end(), std::ref(i16rng)); - std::generate(y_ref.begin(), y_ref.end(), std::ref(i16rng)); - - // Compute reference results. - for (size_t n = 0; n < batch(); n++) { - const uint32_t x_value = x_data[n]; - const uint32_t scaled = x_value << input_lshift(); - uint32_t log_value = 0; - if (scaled != 0) { - const uint32_t out_scale = output_scale(); - - const int log_scale = 65536; - const int log_scale_log2 = 16; - const int log_coeff = 45426; - const uint32_t log2x = math_clz_nonzero_u32(scaled) ^ 31; // log2 of scaled - assert(log2x < 32); - - // Number of segments in the log lookup table. The table will be log_segments+1 - // in length (with some padding). - const int log_segments_log2 = 7; - - // Part 1 - uint32_t frac = scaled - (UINT32_C(1) << log2x); - - // Shift the fractional part into msb of 16 bits - frac = XNN_UNPREDICTABLE(log2x < log_scale_log2) ? 
- (frac << (log_scale_log2 - log2x)) : - (frac >> (log2x - log_scale_log2)); - - // Part 2 - const uint32_t base_seg = frac >> (log_scale_log2 - log_segments_log2); - const uint32_t seg_unit = (UINT32_C(1) << log_scale_log2) >> log_segments_log2; - - assert(128 == (1 << log_segments_log2)); - assert(base_seg < (1 << log_segments_log2)); - - const uint32_t c0 = xnn_table_vlog[base_seg]; - const uint32_t c1 = xnn_table_vlog[base_seg + 1]; - const uint32_t seg_base = seg_unit * base_seg; - const uint32_t rel_pos = ((c1 - c0) * (frac - seg_base)) >> log_scale_log2; - const uint32_t fraction = frac + c0 + rel_pos; - - const uint32_t log2 = (log2x << log_scale_log2) + fraction; - const uint32_t round = log_scale / 2; - const uint32_t loge = (((uint64_t) log_coeff) * log2 + round) >> log_scale_log2; - - // Finally scale to our output scale - log_value = (out_scale * loge + round) >> log_scale_log2; - } - - const uint32_t vout = math_min_u32(log_value, (uint32_t) INT16_MAX); - y_ref[n] = vout; - } - - // Call optimized micro-kernel. - vlog(batch(), x_data, input_lshift(), output_scale(), y.data()); - - // Verify results. - for (size_t n = 0; n < batch(); n++) { - EXPECT_EQ(y[n], y_ref[n]) - << ", input_lshift " << input_lshift() - << ", output_scale " << output_scale() - << ", batch " << n << " / " << batch(); - } - } - } - - private: - size_t batch_{1}; - uint32_t input_lshift_{4}; - uint32_t output_scale_{16}; - bool inplace_{false}; - size_t iterations_{15}; -}; diff --git a/test/vmulcaddc-microkernel-tester.h b/test/vmulcaddc-microkernel-tester.h index 0594e3ead4d..dd178750fba 100644 --- a/test/vmulcaddc-microkernel-tester.h +++ b/test/vmulcaddc-microkernel-tester.h @@ -16,10 +16,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" #include "xnnpack/pack.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" class VMulCAddCMicrokernelTester { diff --git a/test/vunary-microkernel-tester.cc b/test/vunary-microkernel-tester.cc index 130053b82ee..6e7bd0ed7bd 100644 --- a/test/vunary-microkernel-tester.cc +++ b/test/vunary-microkernel-tester.cc @@ -18,10 +18,11 @@ #include #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" +#include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" #include "replicable_random_device.h" #ifndef M_SQRT1_2 diff --git a/test/x16-packw.cc b/test/x16-packw.cc index cb3f26371fe..9ca158b2be9 100644 --- a/test/x16-packw.cc +++ b/test/x16-packw.cc @@ -32,7 +32,7 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale }, const XnnTestParam xnn_test_params[] = { -#include "src/x16-packw/x16-packw.h" +#include "x16-packw/x16-packw.h" }; #undef XNN_UKERNEL diff --git a/test/x32-packb.cc b/test/x32-packb.cc index 66d8c909cbb..ff90b1d2adb 100644 --- a/test/x32-packb.cc +++ b/test/x32-packb.cc @@ -32,8 +32,8 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, PackBMicrokernelTester::Kernel{ukernel}, arch_flags, channel_tile, channel_subtile, channel_round }, const XnnTestParam xnn_test_params[] = { -#include "src/x32-packb/x32-packb.h" -#include "src/x32-zerob/x32-zerob.h" +#include "x32-packb/x32-packb.h" +#include "x32-zerob/x32-zerob.h" }; #undef XNN_UKERNEL diff --git a/test/x32-packw.cc b/test/x32-packw.cc index d2bc52b2312..3cb4b52cb08 100644 --- a/test/x32-packw.cc +++ 
b/test/x32-packw.cc @@ -32,7 +32,7 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale }, const XnnTestParam xnn_test_params[] = { -#include "src/x32-packw/x32-packw.h" +#include "x32-packw/x32-packw.h" }; #undef XNN_UKERNEL diff --git a/test/x32-packx.cc b/test/x32-packx.cc index 5986e3b1dc8..58deea144aa 100644 --- a/test/x32-packx.cc +++ b/test/x32-packx.cc @@ -32,7 +32,7 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, ukernel, arch_flags, k, mr }, const XnnTestParam xnn_test_params[] = { -#include "src/x32-packx/x32-packx.h" +#include "x32-packx/x32-packx.h" }; #undef XNN_UKERNEL diff --git a/test/x8-lut.cc b/test/x8-lut.cc index 2edfd7537bd..b986d436dd6 100644 --- a/test/x8-lut.cc +++ b/test/x8-lut.cc @@ -1040,7 +1040,7 @@ TEST(X8_LUT__SCALAR_U16, inplace) { #endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) TEST(X8_LUT__AVX512VBMI_VPERMX2B_U64, batch_eq_64) { TEST_REQUIRES_X86_AVX512VBMI; LUTMicrokernelTester() @@ -1084,10 +1084,10 @@ TEST(X8_LUT__SCALAR_U16, inplace) { .Test(xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u64); } } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) TEST(X8_LUT__AVX512VBMI_VPERMX2B_U128, batch_eq_128) { TEST_REQUIRES_X86_AVX512VBMI; LUTMicrokernelTester() @@ -1131,10 +1131,10 @@ TEST(X8_LUT__SCALAR_U16, inplace) { .Test(xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u128); } } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) TEST(X8_LUT__AVX512VBMI_VPERMX2B_U192, batch_eq_192) { TEST_REQUIRES_X86_AVX512VBMI; LUTMicrokernelTester() @@ -1178,10 +1178,10 @@ TEST(X8_LUT__SCALAR_U16, inplace) { .Test(xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u192); } } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) TEST(X8_LUT__AVX512VBMI_VPERMX2B_U256, batch_eq_256) { TEST_REQUIRES_X86_AVX512VBMI; LUTMicrokernelTester() @@ -1225,7 +1225,7 @@ TEST(X8_LUT__SCALAR_U16, inplace) { .Test(xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u256); } } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 +#endif // XNN_ENABLE_AVX512VBMI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD diff --git a/test/x8-packq.cc b/test/x8-packq.cc index 1ae6cd9d4f7..d0df2abcfd8 100644 --- a/test/x8-packq.cc +++ b/test/x8-packq.cc @@ -33,7 +33,7 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, ukernel, arch_flags, unroll }, const XnnTestParam xnn_test_params[] = { -#include "src/x8-packq/x8-packq.h" +#include "x8-packq/x8-packq.h" }; #undef XNN_UKERNEL diff --git a/test/x8-packw.cc b/test/x8-packw.cc index c6bdcc55ab7..12694864f60 100644 --- a/test/x8-packw.cc +++ b/test/x8-packw.cc @@ -33,7 +33,7 @@ std::string GetTestName(const testing::TestParamInfo& info) { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale }, const XnnTestParam xnn_test_params[] = { -#include "src/x8-packw/x8-packw.h" +#include "x8-packw/x8-packw.h" }; #undef XNN_UKERNEL diff --git 
a/test/xN-transpose.cc b/test/xN-transpose.cc index 8cd0fc7ed35..b55d226daac 100644 --- a/test/xN-transpose.cc +++ b/test/xN-transpose.cc @@ -97,12 +97,12 @@ TestParams transpose_ukernels[] = { block_width, block_height) \ {#ukernel, arch_flags, make_ukernel_wrapper(ukernel), \ element_size, block_width, block_height}, -#include "src/x8-transposec/x8-transposec.h" -#include "src/x16-transposec/x16-transposec.h" -#include "src/x24-transposec/x24-transposec.h" -#include "src/x32-transposec/x32-transposec.h" -#include "src/x64-transposec/x64-transposec.h" -#include "src/xx-transposev/xx-transposev.h" +#include "x8-transposec/x8-transposec.h" +#include "x16-transposec/x16-transposec.h" +#include "x24-transposec/x24-transposec.h" +#include "x32-transposec/x32-transposec.h" +#include "x64-transposec/x64-transposec.h" +#include "xx-transposev/xx-transposev.h" }; #undef XNN_TRANSPOSE_UKERNEL @@ -112,7 +112,7 @@ TestParams transposev_ukernels[] = { block_width, block_height) \ {#ukernel, arch_flags, make_ukernel_wrapper(ukernel), \ element_size, block_width, block_height}, -#include "src/xx-transposev/xx-transposev.h" +#include "xx-transposev/xx-transposev.h" }; #undef XNN_TRANSPOSE_UKERNEL diff --git a/test/xx-fill.cc b/test/xx-fill.cc index c22c5c1fcb1..ce141aa2956 100644 --- a/test/xx-fill.cc +++ b/test/xx-fill.cc @@ -128,7 +128,7 @@ struct TestParams { #define XNN_FILL_UKERNEL(arch_flags, ukernel) {#ukernel, arch_flags, ukernel}, TestParams test_params[] = { -#include "src/xx-fill/xx-fill.h" +#include "xx-fill/xx-fill.h" }; #undef XNN_FILL_UKERNEL diff --git a/test/xx-pad.cc b/test/xx-pad.cc index 776bba6419c..33cc22424e4 100644 --- a/test/xx-pad.cc +++ b/test/xx-pad.cc @@ -188,7 +188,7 @@ struct TestParams { #define XNN_PAD_UKERNEL(arch_flags, ukernel, tile_size) \ {#ukernel, arch_flags, ukernel, tile_size}, TestParams test_params[] = { -#include "src/xx-pad/xx-pad.h" +#include "xx-pad/xx-pad.h" }; #undef XNN_PAD_UKERNEL diff --git a/tools/generate-gemm-test.py b/tools/generate-gemm-test.py index 500bd8e1644..e776ddabef0 100755 --- a/tools/generate-gemm-test.py +++ b/tools/generate-gemm-test.py @@ -881,8 +881,8 @@ def main(args): // Generator: {generator} #include -#include "bench/gemm-benchmark.h" -#include "bench/utils.h" +#include "gemm-benchmark.h" +#include "utils.h" #include "xnnpack/common.h" #include "xnnpack/gemm.h" #include "xnnpack/isa-checks.h" diff --git a/tools/generate-prelu-test.py b/tools/generate-prelu-test.py deleted file mode 100755 index 1b901d8c20c..00000000000 --- a/tools/generate-prelu-test.py +++ /dev/null @@ -1,235 +0,0 @@ -#!/usr/bin/env python -# Copyright 2019 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='PReLU microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_(f16|f32)_prelu_ukernel__(.+)_(\d+)x(\d+)", name) - assert match is not None - row_tile = int(match.group(3)) - channel_tile = int(match.group(4)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(2)) - return row_tile, channel_tile, arch, isa - - -PRELU_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - PReLUMicrokernelTester() - .rows(${ROW_TILE}) - .channels(${CHANNEL_TILE}) - .Test(${", ".join(TEST_ARGS)}); -} - -$if CHANNEL_TILE > 1: - TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*10}; channels += ${CHANNEL_TILE}) { - PReLUMicrokernelTester() - .rows(${ROW_TILE}) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) { - PReLUMicrokernelTester() - .rows(${ROW_TILE}) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - -TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) { - PReLUMicrokernelTester() - .rows(${ROW_TILE}) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } -} - -$if ROW_TILE > 1: - TEST(${TEST_NAME}, rows_lt_${ROW_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = 1; rows < ${ROW_TILE}; rows++) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - - TEST(${TEST_NAME}, rows_div_${ROW_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = ${ROW_TILE*2}; rows <= ${ROW_TILE*4}; rows += ${ROW_TILE}) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - -TEST(${TEST_NAME}, rows_gt_${ROW_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = ${ROW_TILE+1}; rows < ${ROW_TILE*2}; rows++) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -TEST(${TEST_NAME}, input_stride) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = 1; rows <= ${ROW_TILE*3}; rows += ${max(1, ROW_TILE-1)}) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(${next_prime(CHANNEL_TILE*5+1)}) - .iterations(1) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - 
-TEST(${TEST_NAME}, output_stride) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = 1; rows <= ${ROW_TILE*3}; rows += ${max(1, ROW_TILE-1)}) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .output_stride(${next_prime(CHANNEL_TILE*5+1)}) - .iterations(1) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -TEST(${TEST_NAME}, inplace) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = 1; rows <= ${ROW_TILE*3}; rows += ${max(1, ROW_TILE-1)}) { - for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) { - PReLUMicrokernelTester() - .rows(rows) - .channels(channels) - .inplace(true) - .iterations(1) - .Test(${", ".join(TEST_ARGS)}); - } - } -} -""" - - -def generate_test_cases(ukernel, row_tile, channel_tile, isa): - """Generates all tests cases for a PRELU micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - row_tile: Number of rows (pixels) processed per one iteration of the outer - loop of the micro-kernel. - channel_tile: Number of channels processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(PRELU_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "ROW_TILE": row_tile, - "CHANNEL_TILE": channel_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/prelu.h" -#include "prelu-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - row_tile, channel_tile, arch, isa = split_ukernel_name(name) - - test_case = generate_test_cases(name, row_tile, channel_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-rdsum-benchmark.py b/tools/generate-rdsum-benchmark.py index 77f572220de..3a81702ded2 100755 --- a/tools/generate-rdsum-benchmark.py +++ b/tools/generate-rdsum-benchmark.py @@ -109,8 +109,8 @@ def main(args): // Specification: {specification} // Generator: {generator} -#include "bench/rsum-benchmark.h" -#include "bench/utils.h" +#include "rsum-benchmark.h" +#include "utils.h" #include #include "xnnpack.h" diff --git a/tools/generate-spmm-test.py b/tools/generate-spmm-test.py index 23f9b3f10ed..9a5f3f14836 100755 --- a/tools/generate-spmm-test.py +++ b/tools/generate-spmm-test.py @@ -486,8 +486,8 @@ def main(args): // Generator: {generator} #include -#include "bench/spmm-benchmark.h" -#include "bench/utils.h" +#include "spmm-benchmark.h" +#include "utils.h" #include "xnnpack/gemm.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" diff --git a/tools/generate-vhswish-test.py b/tools/generate-vhswish-test.py deleted file mode 100755 index 4c9310c6bb8..00000000000 --- a/tools/generate-vhswish-test.py +++ /dev/null @@ -1,227 +0,0 @@ -#!/usr/bin/env python -# Copyright 2023 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -import xngen -import xnncommon - - -parser = argparse.ArgumentParser( - description='Vector Hardswish operation microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_(qs8|qu8)_vhswish_ukernel__(.+)_u(\d+)(v)?", name) - if match is None: - raise ValueError("Unexpected microkernel name: " + name) - - datatype = match.group(1) - batch_tile = int(match.group(3)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(2)) - return datatype, batch_tile, arch, isa - - -HSWISH_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, batch_eq_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - VHSwishMicrokernelTester() - .batch_size(${BATCH_TILE}) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); -} - -$if BATCH_TILE > 1: - TEST(${TEST_NAME}, batch_div_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch_size = ${BATCH_TILE*2}; batch_size < ${BATCH_TILE*10}; batch_size += ${BATCH_TILE}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, batch_lt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch_size = 1; batch_size < ${BATCH_TILE}; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } - -TEST(${TEST_NAME}, batch_gt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch_size = ${BATCH_TILE+1}; batch_size < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch_size++) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } -} - -TEST(${TEST_NAME}, input_scale) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch_size = 1; batch_size <= ${BATCH_TILE*5}; batch_size += ${max(1, BATCH_TILE-1)}) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -TEST(${TEST_NAME}, output_scale) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch_size = 1; batch_size <= ${BATCH_TILE*5}; batch_size += ${max(1, BATCH_TILE-1)}) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -TEST(${TEST_NAME}, input_zero_point) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= ${BATCH_TILE*5}; batch_size += ${max(1, BATCH_TILE-1)}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - $if DATATYPE == "QU8": - 
.output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } -} - -TEST(${TEST_NAME}, output_zero_point) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= ${BATCH_TILE*5}; batch_size += ${max(1, BATCH_TILE-1)}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(${", ".join(TEST_ARGS)}); - } - } -} -""" - - -def generate_test_cases(ukernel, init_fn, datatype, batch_tile, isa): - """Generates all tests cases for a Vector Hardswish micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - init_fn: C name of the function to initialize microkernel parameters. - datatype: data type. - batch_tile: Number of batch elements processed per one iteration of the - inner loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. - """ - _, test_name = ukernel.split("_", 1) - test_args = [ukernel] - if init_fn: - test_args.append(init_fn) - return xngen.preprocess(HSWISH_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": test_args, - "BATCH_TILE": batch_tile, - "DATATYPE": datatype.upper(), - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: {specification} -// Generator: {generator} - - -#include - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vhswish.h" -#include "vhswish-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - init_fn = ukernel_spec.get("init") - datatype, batch_tile, arch, isa = split_ukernel_name(name) - - test_case = generate_test_cases( - name, init_fn, datatype, batch_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-vlog-test.py b/tools/generate-vlog-test.py deleted file mode 100755 index 96522b43217..00000000000 --- a/tools/generate-vlog-test.py +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import codecs -import math -import os -import re -import sys -import yaml - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from primes import next_prime -import xngen -import xnncommon - - -parser = argparse.ArgumentParser(description='VLog microkernel test generator') -parser.add_argument("-s", "--spec", metavar="FILE", required=True, - help="Specification (YAML) file") -parser.add_argument("-o", "--output", metavar="FILE", required=True, - help='Output (C++ source) file') -parser.set_defaults(defines=list()) - - -def split_ukernel_name(name): - match = re.fullmatch(r"xnn_u32_vlog_ukernel__(.+)_x(\d+)", name) - assert match is not None - batch_tile = int(match.group(2)) - - arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(1)) - return batch_tile, arch, isa - - -VLOG_TEST_TEMPLATE = """\ -TEST(${TEST_NAME}, DISABLED_batch_eq_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - VLogMicrokernelTester() - .batch(${BATCH_TILE}) - .Test(${", ".join(TEST_ARGS)}); -} - -$if BATCH_TILE > 1: - TEST(${TEST_NAME}, DISABLED_batch_div_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE*2}; batch < ${BATCH_TILE*10}; batch += ${BATCH_TILE}) { - VLogMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, DISABLED_batch_lt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = 1; batch < ${BATCH_TILE}; batch++) { - VLogMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } - } - -TEST(${TEST_NAME}, DISABLED_batch_gt_${BATCH_TILE}) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE+1}; batch < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch++) { - VLogMicrokernelTester() - .batch(batch) - .Test(${", ".join(TEST_ARGS)}); - } -} - -TEST(${TEST_NAME}, DISABLED_input_lshift) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (uint32_t input_lshift = 0; input_lshift < 32; input_lshift++) { - VLogMicrokernelTester() - .batch(${BATCH_TILE}) - .input_lshift(input_lshift) - .Test(${", ".join(TEST_ARGS)}); - } -} - -TEST(${TEST_NAME}, DISABLED_output_scale) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (uint32_t output_scale = 0; output_scale < 65536; output_scale += ${next_prime(BATCH_TILE + 1)}) { - VLogMicrokernelTester() - .batch(${BATCH_TILE}) - .output_scale(output_scale) - .Test(${", ".join(TEST_ARGS)}); - } -} - -TEST(${TEST_NAME}, DISABLED_inplace) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t batch = ${BATCH_TILE+1}; batch < ${10 if BATCH_TILE == 1 else BATCH_TILE*2}; batch++) { - VLogMicrokernelTester() - .batch(batch) - .inplace(true) - .Test(${", ".join(TEST_ARGS)}); - } -} - -""" - - -def generate_test_cases(ukernel, batch_tile, isa): - """Generates all tests cases for a VLog micro-kernel. - - Args: - ukernel: C name of the micro-kernel function. - batch_tile: Number of batch processed per one iteration of the inner - loop of the micro-kernel. - isa: instruction set required to run the micro-kernel. Generated unit test - will skip execution if the host processor doesn't support this ISA. - - Returns: - Code for the test case. 
- """ - _, test_name = ukernel.split("_", 1) - _, datatype, ukernel_type, _ = ukernel.split("_", 3) - return xngen.preprocess(VLOG_TEST_TEMPLATE, { - "TEST_NAME": test_name.upper().replace("UKERNEL_", ""), - "TEST_ARGS": [ukernel], - "DATATYPE": datatype, - "BATCH_TILE": batch_tile, - "ISA_CHECK": xnncommon.generate_isa_check_macro(isa), - "next_prime": next_prime, - }) - - -def main(args): - options = parser.parse_args(args) - - with codecs.open(options.spec, "r", encoding="utf-8") as spec_file: - spec_yaml = yaml.safe_load(spec_file) - if not isinstance(spec_yaml, list): - raise ValueError("expected a list of micro-kernels in the spec") - - tests = """\ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: {specification} -// Generator: {generator} - - -#include -#include "xnnpack/common.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/vlog.h" -#include "vlog-microkernel-tester.h" -""".format(specification=options.spec, generator=sys.argv[0]) - - for ukernel_spec in spec_yaml: - name = ukernel_spec["name"] - batch_tile, arch, isa = split_ukernel_name(name) - - test_case = generate_test_cases(name, batch_tile, isa) - tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa) - - xnncommon.overwrite_if_changed(options.output, tests) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/tools/generate-vunary-test.py b/tools/generate-vunary-test.py index 6af8142f6a4..82593c9c74d 100755 --- a/tools/generate-vunary-test.py +++ b/tools/generate-vunary-test.py @@ -18,6 +18,7 @@ ) parser.add_argument("-t", "--tester", metavar="TESTER", required=True, choices=[ + "VHSwishMicrokernelTester", "VLReLUMicrokernelTester", "VUnaryMicrokernelTester"], help="Tester class to be used in the generated test") @@ -106,8 +107,7 @@ XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); -$if OP_TYPE != "SquareRootShift": - XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); +XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); $if OP_TYPE == "Clamp": XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); @@ -202,21 +202,65 @@ } } } -$if OP_TYPE == "SquareRootShift": - TEST(ukernel, shift) { - TEST_REQUIRES_ARCH_FLAGS(arch_flags); - const size_t batch_scale = get_batch_scale(); - const size_t batch_end = batch_tile * batch_scale; - const size_t batch_step = std::max(1, batch_tile - 1); - for (uint32_t shift = 0; shift < 32; shift++) { - for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { - ${TESTER}() - .batch_size(batch_size) - .shift(shift) - .Test(${", ".join(TEST_ARGS)}); +$if OP_TYPE == "HardSwish": + $if "f" not in DATATYPE: + TEST(ukernel, input_scale) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + for (float input_scale : {4.0f, 16.0f, 64.0f}) { + VHSwishMicrokernelTester() + .batch_size(batch_size) + .input_scale(input_scale) + $if "qu8" in DATATYPE: + .input_zero_point(150) + .output_zero_point(100) + .Test(${", ".join(TEST_ARGS)}); + } + } + } + + TEST(ukernel, output_scale) { + 
TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + for (float output_scale : {4.0f, 16.0f, 64.0f}) { + VHSwishMicrokernelTester() + .batch_size(batch_size) + .output_scale(output_scale) + $if "qu8" in DATATYPE: + .input_zero_point(150) + .output_zero_point(100) + .Test(${", ".join(TEST_ARGS)}); + } + } + } + + TEST(ukernel, input_zero_point) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VHSwishMicrokernelTester() + .batch_size(batch_size) + .input_zero_point(input_zero_point) + $if "qu8" in DATATYPE: + .output_zero_point(100) + .Test(${", ".join(TEST_ARGS)}); + } + } + } + + TEST(ukernel, output_zero_point) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + VHSwishMicrokernelTester() + .batch_size(batch_size) + $if "qu8" in DATATYPE: + .input_zero_point(150) + .output_zero_point(output_zero_point) + .Test(${", ".join(TEST_ARGS)}); + } } } - } $if DATATYPE == "f32" and OP_TYPE in SPECIAL_VALUES_F32: TEST(ukernel, special_values) { TEST_REQUIRES_ARCH_FLAGS(arch_flags); @@ -261,14 +305,12 @@ def main(args): tester = options.tester tester_header = { + "VHSwishMicrokernelTester": "vhswish-microkernel-tester.h", "VLReLUMicrokernelTester": "vlrelu-microkernel-tester.h", "VUnaryMicrokernelTester": "vunary-microkernel-tester.h", }[tester] - op_header = { - "VLReLUMicrokernelTester": "vlrelu.h", - "VUnaryMicrokernelTester": "vunary.h", - }[tester] + op_header = "vunary.h" tests = """\ // Copyright 2019 Google LLC // diff --git a/tools/xnncommon.py b/tools/xnncommon.py index afe72e8b4cf..26c06900303 100644 --- a/tools/xnncommon.py +++ b/tools/xnncommon.py @@ -57,6 +57,7 @@ def _remove_duplicate_newlines(text): "avx256vnnigfni": "XNN_ENABLE_AVX256VNNIGFNI", "avx512f": "XNN_ENABLE_AVX512F", "avx512skx": "XNN_ENABLE_AVX512SKX", + "avx512vbmi": "XNN_ENABLE_AVX512VBMI", "avx512vnni": "XNN_ENABLE_AVX512VNNI", "avx512vnnigfni": "XNN_ENABLE_AVX512VNNIGFNI", "avx512amx": "XNN_ENABLE_AVX512AMX",